In [ ]:
# Notebook export (Python 2): analysis of #ImpeachmentDay tweets.
# Mixes stdlib, the scientific stack, and tweet/visualisation helpers.
%matplotlib inline
import folium
import json
import geojsonio
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd
import re
import string
import sys
import time
import vincent
from collections import Counter
from collections import defaultdict
from datetime import datetime
from matplotlib import dates
from matplotlib import rcParams
from matplotlib.ticker import MaxNLocator
from os import path
# NOTE(review): pandas.tseries.resample was removed in later pandas versions
# (TimeGrouper became pandas.Grouper) — confirm the pinned pandas version.
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset
# NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed in
# 1.2 (imageio.imread / plt.imread are the replacements) — confirm SciPy pin.
from scipy.misc import imread
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
print 'OK!'
In [ ]:
tweets_data_path = 'data/small-data.json'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
tweets = pd.DataFrame()
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data)
tweets['country'] = map(lambda tweet: tweet['place']['country']
if tweet['place'] != None else None, tweets_data)
print 'OK!'
In [ ]:
# Total number of tweets captured from the stream.
print 'Número de Tweets total: %s' % len(tweets_data)
In [ ]:
# Example tweet: inspect the raw JSON structure of the first record.
print tweets_data[0]
In [ ]:
# Bar chart: the four most frequent tweet languages.
tweets_by_lang = tweets['lang'].value_counts()
fig, ax = plt.subplots(figsize=(20, 10))
for axis_name, size in (('x', 20), ('y', 15)):
    ax.tick_params(axis=axis_name, labelsize=size)
ax.set_xlabel('Idiomas'.decode('utf-8'), fontsize=20)
ax.set_ylabel('Número de tweets'.decode('utf-8'), fontsize=20)
ax.set_title('Top 4 Idiomas'.decode('utf-8'), fontsize=20, fontweight='bold')
top_langs = tweets_by_lang[:4]
top_langs.plot(ax=ax, kind='bar', color='mediumspringgreen')
plt.grid()
In [ ]:
# Bar chart: the five countries with the most geotagged tweets.
tweets_by_country = tweets['country'].value_counts()
fig, ax = plt.subplots(figsize=(20, 10))
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=15)
ax.set_xlabel('Países'.decode('utf-8'), fontsize=20)
ax.set_ylabel('Número de tweets'.decode('utf-8'), fontsize=20)
ax.set_title('Top 5 Países'.decode('utf-8'), fontsize=20, fontweight='bold')
top_countries = tweets_by_country[:5]
top_countries.plot(ax=ax, kind='bar', color='lightskyblue')
plt.grid()
In [ ]:
def word_in_text(word, text):
    """Return True if `word` occurs anywhere in `text`, case-insensitively.

    The original implementation fed the raw word to re.search, so words
    containing regex metacharacters (e.g. '+', '?') would be misread as
    patterns or raise re.error. A plain substring test matches the literal
    search that every caller in this notebook intends.
    """
    return word.lower() in text.lower()
In [ ]:
# Flag each tweet with the campaign hashtags it mentions (same column
# creation order as before; tag=tag pins the loop variable per lambda).
for tag in ('NaoVaiTerGolpe', 'TchauQuerida', 'ForaDilma',
            'BrasilContraOGolpe', 'ForaCunha'):
    tweets[tag] = tweets['text'].apply(lambda tweet, tag=tag: word_in_text(tag, tweet))

hashtags = ['ForaDilma', 'NaoVaiTerGolpe', 'TchauQuerida', 'BrasilContraOGolpe', 'ForaCunha']
# .sum() on a boolean column counts the True values and, unlike
# value_counts()[True], does not raise KeyError when a tag never matched.
tweets_by_hashtags = [int(tweets[tag].sum()) for tag in hashtags]

# Pie chart of hashtag share.
plt.subplots(figsize=(10, 10))
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'peachpuff']
explode = (0.03, 0.03, 0.03, 0.03, 0.03)
plt.pie(tweets_by_hashtags, explode=explode, labels=hashtags, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.rcParams['font.size'] = 15
plt.legend(tweets_by_hashtags, loc=(.95,.6), title='Número de Tweets:'.decode('utf-8'), fontsize=15)
plt.axis('equal')
plt.show()
In [ ]:
# Tweets containing an explicit vote word (" sim " / " nao ").
tweets['nao'] = tweets['text'].apply(lambda tweet: word_in_text(' nao ', tweet))
tweets['sim'] = tweets['text'].apply(lambda tweet: word_in_text(' sim ', tweet))
# Element-wise OR of the two boolean columns — equivalent to the original
# per-tweet `word_in_text(' sim ') or word_in_text(' nao ')`.
tweets['ImpeachmentDay'] = tweets['sim'] | tweets['nao']

hashtags = ['ForaDilma', 'NaoVaiTerGolpe']
relevant = tweets[tweets['ImpeachmentDay']]
# .sum() counts True values without raising KeyError when there are no hits.
tweets_by_hashtags = [int(relevant[tag].sum()) for tag in hashtags]

# Grouped bar chart comparing the two camps among vote-word tweets.
ind = np.arange(2)
width = 0.5
width2 = 0.3
x_pos = list(range(len(hashtags)))
fig, ax = plt.subplots(figsize=(12, 10))
ax.bar(ind + width2, tweets_by_hashtags, width, color='yellowgreen')
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=15)
ax.set_ylabel('Número de tweets'.decode('utf-8'), fontsize=20)
ax.set_title('Ranking: ForaDilma vs. NaoVaiTerGolpe (Votação SIM x NÃO)'.decode('utf-8'),
             fontsize=15, fontweight='bold')
ax.set_xticks([p + 1.1 * width for p in x_pos])
ax.set_xticklabels(hashtags)
plt.grid()
In [ ]:
# Raw counts behind the previous bar chart.
print tweets_by_hashtags
In [ ]:
# Compiled once at module level — the pattern is applied to every tweet.
_LINK_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')


def extract_link(text):
    """Return the first http(s):// or www. URL found in `text`, or ''."""
    match = _LINK_RE.search(text)
    return match.group() if match else ''
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))
tweets_relevant = tweets[tweets['ImpeachmentDay'] == True]
tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']
print tweets_relevant_with_link[tweets_relevant_with_link['TchauQuerida'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['ForaDilma'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['ForaCunha'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['NaoVaiTerGolpe'] == True]['link']
In [ ]:
# Flag mentions of each politician (same column creation order as before).
for name in ('moro', 'cunha', 'bolsonaro', 'lula', 'temer', 'feliciano'):
    tweets[name] = tweets['text'].apply(lambda tweet, name=name: word_in_text(name, tweet))

hashtags = ['Sérgio Moro'.decode('utf-8'), 'Eduardo Cunha', 'Jair Bolsonaro', 'Lula', 'Marcos Feliciano', 'Michel Temer']
# Counts in the same order as the labels above. .sum() on the boolean
# column avoids the KeyError that value_counts()[True] raises when a
# politician is never mentioned.
tweets_by_hashtags = [int(tweets[name].sum())
                      for name in ('moro', 'cunha', 'bolsonaro', 'lula', 'feliciano', 'temer')]

# Pie chart of mention share.
plt.subplots(figsize=(10, 10))
colors = ['gold', 'yellowgreen', 'lightskyblue', 'lightcoral', 'peachpuff', 'mediumturquoise']
explode = (0.03, 0.03, 0.03, 0.05, 0.03, 0.03)
plt.pie(tweets_by_hashtags, explode=explode, labels=hashtags, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.rcParams['font.size'] = 15
plt.legend(tweets_by_hashtags, loc=(-.22,.6), title='Número de Tweets:'.decode('utf-8'), fontsize=15)
plt.axis('equal')
plt.show()
In [14]:
# Enrich the DataFrame with more tweet fields.
# created_at: Twitter's "Sat Apr 16 20:00:00 +0000 2016" -> "2016-04-16 20:00:00".
# List comprehensions replace Py2 map() — same result, clearer and Py3-safe.
tweets['created_at'] = [time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                        for tweet in tweets_data]
tweets['user'] = [tweet['user']['screen_name'] for tweet in tweets_data]
tweets['user_followers_count'] = [tweet['user']['followers_count'] for tweet in tweets_data]
tweets['retweet_count'] = [tweet['retweet_count'] for tweet in tweets_data]
tweets['favorite_count'] = [tweet['favorite_count'] for tweet in tweets_data]
tweets['text'] = [tweet['text'].encode('utf-8') for tweet in tweets_data]
tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets['Location'] = [tweet['place']['country'] if tweet['place'] is not None else None
                      for tweet in tweets_data]
tweets.head()
Out[14]:
In [17]:
list_of_original_tweets = [element for element in tweets['text'].values if not element.startswith('RT')]
print "Número de Tweets originais : " + str(len(list_of_original_tweets))
list_of_retweets = [element for element in tweets['text'].values if element.startswith('RT')]
print "Número de Retweets : " + str(len(list_of_retweets))
In [18]:
def plot_tweets_per_category(category, title, x_title, y_title, top_n=5, output_filename="plot.png"):
    """Bar-plot the top_n most frequent values of a categorical Series.

    category        -- pandas Series to count (e.g. tweets['user'])
    title           -- chart title
    x_title/y_title -- axis labels
    top_n           -- how many of the most frequent values to show
    output_filename -- the figure is also saved to this PNG path
    """
    tweets_by_cat = category.value_counts()
    fig, ax = plt.subplots(figsize=(20, 10))
    # The original called ax.tick_params(axis=...) with no settings — a
    # no-op, removed here.
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_title(title)
    tweets_by_cat[:top_n].plot(ax=ax, kind='bar', color='mediumturquoise')
    fig.savefig(output_filename)
    fig.show()
In [19]:
# Top 20 most active users during #ImpeachmentDay.
plot_tweets_per_category(tweets['user'],
"#ImpeachmentDay usuarios ativos",
"Usuários".decode('utf-8'),
"Número de Tweets".decode('utf-8'), 20)
In [21]:
# Word cloud of the whole corpus, minus URLs, @mentions and the 'RT' marker.
text = " ".join(tweets['text'].values.astype(str))
kept_words = []
for word in text.split():
    if 'http' in word or word.startswith('@') or word == 'RT':
        continue
    kept_words.append(word)
no_urls_no_tags = " ".join(kept_words)

# Portuguese stopwords plus punctuation are excluded from the cloud.
punctuation = list(string.punctuation)
stop = nltk.corpus.stopwords.words('portuguese') + punctuation
wordcloud = WordCloud(background_color="white", max_words=500,
                      stopwords=stop, width=1800, height=1400).generate(no_urls_no_tags)
plt.figure(figsize=(20, 20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [25]:
# Word cloud shaped and coloured by the "dilma.png" mask image.
text = " ".join(tweets['text'].values.astype(str))
# Drop URLs, @mentions and the retweet marker before building the cloud.
no_urls_no_tags = " ".join([word for word in text.split()
if 'http' not in word
and not word.startswith('@')
and word != 'RT'
])
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2; a replacement
# (imageio.imread) should return the same uint8 RGB array — confirm before
# upgrading, since ImageColorGenerator consumes this array directly.
# path.join with a single argument is a no-op kept as-is.
tweet_coloring = imread(path.join("dilma.png"))
punctuation = list(string.punctuation)
stop = nltk.corpus.stopwords.words('portuguese') + punctuation
wordcloud = WordCloud(background_color="white", max_words=500, mask=tweet_coloring,
stopwords=stop, width=1800, height=1400).generate(no_urls_no_tags)
# Figure 1: the cloud with default colours.
plt.figure(figsize=(10,10))
image_colors = ImageColorGenerator(tweet_coloring)
plt.imshow(wordcloud)
plt.axis("off")
# Figure 2: the mask image itself, in grayscale.
plt.figure(figsize=(10,10))
plt.imshow(tweet_coloring, cmap=plt.cm.gray)
plt.axis("off")
# Figure 3: the cloud recoloured from the mask image's pixels.
plt.figure(figsize=(10,10))
plt.imshow(wordcloud.recolor(color_func=image_colors))
plt.axis("off")
plt.show()
In [3]:
# Reload the capture with pandas' JSON reader.
# NOTE(review): presumably the "-fixed" file was reshaped into a form
# read_json accepts (the raw stream is one object per line) — confirm.
tweets2 = pd.read_json("data/small-data-fixed.json")
print 'OK!'
type(tweets2)
Out[3]:
In [20]:
# Column dtypes and non-null counts of the reloaded frame.
tweets2.info()
In [15]:
coordinate = []
for col in tweets2['coordinates'][~tweets2['coordinates'].isnull()]:
coord = col['coordinates'][::-1]
# coord = col['coordinates']
coordinate.append(coord)
print coordinate[10]
In [16]:
coord_text = []
for col in tweets2['text'][~tweets2['coordinates'].isnull()]:
coord = col.encode('utf-8')
coord_text.append(coord)
print coord_text[10]
In [22]:
# Preview the geotagged tweets alongside their raw coordinate dicts.
tweets2[['coordinates','text']][~tweets2['coordinates'].isnull()].head(11)
Out[22]:
In [23]:
# Same [lat, lon] series as `coordinate`, built as a single chain:
# dropna() is equivalent to the boolean ~isnull() mask on a Series.
coords = tweets2['coordinates'].dropna().apply(lambda d: d['coordinates'][::-1])
coords.head(20)
Out[23]:
In [24]:
# Plot every geotagged tweet on a map centred on Brazil.
# The original enumerated coord_text but never used the text and indexed
# coordinate by position; both lists come from the same null-mask so they
# have equal length — iterating coordinate directly is equivalent.
m = folium.Map([-14, -53.25], zoom_start=4)
for point in coordinate:
    # NOTE(review): the popup shows the coordinates themselves; coord_text
    # looks like the intended popup content — confirm before changing.
    folium.Marker(point, popup=str(point)).add_to(m)
m
Out[24]:
In [26]:
# Quick look at the first few tweet texts.
tweets2.text.head()
Out[26]:
In [27]:
# Index tweets2 by creation time, shifted 3 hours behind GMT (Brasília).
# The column is already a Series, so the redundant pd.Series() wrap from
# the original was dropped.
tweets2['created_at'] = pd.to_datetime(tweets2['created_at'])
tweets2.set_index('created_at', drop=False, inplace=True)
tweets2.index = tweets2.index.tz_localize('GMT')
# NOTE(review): subtracting the offset shifts the clock but keeps the GMT
# label; tz_convert('America/Sao_Paulo') would be the canonical form —
# kept as-is to preserve behaviour.
tweets2.index = tweets2.index - DateOffset(hours=3)
tweets2.index
tweets2.head()
Out[27]:
In [28]:
# Tweets per hour (the name tweets30s is historical — the bin is 1 hour).
# resample(..., how='count') was deprecated in pandas 0.18 and later
# removed; the method-chain form works from 0.18 onward.
tweets30s = tweets2['created_at'].resample('1h').count()
tweets30s.head()
Out[28]:
In [29]:
# Mean tweets per hour, then an interactive Vega area chart of the series.
# NOTE(review): the vincent library is unmaintained; works only in a
# notebook front-end that supports its Vega output — confirm environment.
avg = tweets30s.mean()
vincent.core.initialize_notebook()
area = vincent.Area(tweets30s)
area.colors(brew='Spectral')
area.display()