In [2]:
%matplotlib inline
import folium
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator
import os
import pandas as pd
import plotly.plotly as py  # in plotly >= 4 this module moved to chart_studio.plotly
import pytz
import random
import re
import seaborn as sns
import string
import sys
import time
import vincent
from collections import Counter
from collections import defaultdict
from datetime import datetime
from matplotlib import dates
from matplotlib import rcParams
from matplotlib.ticker import MaxNLocator
# mpltools is unmaintained; matplotlib's own plt.style provides the same 'ggplot' style (used in the next cell)
from nltk import FreqDist
from nltk import bigrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from os import path
# pandas.tseries.resample.TimeGrouper was removed; use pd.Grouper(freq=...) instead
from pandas.tseries.offsets import DateOffset
from imageio import imread  # scipy.misc.imread was removed in SciPy >= 1.2
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
nltk.download('punkt')
nltk.download('mac_morpho')
nltk.download('stopwords')
Out[2]:
In [3]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
plt.style.use('ggplot')
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 7
# rcParams['font.serif'] = ['Computer Modern Roman']
rcParams['font.serif'] = ['Ubuntu']
rcParams['text.usetex'] = False
rcParams['figure.figsize'] = 20, 10
# pd.set_option('display.max_colwidth', 200)
# pd.options.display.mpl_style = 'default'
# matplotlib.style.use('ggplot')
# sns.set_context('talk')
# sns.set_style('whitegrid')
print('OK!')
In [4]:
tweets = pd.read_json("data/small-data-fixed.json")
print('OK!')
In [5]:
type(tweets)
Out[5]:
In [6]:
tweets.info()
In [7]:
# Twitter delivers coordinates in GeoJSON order (longitude, latitude);
# folium wants (latitude, longitude), so reverse each pair.
coordinate = []
for col in tweets['coordinates'][~tweets['coordinates'].isnull()]:
    coord = col['coordinates'][::-1]
    coordinate.append(coord)
print(coordinate[10])
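A minimal sanity check of the reversal on a hand-made GeoJSON point (the values here are illustrative, not from the dataset):

In [ ]:
# Hypothetical GeoJSON point roughly at São Paulo: GeoJSON order is (lon, lat).
sample = {'type': 'Point', 'coordinates': [-46.63, -23.55]}
lat_lon = sample['coordinates'][::-1]
print(lat_lon)  # [-23.55, -46.63], the (lat, lon) order folium expects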
In [8]:
# Keep the text of every geolocated tweet, aligned with the coordinate list above.
coord_text = []
for col in tweets['text'][~tweets['coordinates'].isnull()]:
    coord_text.append(col)  # str is already Unicode in Python 3; no .encode('utf-8') needed
print(coord_text[10])
In [9]:
tweets[['coordinates','text']][~tweets['coordinates'].isnull()].head(11)
Out[9]:
In [10]:
coords = tweets['coordinates']
coords = coords[~coords.isnull()]
coords = coords.apply(lambda d: d['coordinates'][::-1])
coords.head(20)
Out[10]:
In [11]:
# Center the map roughly on Brazil and add one marker per geolocated tweet,
# using the tweet text as the popup.
m = folium.Map([-14, -53.25], zoom_start=4)
for x, text in enumerate(coord_text):
    folium.Marker(coordinate[x], popup=text).add_to(m)
m
Out[11]:
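Rendering many markers inline can make the notebook sluggish; folium can also write the map to a standalone HTML file (the filename below is just an example):

In [ ]:
# Save the marker map for viewing outside the notebook.
m.save('tweet_map.html')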
In [14]:
tweets.text.head()
Out[14]:
In [15]:
tweets.user.head()
Out[15]:
In [16]:
df = pd.DataFrame()
df['text'] = tweets['text']
df['coordinates'] = tweets['coordinates']
df['user'] = tweets['user']
df.head()
# df['text'] = map(lambda df: df['text'].encode('utf-8'), tweets)
# df['user'] = map(lambda df: df['user']['screen_name'], tweets)
Out[16]:
In [ ]:
def datetimeify(df):
    # Parse created_at strings into real datetimes.
    df['created_at'] = pd.DatetimeIndex(df.created_at)
    return df
In [ ]:
def sentiment(df):
    # TextBlob scores each text with polarity in [-1, 1] and subjectivity in [0, 1].
    text = df.dropna(subset=['text']).text
    scores = text.apply(lambda t: TextBlob(t).sentiment)
    df['polarity'] = scores.apply(lambda s: s.polarity)
    df['subjectivity'] = scores.apply(lambda s: s.subjectivity)
    return df
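As a quick illustration of what TextBlob returns, here is a one-off call on a hand-written sentence (not from the dataset). Note that TextBlob's default analyzer is trained on English text, so scores on Portuguese tweets will be rough:

In [ ]:
# One-off TextBlob call mirroring what sentiment() does per row.
example = TextBlob("What a great day for politics!")
print(example.sentiment.polarity, example.sentiment.subjectivity)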
In [ ]:
def influence(df):
    # Geometric mean of follower count and retweet count (each +1 to avoid zeros):
    # reach grows with both audience size and how far the tweet travelled.
    internal = np.sqrt(df.user_followers_count + 1)
    external = np.sqrt(df.retweet_count + 1)
    df['influence'] = internal * external
    return df
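The square roots damp the huge dynamic range of follower counts: for a hypothetical tweet from an account with 9,999 followers retweeted 99 times, influence is sqrt(10000) * sqrt(100) = 1000, rather than a raw product of roughly a million:

In [ ]:
# Worked example of the influence formula with made-up counts.
followers, retweets = 9999, 99
print(np.sqrt(followers + 1) * np.sqrt(retweets + 1))  # 1000.0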
In [ ]:
def influenced_polarity(df):
    # Weight each tweet's polarity by its influence score.
    df['influenced_polarity'] = df.polarity * df['influence']
    return df
In [ ]:
def georeference(df):
    # The place field arrives as a string; pull out the first two decimal numbers.
    def place_to_coordinate(place_str, kind):
        if pd.isnull(place_str):
            return float('nan')
        number_matcher = r'(-?\d+\.\d+)[,\]]'
        coordinates = re.findall(number_matcher, place_str)
        coordinate = tuple(float(n) for n in coordinates[:2])
        if kind == 'longitude':
            return coordinate[0]
        elif kind == 'latitude':
            return coordinate[1]
    df['latitude'] = df.place.apply(place_to_coordinate, kind='latitude')
    df['longitude'] = df.place.apply(place_to_coordinate, kind='longitude')
    return df
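The regex simply grabs each decimal number that is followed by a comma or a closing bracket in the stringified place field. A quick check against a made-up bounding-box string (not a real record from the dataset):

In [ ]:
# Hypothetical stringified place field; findall returns numbers in order of appearance.
place_str = "[[-46.82, -24.01], [-46.36, -23.35]]"
print(re.findall(r'(-?\d+\.\d+)[,\]]', place_str))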
In [ ]:
def preprocess(df):
    # Note: the tokenizer cell below redefines preprocess() for strings;
    # rename one of the two if both are needed in the same session.
    return df.pipe(datetimeify)
In [ ]:
def preprocess_df(df):
    cleaned = df.pipe(set_hashtags)  # set_hashtags is assumed to be defined elsewhere in the notebook
    copy = cleaned.copy()
    return preprocess(copy)
In [ ]:
def load_df(input_filename):
    raw_df = pd.read_json(input_filename)
    return preprocess(raw_df)
print('OK')
In [19]:
tweets['created_at'] = pd.to_datetime(pd.Series(tweets['created_at']))
tweets.set_index('created_at', drop=False, inplace=True)
# Twitter timestamps are UTC; shift three hours back to Brasília time (UTC-3).
tweets.index = tweets.index.tz_localize('GMT')
tweets.index = tweets.index - DateOffset(hours=3)
tweets.index
tweets.head()
Out[19]:
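Subtracting a fixed DateOffset is fine within a single season, but it ignores daylight-saving changes; since pytz is already imported, a DST-aware alternative is to convert the localized index directly. A sketch on an arbitrary timestamp, not applied to the DataFrame above:

In [ ]:
# DST-aware alternative to the fixed -3h shift.
idx = pd.DatetimeIndex(['2016-04-17 18:00']).tz_localize('GMT')
print(idx.tz_convert('America/Sao_Paulo'))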
In [20]:
# resample(..., how='count') was removed in newer pandas; chain .count() instead.
tweets1h = tweets['created_at'].resample('1h').count()
tweets1h.head()
Out[20]:
In [21]:
avg = tweets1h.mean()
vincent.core.initialize_notebook()
area = vincent.Area(tweets1h)
area.colors(brew='Spectral')
area.display()
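vincent has been unmaintained for years and its notebook integration often breaks in current Jupyter; if the area chart fails to render, an equivalent plot with the matplotlib stack already imported above is a safe fallback:

In [ ]:
# Fallback: the same hourly tweet-count series as a matplotlib/pandas area chart.
fig, ax = plt.subplots()
tweets1h.plot(kind='area', alpha=0.5, ax=ax)
ax.axhline(avg, linestyle='--', color='k', label='mean')
ax.set_xlabel('time')
ax.set_ylabel('tweets per hour')
ax.legend()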
In [22]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=True):
    # Lowercase every token except emoticons, whose case matters (:D vs :d).
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

tweet = "RT @medeirosthiiago: testando exemplo TCC! :D http://example.com #ImpeachmentDay"
print(preprocess(tweet))
# ['rt', '@medeirosthiiago', ':', 'testando', 'exemplo', 'tcc', '!', ':D', 'http://example.com', '#impeachmentday']
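With the tokenizer in place, the Counter imported at the top makes a term-frequency pass over the corpus a few lines. A sketch over the DataFrame loaded above; filtering out stop words is left for a later step:

In [ ]:
# Rough token frequencies over all tweet texts, using the preprocess() defined above.
count_all = Counter()
for text in tweets['text'].dropna():
    count_all.update(preprocess(text))
print(count_all.most_common(10))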