In [2]:
%matplotlib inline


import folium
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator 
import os
import pandas as pd
import plotly.plotly as py
import pytz
import random
import re
import seaborn as sns
import string
import sys
import time
import vincent

from collections import Counter
from collections import defaultdict
from datetime import datetime
from matplotlib import dates
from matplotlib import rcParams
from matplotlib.ticker import MaxNLocator
from mpltools import style
from nltk import FreqDist
from nltk import bigrams 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from os import path
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset
from scipy.misc import imread
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

nltk.download('punkt')
nltk.download('mac_morpho')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/thiago/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package mac_morpho to
[nltk_data]     /Users/thiago/nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[2]:
True

In [3]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
style.use('ggplot')
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 7
# rcParams['font.serif'] = ['Computer Modern Roman']
rcParams['font.serif'] = ['Ubuntu']
rcParams['text.usetex'] = False
rcParams['figure.figsize'] = 20, 10

# pd.set_option('display.max_colwidth', 200)
# pd.options.display.mpl_style = 'default'
# matplotlib.style.use('ggplot')
# sns.set_context('talk')
# sns.set_style('whitegrid')

print 'OK!'


OK!

Testing folium and data frames


In [4]:
tweets = pd.read_json("data/small-data-fixed.json")
print 'OK!'


OK!

In [5]:
type(tweets)


Out[5]:
pandas.core.frame.DataFrame

In [6]:
tweets.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 358293 entries, 0 to 358292
Data columns (total 31 columns):
contributors                 0 non-null float64
coordinates                  470 non-null object
created_at                   358293 non-null datetime64[ns]
entities                     358293 non-null object
extended_entities            164504 non-null object
favorite_count               358293 non-null int64
favorited                    358293 non-null bool
filter_level                 358293 non-null object
geo                          470 non-null object
id                           358293 non-null int64
id_str                       358293 non-null int64
in_reply_to_screen_name      4820 non-null object
in_reply_to_status_id        3303 non-null float64
in_reply_to_status_id_str    3303 non-null float64
in_reply_to_user_id          4820 non-null float64
in_reply_to_user_id_str      4820 non-null float64
is_quote_status              358293 non-null bool
lang                         358293 non-null object
place                        7803 non-null object
possibly_sensitive           197101 non-null float64
quoted_status                18300 non-null object
quoted_status_id             18300 non-null float64
quoted_status_id_str         18300 non-null float64
retweet_count                358293 non-null int64
retweeted                    358293 non-null bool
retweeted_status             258109 non-null object
source                       358293 non-null object
text                         358293 non-null object
timestamp_ms                 358293 non-null datetime64[ns]
truncated                    358293 non-null bool
user                         358293 non-null object
dtypes: bool(4), datetime64[ns](2), float64(8), int64(4), object(13)
memory usage: 77.9+ MB

In [7]:
coordinate = []
for point in tweets['coordinates'][~tweets['coordinates'].isnull()]:
    # GeoJSON points are [longitude, latitude]; reverse to [latitude, longitude] for folium
    coordinate.append(point['coordinates'][::-1])

print coordinate[10]


[-20.32700724, -40.33692957]

In [8]:
coord_text = []
for text in tweets['text'][~tweets['coordinates'].isnull()]:
    coord_text.append(text.encode('utf-8'))

print coord_text[10]


Que seja feita a vontade de Deus! 🙏🏻😊😘

#boatarde #sejafeitaavontadededeus #foradilma #forapt… https://t.co/NrWCuYNQ8B

In [9]:
tweets[['coordinates','text']][~tweets['coordinates'].isnull()].head(11)


Out[9]:
coordinates text
96 {u'type': u'Point', u'coordinates': [-43.98846... @ApyusCom Isso vai ser muito, muito legal! Tô ...
257 {u'type': u'Point', u'coordinates': [-44.07219... Que o principado da corrupção seja extinto do ...
598 {u'type': u'Point', u'coordinates': [-49.94293... Esse papo de "impiechment é golpe" é tão verda...
1428 {u'type': u'Point', u'coordinates': [-47.9287,... 1. #ImpeachmentDay\n2. Leicester\n3. #DomingoD...
1572 {u'type': u'Point', u'coordinates': [-47.88279... é hoje Pátria Amada ✌ #pazeamor #impeachment #...
1615 {u'type': u'Point', u'coordinates': [-9.1413, ... 6. Benfica B\n7. Porto B\n8. #EquipaB\n9. #Imp...
1701 {u'type': u'Point', u'coordinates': [-43.98848... Saindo...\r É agora ou nunca! Pena q n tem @lo...
1723 {u'type': u'Point', u'coordinates': [-47.89916... Brasília ta fervendo hoje contra o golpe! #juv...
3149 {u'type': u'Point', u'coordinates': [-43.17408... #naovaitergolpe @ Praia De Copacabana https://...
3346 {u'type': u'Point', u'coordinates': [-43.17109... #naovaitergolpe @ Praia do Leme https://t.co/g...
3594 {u'type': u'Point', u'coordinates': [-40.33692... Que seja feita a vontade de Deus! 🙏🏻😊😘\n\n...

In [10]:
coords = tweets['coordinates']
coords = coords[~coords.isnull()]
coords = coords.apply(lambda d: d['coordinates'][::-1])
coords.head(20)


Out[10]:
96      [-19.83755667, -43.98846667]
257       [-22.4835006, -44.0721959]
598       [-22.2458872, -49.9429378]
1428            [-15.7785, -47.9287]
1572    [-15.79442326, -47.88279271]
1615              [38.9901, -9.1413]
1701       [-19.83754, -43.98848333]
1723        [-15.7835, -47.89916389]
3149    [-22.96394383, -43.17408138]
3346          [-22.96423, -43.17109]
3594    [-20.32700724, -40.33692957]
4338      [-21.6855632, -43.3545121]
4510      [-29.6890964, -51.1366438]
4537    [-22.88908811, -43.43264018]
4886        [-29.68912, -51.1366868]
6737              [19.0728, 72.8826]
7284    [-23.65133278, -46.75984952]
7351            [-19.9319, -44.0539]
7611      [-20.9308903, -54.9694217]
7851      [-25.6440025, -49.3246874]
Name: coordinates, dtype: object

In [11]:
m = folium.Map([-14,-53.25], zoom_start=4)

for coord in coordinate:
    folium.Marker(coord, popup=str(coord)).add_to(m)

m


Out[11]:
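
The inline Leaflet map does not survive a static export of the notebook, which is why Out[11] is blank. A minimal sketch (not executed here) that builds the same map with clustered markers and writes it to a standalone HTML file; the m2 and impeachment_map.html names are illustrative only:

In [ ]:
# Group the geotagged markers into clusters and save the map to disk
from folium.plugins import MarkerCluster

m2 = folium.Map([-14, -53.25], zoom_start=4)
cluster = MarkerCluster().add_to(m2)
for coord in coordinate:
    folium.Marker(coord, popup=str(coord)).add_to(cluster)

m2.save('impeachment_map.html')  # standalone HTML file, openable in any browser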

In [14]:
tweets.text.head()


Out[14]:
0    RT @GringaBrazilien: Bom Dia Brasil, #Impeachm...
1    RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\...
2    RT @GarotaCiume: Não sou petista, só não sou c...
3                         #ThauQuerida #ImpeachmentDay
4    RT @luadacamz: Eu quero Impeachment desse calo...
Name: text, dtype: object

In [15]:
tweets.user.head()


Out[15]:
0    {u'follow_request_sent': None, u'profile_use_b...
1    {u'follow_request_sent': None, u'profile_use_b...
2    {u'follow_request_sent': None, u'profile_use_b...
3    {u'follow_request_sent': None, u'profile_use_b...
4    {u'follow_request_sent': None, u'profile_use_b...
Name: user, dtype: object

In [16]:
df = pd.DataFrame()
df['text'] = tweets['text']
df['coordinates'] = tweets['coordinates']
df['user'] = tweets['user']
df.head()


# df['text'] = map(lambda df: df['text'].encode('utf-8'), tweets)
# df['user'] = map(lambda df: df['user']['screen_name'], tweets)


Out[16]:
text coordinates user
0 RT @GringaBrazilien: Bom Dia Brasil, #Impeachm... None {u'follow_request_sent': None, u'profile_use_b...
1 RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\... None {u'follow_request_sent': None, u'profile_use_b...
2 RT @GarotaCiume: Não sou petista, só não sou c... None {u'follow_request_sent': None, u'profile_use_b...
3 #ThauQuerida #ImpeachmentDay None {u'follow_request_sent': None, u'profile_use_b...
4 RT @luadacamz: Eu quero Impeachment desse calo... None {u'follow_request_sent': None, u'profile_use_b...
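
The commented-out lines above hint at pulling individual fields out of the nested user dicts. A minimal sketch of that idea with .apply; screen_name and followers_count are standard fields of the Twitter user object, and user_followers_count is the column name the influence() helper below expects:

In [ ]:
# Flatten two fields out of the nested user dict; .get avoids KeyErrors on odd rows
df['screen_name'] = tweets['user'].apply(lambda u: u.get('screen_name'))
df['user_followers_count'] = tweets['user'].apply(lambda u: u.get('followers_count'))
df[['screen_name', 'user_followers_count']].head()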

Test functions


In [ ]:
def datetimeify(df):
    df['created_at'] = pd.DatetimeIndex(df.created_at)
    return df

In [ ]:
def sentiment(df):
    text = df.dropna(subset=['text']).text
    sentiment = text.apply(lambda text: TextBlob(text).sentiment)
    df['polarity'] = sentiment.apply(lambda sentiment: sentiment.polarity)
    df['subjectivity'] = sentiment.apply(lambda sentiment: sentiment.subjectivity)
    return df

In [ ]:
def influence(df):
    internal = np.sqrt(df.user_followers_count.apply(lambda x: x + 1))
    external = np.sqrt(df.retweet_count.apply(lambda x: x + 1))
    df['influence'] = internal * external
    return df

In [ ]:
def influenced_polarity(df):
    df['influenced_polarity'] = df.polarity * df['influence']
    return df

In [ ]:
def georeference(df):
    def place_to_coordinate(place_str, kind):
        if pd.isnull(place_str):
            return float('nan')
        # grab the first two decimal numbers (longitude, latitude) from the
        # serialized place / bounding-box string
        number_matcher = r'(-?\d+\.\d+)[,\]]'
        coordinates = re.findall(number_matcher, place_str)
        coordinate = tuple(float(n) for n in coordinates[:2])

        if kind == 'longitude':
            return coordinate[0]
        elif kind == 'latitude':
            return coordinate[1]

    df['latitude'] = df.place.apply(place_to_coordinate, kind='latitude')
    df['longitude'] = df.place.apply(place_to_coordinate, kind='longitude')

    return df

In [ ]:
def preprocess(df):
    return (df.pipe(datetimeify))

In [ ]:
def preprocess_df(df):
    # set_hashtags (defined elsewhere) adds the per-hashtag indicator columns
    # (NaoVaiTerGolpe, TchauQuerida, ...) seen further down
    cleaned = df.pipe(set_hashtags)
    copy = cleaned.copy()
    return preprocess(copy)

In [ ]:
def load_df(input_filename):
    raw_df = pd.read_json(input_filename)
    return preprocess(raw_df)

print 'OK'
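
None of the helper cells above were executed here. A hedged sketch of how they are meant to chain together with DataFrame.pipe, on a small sample to keep it cheap; followers_count is flattened first because influence() expects a user_followers_count column, and georeference() is left out because it assumes place is serialized as a string. Note also that TextBlob's default sentiment analyzer is trained on English, so polarity scores on Portuguese tweets are only a rough signal:

In [ ]:
# Illustrative pipeline only
sample = tweets.head(1000).copy()
sample['user_followers_count'] = sample['user'].apply(lambda u: u.get('followers_count'))

processed = (sample
             .pipe(datetimeify)
             .pipe(sentiment)
             .pipe(influence)
             .pipe(influenced_polarity))
processed[['polarity', 'influence', 'influenced_polarity']].head()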

Tweet times


In [19]:
tweets['created_at'] = pd.to_datetime(tweets['created_at'])

tweets.set_index('created_at', drop=False, inplace=True)

# Shift the index to Brasília local time (UTC-3); the tz label stays GMT,
# only the clock values are moved back by three hours.
tweets.index = tweets.index.tz_localize('GMT')
tweets.index = tweets.index - DateOffset(hours=3)
tweets.index

tweets.head()


Out[19]:
contributors coordinates created_at entities extended_entities favorite_count favorited filter_level geo id ... source text timestamp_ms truncated user NaoVaiTerGolpe TchauQuerida ForaDilma BrasilContraOGolpe ForaCunha
created_at
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 2294780467, u'indi... {u'media': [{u'source_user_id': 2294780467, u'... 0 False low None 721718699565178880 ... <a href="http://twitter.com" rel="nofollow">Tw... RT @GringaBrazilien: Bom Dia Brasil, #Impeachm... 2016-04-17 15:15:21.518 False {u'follow_request_sent': None, u'profile_use_b... False False False False False
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 87843887, u'indice... NaN 0 False low None 721718699288342528 ... <a href="http://twitter.com/download/android" ... RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\... 2016-04-17 15:15:21.452 False {u'follow_request_sent': None, u'profile_use_b... False False False False False
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 206181302, u'indic... NaN 0 False low None 721718699418202112 ... <a href="http://twitter.com/download/android" ... RT @GarotaCiume: Não sou petista, só não sou c... 2016-04-17 15:15:21.483 False {u'follow_request_sent': None, u'profile_use_b... False False False False False
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [], u'symbols': [], u'hasht... NaN 0 False low None 721718700496273408 ... <a href="http://twitter.com/#!/download/ipad" ... #ThauQuerida #ImpeachmentDay 2016-04-17 15:15:21.740 False {u'follow_request_sent': None, u'profile_use_b... False False False False False
2016-04-17 12:15:22+00:00 NaN None 2016-04-17 15:15:22 {u'user_mentions': [{u'id': 409967714, u'indic... NaN 0 False low None 721718702153023488 ... <a href="http://twitter.com/download/android" ... RT @luadacamz: Eu quero Impeachment desse calo... 2016-04-17 15:15:22.135 False {u'follow_request_sent': None, u'profile_use_b... False False False False False

5 rows × 36 columns
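
The DateOffset subtraction above hard-codes a three-hour shift to get Brasília time while keeping the GMT label on the index. Since pytz is already imported, a DST-aware alternative (a sketch, not what was run here) is to convert the timezone explicitly:

In [ ]:
# Alternative: convert a tz-aware series instead of shifting the clock by hand
local_times = tweets['created_at'].dt.tz_localize('UTC').dt.tz_convert('America/Sao_Paulo')
local_times.head()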


In [20]:
tweets_per_hour = tweets['created_at'].resample('1h', how='count')
tweets_per_hour.head()


Out[20]:
created_at
2016-04-17 12:00:00+00:00    11536
2016-04-17 13:00:00+00:00     9286
2016-04-17 14:00:00+00:00    97672
2016-04-17 15:00:00+00:00    78594
2016-04-17 16:00:00+00:00    43528
Freq: H, Name: created_at, dtype: int64
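
The how= keyword used above belongs to the older pandas resample API. On newer pandas versions, where it has been removed, the equivalent is method chaining (a sketch):

In [ ]:
# Same hourly counts with the newer resample API
hourly = tweets['created_at'].resample('1h').count()
hourly.head()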

In [21]:
avg = tweets_per_hour.mean()

vincent.core.initialize_notebook()
area = vincent.Area(tweets_per_hour)
area.colors(brew='Spectral')
area.display()
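
The Vega chart produced by vincent only renders interactively, so it leaves no trace in a static export. A matplotlib fallback for the same hourly counts (a sketch using the libraries already imported):

In [ ]:
# Static fallback: area plot of tweets per hour with pandas/matplotlib
ax = tweets_per_hour.plot(kind='area', alpha=0.5)
ax.set_xlabel('hour')
ax.set_ylabel('tweets per hour')
plt.show()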



Sentiment analysis tests


In [22]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=True):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
tweet = "RT @medeirosthiiago: testando exemplo TCC! :D http://example.com #ImpeachmentDay"
print(preprocess(tweet))
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


['rt', '@medeirosthiiago', ':', 'testando', 'exemplo', 'tcc', '!', ':D', 'http://example.com', '#impeachmentday']
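
With the tokenizer in place, a natural next step (a sketch, not run here) is to count the most frequent hashtags in the corpus with the Counter imported at the top; the 10,000-tweet sample size is arbitrary:

In [ ]:
# Count hashtags over a sample of tweets using the tokenizer defined above
hashtag_counts = Counter()
for text in tweets['text'].head(10000):
    hashtag_counts.update(tok for tok in preprocess(text) if tok.startswith('#'))

print(hashtag_counts.most_common(10))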