Mineração de Tweets

Dados coletados durante o domingo (17/04/2015) de votação do Congresso para a continuação do processo de Impeachment da senhora Presidente Dilma Rousseff.


In [1]:
%matplotlib inline

import folium
import json
import geojsonio
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd
import re
import string
import sys
import time
import vincent

from collections import Counter
from collections import defaultdict
from datetime import datetime
from matplotlib import dates
from matplotlib import rcParams
from matplotlib.ticker import MaxNLocator
from os import path
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset
from scipy.misc import imread
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

print 'OK!'


OK!

In [2]:
tweets_data_path = 'data/small-data.json'

tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue
        
tweets = pd.DataFrame()

tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data)
tweets['country'] = map(lambda tweet: tweet['place']['country']
                        if tweet['place'] != None else None, tweets_data)
        
print 'OK!'


OK!

In [3]:
print 'Número de Tweets total: %s' % len(tweets_data)


Número de Tweets total: 358293

In [4]:
# exemplo de tweet
print tweets_data[0]


{u'contributors': None, u'truncated': False, u'text': u'RT @GringaBrazilien: Bom Dia Brasil, #ImpeachmentDay\nO n\xfamero da Beast \U0001f608 \xe9 3\u20e34\u20e32\u20e3\nO da Democracia \xe9 alegria infinita do povo Brasileiro \U0001f497 h\u2026', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 721718699565178880, u'favorite_count': 0, u'source': u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', u'retweeted': False, u'coordinates': None, u'timestamp_ms': u'1460906121518', u'entities': {u'user_mentions': [{u'id': 2294780467, u'indices': [3, 19], u'id_str': u'2294780467', u'screen_name': u'GringaBrazilien', u'name': u'Gringa Brazilien'}], u'symbols': [], u'hashtags': [{u'indices': [37, 52], u'text': u'ImpeachmentDay'}], u'urls': [], u'media': [{u'source_user_id': 2294780467, u'source_status_id_str': u'721662403067428864', u'expanded_url': u'http://twitter.com/GringaBrazilien/status/721662403067428864/photo/1', u'display_url': u'pic.twitter.com/mNMywDPyNC', u'url': u'https://t.co/mNMywDPyNC', u'media_url_https': u'https://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg', u'source_user_id_str': u'2294780467', u'source_status_id': 721662403067428864, u'id_str': u'721662401985257472', u'sizes': {u'small': {u'h': 170, u'resize': u'fit', u'w': 340}, u'large': {u'h': 490, u'resize': u'fit', u'w': 980}, u'medium': {u'h': 300, u'resize': u'fit', u'w': 600}, u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}}, u'indices': [139, 140], u'type': u'photo', u'id': 721662401985257472, u'media_url': u'http://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg'}]}, u'in_reply_to_screen_name': None, u'id_str': u'721718699565178880', u'retweet_count': 0, u'in_reply_to_user_id': None, u'favorited': False, u'retweeted_status': {u'contributors': None, u'truncated': False, u'text': u'Bom Dia Brasil, #ImpeachmentDay\nO n\xfamero da Beast \U0001f608 \xe9 3\u20e34\u20e32\u20e3\nO da Democracia \xe9 alegria infinita do povo Brasileiro \U0001f497 https://t.co/mNMywDPyNC', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 721662403067428864, u'favorite_count': 30, u'source': u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', u'retweeted': False, u'coordinates': None, u'entities': {u'user_mentions': [], u'symbols': [], u'hashtags': [{u'indices': [16, 31], u'text': u'ImpeachmentDay'}], u'urls': [], u'media': [{u'expanded_url': u'http://twitter.com/GringaBrazilien/status/721662403067428864/photo/1', u'display_url': u'pic.twitter.com/mNMywDPyNC', u'url': u'https://t.co/mNMywDPyNC', u'media_url_https': u'https://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg', u'id_str': u'721662401985257472', u'sizes': {u'small': {u'h': 170, u'resize': u'fit', u'w': 340}, u'large': {u'h': 490, u'resize': u'fit', u'w': 980}, u'medium': {u'h': 300, u'resize': u'fit', u'w': 600}, u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}}, u'indices': [117, 140], u'type': u'photo', u'id': 721662401985257472, u'media_url': u'http://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg'}]}, u'in_reply_to_screen_name': None, u'id_str': u'721662403067428864', u'retweet_count': 32, u'in_reply_to_user_id': None, u'favorited': False, u'user': {u'follow_request_sent': None, u'profile_use_background_image': True, u'default_profile_image': False, u'id': 2294780467, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/633424454778028032/xopKVbk__normal.jpg', u'profile_sidebar_fill_color': u'DDEEF6', u'profile_text_color': u'333333', u'followers_count': 3582, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'2294780467', u'profile_background_color': u'C0DEED', u'listed_count': 33, u'profile_background_image_url_https': u'https://pbs.twimg.com/profile_background_images/513407100194541569/ScNnjirT.jpeg', u'utc_offset': 3600, u'statuses_count': 15012, u'description': u'Isabel Monteiro: sharp eyes on Brazil: from politics to travel tips. Singer-songwriter: London - S\xe3oPaulo http://www.youtube.com/watch?v=9U3wCDrJRDY', u'friends_count': 1046, u'location': None, u'profile_link_color': u'02A0DA', u'profile_image_url': u'http://pbs.twimg.com/profile_images/633424454778028032/xopKVbk__normal.jpg', u'following': None, u'geo_enabled': False, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/2294780467/1460896614', u'profile_background_image_url': u'http://pbs.twimg.com/profile_background_images/513407100194541569/ScNnjirT.jpeg', u'name': u'Gringa Brazilien', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 17204, u'screen_name': u'GringaBrazilien', u'notifications': None, u'url': u'http://en.wikipedia.org/wiki/Isabel_Monteiro', u'created_at': u'Thu Jan 16 17:31:19 +0000 2014', u'contributors_enabled': False, u'time_zone': u'Casablanca', u'protected': False, u'default_profile': False, u'is_translator': False}, u'geo': None, u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'lang': u'pt', u'created_at': u'Sun Apr 17 11:31:39 +0000 2016', u'filter_level': u'low', u'in_reply_to_status_id_str': None, u'place': None, u'extended_entities': {u'media': [{u'expanded_url': u'http://twitter.com/GringaBrazilien/status/721662403067428864/photo/1', u'display_url': u'pic.twitter.com/mNMywDPyNC', u'url': u'https://t.co/mNMywDPyNC', u'media_url_https': u'https://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg', u'id_str': u'721662401985257472', u'sizes': {u'small': {u'h': 170, u'resize': u'fit', u'w': 340}, u'large': {u'h': 490, u'resize': u'fit', u'w': 980}, u'medium': {u'h': 300, u'resize': u'fit', u'w': 600}, u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}}, u'indices': [117, 140], u'type': u'photo', u'id': 721662401985257472, u'media_url': u'http://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg'}]}}, u'user': {u'follow_request_sent': None, u'profile_use_background_image': True, u'default_profile_image': False, u'id': 266109015, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/1280409620/Omolu_verger_normal.jpg', u'profile_sidebar_fill_color': u'DDEEF6', u'profile_text_color': u'333333', u'followers_count': 28, u'profile_sidebar_border_color': u'C0DEED', u'id_str': u'266109015', u'profile_background_color': u'C0DEED', u'listed_count': 1, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png', u'utc_offset': -10800, u'statuses_count': 974, u'description': u'Tinha eu 14 anos de idade, quando meu pai me falou. Perguntou-me se eu queria estudar filosofia, medicina ou engenharia. Tinha eu que ser doutor. Mas a ...', u'friends_count': 136, u'location': u'Minas Gerais, Brasil', u'profile_link_color': u'0084B4', u'profile_image_url': u'http://pbs.twimg.com/profile_images/1280409620/Omolu_verger_normal.jpg', u'following': None, u'geo_enabled': False, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/266109015/1458966843', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png', u'name': u'edson reinehr', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 18, u'screen_name': u'exxonre', u'notifications': None, u'url': None, u'created_at': u'Mon Mar 14 17:35:07 +0000 2011', u'contributors_enabled': False, u'time_zone': u'Brasilia', u'protected': False, u'default_profile': True, u'is_translator': False}, u'geo': None, u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'lang': u'pt', u'created_at': u'Sun Apr 17 15:15:21 +0000 2016', u'filter_level': u'low', u'in_reply_to_status_id_str': None, u'place': None, u'extended_entities': {u'media': [{u'source_user_id': 2294780467, u'source_status_id_str': u'721662403067428864', u'expanded_url': u'http://twitter.com/GringaBrazilien/status/721662403067428864/photo/1', u'display_url': u'pic.twitter.com/mNMywDPyNC', u'url': u'https://t.co/mNMywDPyNC', u'media_url_https': u'https://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg', u'source_user_id_str': u'2294780467', u'source_status_id': 721662403067428864, u'id_str': u'721662401985257472', u'sizes': {u'small': {u'h': 170, u'resize': u'fit', u'w': 340}, u'large': {u'h': 490, u'resize': u'fit', u'w': 980}, u'medium': {u'h': 300, u'resize': u'fit', u'w': 600}, u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}}, u'indices': [139, 140], u'type': u'photo', u'id': 721662401985257472, u'media_url': u'http://pbs.twimg.com/media/CgPcIY4WIAAkKAV.jpg'}]}}

In [5]:
tweets_by_lang = tweets['lang'].value_counts()

fig, ax = plt.subplots(figsize=(20,10))
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=15)
ax.set_xlabel('Idiomas'.decode('utf-8'), fontsize=20)
ax.set_ylabel('Número de tweets'.decode('utf-8') , fontsize=20)
ax.set_title('Top 4 Idiomas'.decode('utf-8'), fontsize=20, fontweight='bold')
tweets_by_lang[:4].plot(ax=ax, kind='bar', color='mediumspringgreen')
plt.grid()



In [6]:
tweets_by_country = tweets['country'].value_counts()

fig, ax = plt.subplots(figsize=(20,10))
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=15)
ax.set_xlabel('Países'.decode('utf-8'), fontsize=20)
ax.set_ylabel('Número de tweets'.decode('utf-8') , fontsize=20)
ax.set_title('Top 5 Países'.decode('utf-8'), fontsize=20, fontweight='bold')
tweets_by_country[:5].plot(ax=ax, kind='bar', color='lightskyblue')
plt.grid()



In [7]:
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

In [8]:
tweets['NaoVaiTerGolpe'] = tweets['text'].apply(lambda tweet: word_in_text('NaoVaiTerGolpe', tweet))
tweets['TchauQuerida'] = tweets['text'].apply(lambda tweet: word_in_text('TchauQuerida', tweet))
tweets['ForaDilma'] = tweets['text'].apply(lambda tweet: word_in_text('ForaDilma', tweet))
tweets['BrasilContraOGolpe'] = tweets['text'].apply(lambda tweet: word_in_text('BrasilContraOGolpe', tweet))
tweets['ForaCunha'] = tweets['text'].apply(lambda tweet: word_in_text('ForaCunha', tweet))

hashtags = ['ForaDilma', 'NaoVaiTerGolpe', 'TchauQuerida', 'BrasilContraOGolpe', 'ForaCunha']
tweets_by_hashtags = [tweets['ForaDilma'].value_counts()[True],
                      tweets['NaoVaiTerGolpe'].value_counts()[True],
                      tweets['TchauQuerida'].value_counts()[True],
                      tweets['BrasilContraOGolpe'].value_counts()[True],
                      tweets['ForaCunha'].value_counts()[True]]

plt.subplots(figsize=(10,10))
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'peachpuff']
explode = (0.03, 0.03, 0.03, 0.03, 0.03)
plt.pie(tweets_by_hashtags, explode=explode, labels=hashtags, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.rcParams['font.size'] = 15
plt.legend(tweets_by_hashtags, loc=(.95,.6), title='Número de Tweets:'.decode('utf-8'), fontsize=15)
plt.axis('equal')
plt.show()



In [9]:
tweets['nao'] = tweets['text'].apply(lambda tweet: word_in_text(' nao ', tweet))
tweets['sim'] = tweets['text'].apply(lambda tweet: word_in_text(' sim ', tweet))

tweets['ImpeachmentDay'] = tweets['text'].apply(lambda tweet: word_in_text(' sim ', tweet) 
                                          or word_in_text(' nao ', tweet))

hashtags = ['ForaDilma', 'NaoVaiTerGolpe']
tweets_by_hashtags = [tweets[tweets['ImpeachmentDay'] == True]['ForaDilma'].value_counts()[True], 
                      tweets[tweets['ImpeachmentDay'] == True]['NaoVaiTerGolpe'].value_counts()[True]]


ind = np.arange(2)
width = 0.5
width2 = 0.3
x_pos = list(range(len(hashtags)))
fig, ax = plt.subplots(figsize=(12,10))
ax.bar(ind + width2, tweets_by_hashtags, width, color='yellowgreen')
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=15)
ax.set_ylabel('Número de tweets'.decode('utf-8'), fontsize=20)
ax.set_title('Ranking: ForaDilma vs. NaoVaiTerGolpe (Votação SIM x NÃO)'.decode('utf-8'),
             fontsize=15, fontweight='bold')
ax.set_xticks([p + 1.1 * width for p in x_pos])
ax.set_xticklabels(hashtags)
plt.grid()



In [10]:
print tweets_by_hashtags


[414, 122]

In [11]:
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))

tweets_relevant = tweets[tweets['ImpeachmentDay'] == True]
tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']

print tweets_relevant_with_link[tweets_relevant_with_link['TchauQuerida'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['ForaDilma'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['ForaCunha'] == True]['link']
print tweets_relevant_with_link[tweets_relevant_with_link['NaoVaiTerGolpe'] == True]['link']


8871      https://t.co/rtOavI9uQP
8906      https://t.co/VswJPAt5ul
9619      https://t.co/IiXU4ICRrH
9685      https://t.co/tlt3wTQ969
64182     https://t.co/Odd6HZPlJU
101356    https://t.co/6hOBQnYHhW
101573    https://t.co/nmY20txeMW
102084    https://t.co/9H9LU1CebE
102695    https://t.co/FdpW6olPVF
103954    https://t.co/brnPZh3Ruc
104091    https://t.co/h9SUq42jJp
105028    https://t.co/KQd41ZTwYW
105442    https://t.co/qZszc7jYNy
105724    https://t.co/I8Wv0IBzpe
106003    https://t.co/5HcF1xPBDj
110873    https://t.co/3f9crnn6Wl
112773    https://t.co/3f9crnn6Wl
116954    https://t.co/Xw8EDpB6Vp
117108    https://t.co/cjGcnmCN8J
118773    https://t.co/26T3oPIREo
119310    https://t.co/6qYZPS4ch6
119883    https://t.co/nv8TuAkWag
120311    https://t.co/f8Q8NWD5dG
120597    https://t.co/koeOCI7qwa
120850    https://t.co/sc82GUHdHp
121499    https://t.co/syaigshcm6
123385    https://t.co/eGTNFuNDMh
124877    https://t.co/dbn3M31GhS
125940    https://t.co/6jivJOttkw
128092    https://t.co/YKpUiNpLfo
                   ...           
295641    https://t.co/Eu9pPzNueh
299679    https://t.co/98BSmFP6c4
302914    https://t.co/4HHPfznhTk
303245    https://t.co/Eu9pPzNueh
303979    https://t.co/MlWkkRv55M
304765    https://t.co/1FQMmmR5jV
305414    https://t.co/D6Sv88jRto
306520    https://t.co/ZCaOjyrr31
306698    https://t.co/PKQSkYtHkc
309693    https://t.co/ZCaOjyrr31
316029    https://t.co/yaKQQsQm1a
317746    https://t.co/zTfde807KN
320688    https://t.co/5cKXDIpWaR
320921    https://t.co/2X87rEGo5y
326569    https://t.co/Sd5tMFB9zq
329399    https://t.co/933qeF9LMP
330894    https://t.co/TvjCJ5prZK
331241    https://t.co/j10vD6EiRg
334610    https://t.co/TX82qJLtlZ
335834    https://t.co/OMa5uznIwL
338646    https://t.co/Sd5tMFB9zq
340625    https://t.co/FuJqvQguTF
343771    https://t.co/Sd5tMFB9zq
345936    https://t.co/PKLWsnZ4DL
347438    https://t.co/DT7GwnMCsq
349014    https://t.co/RxPeUXquuf
353052    https://t.co/fjDntcV5v7
354196    https://t.co/RxPeUXquuf
356578    https://t.co/Sd5tMFB9zq
357183    https://t.co/Sd5tMFB9zq
Name: link, dtype: object
435       https://t.co/2s1ypSXViK
2739      https://t.co/2s1ypSXViK
4455      https://t.co/gwm4RuymdQ
4680      https://t.co/IliUBXWBAL
5464      https://t.co/2s1ypSXViK
8871      https://t.co/rtOavI9uQP
8906      https://t.co/VswJPAt5ul
9619      https://t.co/IiXU4ICRrH
9685      https://t.co/tlt3wTQ969
14293     https://t.co/oEGiYOphCw
85219     https://t.co/mnEU2UD6vI
93018     https://t.co/vJv6YHn31R
101356    https://t.co/6hOBQnYHhW
101573    https://t.co/nmY20txeMW
102084    https://t.co/9H9LU1CebE
102695    https://t.co/FdpW6olPVF
103954    https://t.co/brnPZh3Ruc
104091    https://t.co/h9SUq42jJp
105028    https://t.co/KQd41ZTwYW
105442    https://t.co/qZszc7jYNy
105724    https://t.co/I8Wv0IBzpe
106003    https://t.co/5HcF1xPBDj
116954    https://t.co/Xw8EDpB6Vp
117108    https://t.co/cjGcnmCN8J
118773    https://t.co/26T3oPIREo
119310    https://t.co/6qYZPS4ch6
119883    https://t.co/nv8TuAkWag
120311    https://t.co/f8Q8NWD5dG
120597    https://t.co/koeOCI7qwa
120850    https://t.co/sc82GUHdHp
                   ...           
305124    https://t.co/Qcf1i5Ga47
314072    https://t.co/R1aWo2JHXV
317465    https://t.co/mQDp8xDDW5
318992    https://t.co/YpHbKBFAPE
319417    https://t.co/SkSrhs3ShJ
319809    https://t.co/SkSrhs3ShJ
323352    https://t.co/Qcf1i5Ga47
327405    https://t.co/SkSrhs3ShJ
327604    https://t.co/f3N5TNvKJi
329156    https://t.co/SkSrhs3ShJ
331812    https://t.co/kxtvPvVjtT
332563    https://t.co/xhwGlWR7CQ
334773    https://t.co/kCMBHv2RRW
341742    https://t.co/sjDsgx3B6C
341887    https://t.co/dOmC13xq7G
342701    https://t.co/ggpQwajVAO
346522                  https://…
347207               https://t.c…
347326                  https://…
347438    https://t.co/DT7GwnMCsq
347681    https://t.co/ggpQwajVAO
349949    https://t.co/ggpQwajVAO
352889    https://t.co/WWaSqfVLzU
353190    https://t.co/ggpQwajVAO
355580    https://t.co/WWaSqfVLzU
356069    https://t.co/WWaSqfVLzU
357203    https://t.co/WWaSqfVLzU
357657    https://t.co/WWaSqfVLzU
357673    https://t.co/rQt3fuumRy
357861                  https://…
Name: link, dtype: object
Series([], Name: link, dtype: object)
2746                 https://t.c…
5658      https://t.co/xwWUHCq8sv
14365     https://t.co/3zIzUJgIU6
39934     https://t.co/jafllPOIZS
77400     https://t.co/etE8Ora6g1
117269    https://t.co/e90MEQqa8f
147829    https://t.co/knpdHWK1sS
175715    https://t.co/UCQEYXZehX
185252    https://t.co/qdaxe4kcNr
256300    https://t.co/bVrlXqo8Yn
285604    https://t.co/c0ovynwfbm
309386              https://t.co…
312959    https://t.co/00j2iif2kL
314073    https://t.co/rOnfHEtRjq
325553    https://t.co/P2lhLnX1Ik
330406    https://t.co/8cy1vaYdhe
Name: link, dtype: object

In [12]:
tweets['moro'] = tweets['text'].apply(lambda tweet: word_in_text('moro', tweet))
tweets['cunha'] = tweets['text'].apply(lambda tweet: word_in_text('cunha', tweet))
tweets['bolsonaro'] = tweets['text'].apply(lambda tweet: word_in_text('bolsonaro', tweet))
tweets['lula'] = tweets['text'].apply(lambda tweet: word_in_text('lula', tweet))
tweets['temer'] = tweets['text'].apply(lambda tweet: word_in_text('temer', tweet))
tweets['feliciano'] = tweets['text'].apply(lambda tweet: word_in_text('feliciano', tweet))

hashtags = ['Sérgio Moro'.decode('utf-8'), 'Eduardo Cunha', 'Jair Bolsonaro', 'Lula', 'Marcos Feliciano', 'Michel Temer']
tweets_by_hashtags = [tweets['moro'].value_counts()[True],
                      tweets['cunha'].value_counts()[True],
                      tweets['bolsonaro'].value_counts()[True],
                      tweets['lula'].value_counts()[True],
                      tweets['feliciano'].value_counts()[True],
                      tweets['temer'].value_counts()[True]]

plt.subplots(figsize=(10,10))
colors = ['gold', 'yellowgreen', 'lightskyblue', 'lightcoral', 'peachpuff', 'mediumturquoise']
explode = (0.03, 0.03, 0.03, 0.05, 0.03, 0.03)
plt.pie(tweets_by_hashtags, explode=explode, labels=hashtags, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.rcParams['font.size'] = 15
# plt.legend(tweets_by_hashtags, loc='best')
plt.legend(tweets_by_hashtags, loc=(-.22,.6), title='Número de Tweets:'.decode('utf-8'), fontsize=15)
plt.axis('equal')
plt.show()



In [14]:
tweets['created_at'] = map(lambda tweet: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')), tweets_data)
tweets['user'] = map(lambda tweet: tweet['user']['screen_name'], tweets_data)
tweets['user_followers_count'] = map(lambda tweet: tweet['user']['followers_count'], tweets_data)
tweets['retweet_count'] = map(lambda tweet: tweet['retweet_count'], tweets_data)
tweets['favorite_count'] = map(lambda tweet: tweet['favorite_count'], tweets_data)

tweets['text'] = map(lambda tweet: tweet['text'].encode('utf-8'), tweets_data)
tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data)
tweets['Location'] = map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, tweets_data)

tweets.head()


Out[14]:
text lang country NaoVaiTerGolpe TchauQuerida ForaDilma BrasilContraOGolpe ForaCunha nao sim ... bolsonaro lula temer feliciano created_at user user_followers_count retweet_count favorite_count Location
0 RT @GringaBrazilien: Bom Dia Brasil, #Impeachm... pt None False False False False False False False ... False False False False 2016-04-17 15:15:21 exxonre 28 0 0 None
1 RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\... pt None False False False False False False False ... False False False False 2016-04-17 15:15:21 DetritoFederal1 2428 0 0 None
2 RT @GarotaCiume: Não sou petista, só não sou c... pt None False False False False False False False ... False False False False 2016-04-17 15:15:21 Souzaa_mih1 433 0 0 None
3 #ThauQuerida #ImpeachmentDay und Brasil False False False False False False False ... False False False False 2016-04-17 15:15:21 italo_filho 42 0 0 Brasil
4 RT @luadacamz: Eu quero Impeachment desse calo... pt None False False False False False False False ... False False False False 2016-04-17 15:15:22 camzminhao 443 0 0 None

5 rows × 24 columns


In [17]:
list_of_original_tweets = [element for element in tweets['text'].values if not element.startswith('RT')]

print "Número de Tweets originais : " + str(len(list_of_original_tweets))

list_of_retweets = [element for element in tweets['text'].values if element.startswith('RT')]
print "Número de Retweets : " + str(len(list_of_retweets))


Número de Tweets originais : 100025
Número de Retweets : 258268

In [18]:
def plot_tweets_per_category(category, title, x_title, y_title, top_n=5, output_filename="plot.png"):
    tweets_by_cat = category.value_counts()
    fig, ax = plt.subplots(figsize=(20,10))
    ax.tick_params(axis='x')
    ax.tick_params(axis='y')
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_title(title)
    tweets_by_cat[:top_n].plot(ax=ax, kind='bar', color='mediumturquoise')
    fig.savefig(output_filename)
    fig.show()

In [19]:
plot_tweets_per_category(tweets['user'], 
                             "#ImpeachmentDay usuarios ativos", 
                             "Usuários".decode('utf-8'), 
                             "Número de Tweets".decode('utf-8'), 20)


/Users/thiago/Envs/tcc-py2/lib/python2.7/site-packages/matplotlib/figure.py:397: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

Wordcloud


In [21]:
text = " ".join(tweets['text'].values.astype(str))

no_urls_no_tags = " ".join([word for word in text.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and word != 'RT'
                            ])

punctuation = list(string.punctuation)
stop = nltk.corpus.stopwords.words('portuguese') + punctuation
wordcloud = WordCloud(background_color="white", max_words=500,
                      stopwords=stop, width=1800, height=1400).generate(no_urls_no_tags)

plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()



In [25]:
text = " ".join(tweets['text'].values.astype(str))

no_urls_no_tags = " ".join([word for word in text.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and word != 'RT'
                            ])

tweet_coloring = imread(path.join("dilma.png"))

punctuation = list(string.punctuation)
stop = nltk.corpus.stopwords.words('portuguese') + punctuation
wordcloud = WordCloud(background_color="white", max_words=500, mask=tweet_coloring,
                      stopwords=stop, width=1800, height=1400).generate(no_urls_no_tags)

plt.figure(figsize=(10,10))
image_colors = ImageColorGenerator(tweet_coloring)
plt.imshow(wordcloud)
plt.axis("off")
plt.figure(figsize=(10,10))
plt.imshow(tweet_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.figure(figsize=(10,10))
plt.imshow(wordcloud.recolor(color_func=image_colors))
plt.axis("off")
plt.show()




Mapa


In [3]:
tweets2 = pd.read_json("data/small-data-fixed.json")
print 'OK!'
type(tweets2)


OK!
Out[3]:
pandas.core.frame.DataFrame

In [20]:
tweets2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 358293 entries, 0 to 358292
Data columns (total 31 columns):
contributors                 0 non-null float64
coordinates                  470 non-null object
created_at                   358293 non-null datetime64[ns]
entities                     358293 non-null object
extended_entities            164504 non-null object
favorite_count               358293 non-null int64
favorited                    358293 non-null bool
filter_level                 358293 non-null object
geo                          470 non-null object
id                           358293 non-null int64
id_str                       358293 non-null int64
in_reply_to_screen_name      4820 non-null object
in_reply_to_status_id        3303 non-null float64
in_reply_to_status_id_str    3303 non-null float64
in_reply_to_user_id          4820 non-null float64
in_reply_to_user_id_str      4820 non-null float64
is_quote_status              358293 non-null bool
lang                         358293 non-null object
place                        7803 non-null object
possibly_sensitive           197101 non-null float64
quoted_status                18300 non-null object
quoted_status_id             18300 non-null float64
quoted_status_id_str         18300 non-null float64
retweet_count                358293 non-null int64
retweeted                    358293 non-null bool
retweeted_status             258109 non-null object
source                       358293 non-null object
text                         358293 non-null object
timestamp_ms                 358293 non-null datetime64[ns]
truncated                    358293 non-null bool
user                         358293 non-null object
dtypes: bool(4), datetime64[ns](2), float64(8), int64(4), object(13)
memory usage: 77.9+ MB

In [15]:
coordinate = []
for col in tweets2['coordinates'][~tweets2['coordinates'].isnull()]:
    coord = col['coordinates'][::-1]
#     coord = col['coordinates']
    coordinate.append(coord)
    
print coordinate[10]


[-20.32700724, -40.33692957]

In [16]:
coord_text = []
for col in tweets2['text'][~tweets2['coordinates'].isnull()]:
    coord = col.encode('utf-8')
    coord_text.append(coord)
    
print coord_text[10]


Que seja feita a vontade de Deus! 🙏🏻😊😘

#boatarde #sejafeitaavontadededeus #foradilma #forapt… https://t.co/NrWCuYNQ8B

In [22]:
tweets2[['coordinates','text']][~tweets2['coordinates'].isnull()].head(11)


Out[22]:
coordinates text
96 {u'type': u'Point', u'coordinates': [-43.98846... @ApyusCom Isso vai ser muito, muito legal! Tô ...
257 {u'type': u'Point', u'coordinates': [-44.07219... Que o principado da corrupção seja extinto do ...
598 {u'type': u'Point', u'coordinates': [-49.94293... Esse papo de "impiechment é golpe" é tão verda...
1428 {u'type': u'Point', u'coordinates': [-47.9287,... 1. #ImpeachmentDay\n2. Leicester\n3. #DomingoD...
1572 {u'type': u'Point', u'coordinates': [-47.88279... é hoje Pátria Amada ✌ #pazeamor #impeachment #...
1615 {u'type': u'Point', u'coordinates': [-9.1413, ... 6. Benfica B\n7. Porto B\n8. #EquipaB\n9. #Imp...
1701 {u'type': u'Point', u'coordinates': [-43.98848... Saindo...\r É agora ou nunca! Pena q n tem @lo...
1723 {u'type': u'Point', u'coordinates': [-47.89916... Brasília ta fervendo hoje contra o golpe! #juv...
3149 {u'type': u'Point', u'coordinates': [-43.17408... #naovaitergolpe @ Praia De Copacabana https://...
3346 {u'type': u'Point', u'coordinates': [-43.17109... #naovaitergolpe @ Praia do Leme https://t.co/g...
3594 {u'type': u'Point', u'coordinates': [-40.33692... Que seja feita a vontade de Deus! 🙏🏻😊😘\n\n...

In [23]:
coords = tweets2['coordinates']
coords = coords[~coords.isnull()]
coords = coords.apply(lambda d: d['coordinates'][::-1])
coords.head(20)


Out[23]:
96      [-19.83755667, -43.98846667]
257       [-22.4835006, -44.0721959]
598       [-22.2458872, -49.9429378]
1428            [-15.7785, -47.9287]
1572    [-15.79442326, -47.88279271]
1615              [38.9901, -9.1413]
1701       [-19.83754, -43.98848333]
1723        [-15.7835, -47.89916389]
3149    [-22.96394383, -43.17408138]
3346          [-22.96423, -43.17109]
3594    [-20.32700724, -40.33692957]
4338      [-21.6855632, -43.3545121]
4510      [-29.6890964, -51.1366438]
4537    [-22.88908811, -43.43264018]
4886        [-29.68912, -51.1366868]
6737              [19.0728, 72.8826]
7284    [-23.65133278, -46.75984952]
7351            [-19.9319, -44.0539]
7611      [-20.9308903, -54.9694217]
7851      [-25.6440025, -49.3246874]
Name: coordinates, dtype: object

In [24]:
m = folium.Map([-14,-53.25], zoom_start=4)

for x, text in enumerate(coord_text):
    folium.Marker(coordinate[x], popup=str(coordinate[x])).add_to(m)

m


Out[24]:

In [26]:
tweets2.text.head()


Out[26]:
0    RT @GringaBrazilien: Bom Dia Brasil, #Impeachm...
1    RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\...
2    RT @GarotaCiume: Não sou petista, só não sou c...
3                         #ThauQuerida #ImpeachmentDay
4    RT @luadacamz: Eu quero Impeachment desse calo...
Name: text, dtype: object

Picos de Horários


In [27]:
tweets2['created_at'] = pd.to_datetime(pd.Series(tweets2['created_at']))

tweets2.set_index('created_at', drop=False, inplace=True)

tweets2.index = tweets2.index.tz_localize('GMT')
tweets2.index = tweets2.index - DateOffset(hours = 3)
tweets2.index

tweets2.head()


Out[27]:
contributors coordinates created_at entities extended_entities favorite_count favorited filter_level geo id ... quoted_status_id quoted_status_id_str retweet_count retweeted retweeted_status source text timestamp_ms truncated user
created_at
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 2294780467, u'indi... {u'media': [{u'source_user_id': 2294780467, u'... 0 False low None 721718699565178880 ... NaN NaN 0 False {u'contributors': None, u'truncated': False, u... <a href="http://twitter.com" rel="nofollow">Tw... RT @GringaBrazilien: Bom Dia Brasil, #Impeachm... 2016-04-17 15:15:21.518 False {u'follow_request_sent': None, u'profile_use_b...
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 87843887, u'indice... NaN 0 False low None 721718699288342528 ... NaN NaN 0 False {u'contributors': None, u'truncated': False, u... <a href="http://twitter.com/download/android" ... RT @marisascruz: NOSSO HINO!\nNOSSA BANDEIRA!\... 2016-04-17 15:15:21.452 False {u'follow_request_sent': None, u'profile_use_b...
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [{u'id': 206181302, u'indic... NaN 0 False low None 721718699418202112 ... NaN NaN 0 False {u'contributors': None, u'truncated': False, u... <a href="http://twitter.com/download/android" ... RT @GarotaCiume: Não sou petista, só não sou c... 2016-04-17 15:15:21.483 False {u'follow_request_sent': None, u'profile_use_b...
2016-04-17 12:15:21+00:00 NaN None 2016-04-17 15:15:21 {u'user_mentions': [], u'symbols': [], u'hasht... NaN 0 False low None 721718700496273408 ... NaN NaN 0 False NaN <a href="http://twitter.com/#!/download/ipad" ... #ThauQuerida #ImpeachmentDay 2016-04-17 15:15:21.740 False {u'follow_request_sent': None, u'profile_use_b...
2016-04-17 12:15:22+00:00 NaN None 2016-04-17 15:15:22 {u'user_mentions': [{u'id': 409967714, u'indic... NaN 0 False low None 721718702153023488 ... NaN NaN 0 False {u'contributors': None, u'truncated': False, u... <a href="http://twitter.com/download/android" ... RT @luadacamz: Eu quero Impeachment desse calo... 2016-04-17 15:15:22.135 False {u'follow_request_sent': None, u'profile_use_b...

5 rows × 31 columns


In [28]:
tweets30s = tweets2['created_at'].resample('1h', how='count')
tweets30s.head()


Out[28]:
created_at
2016-04-17 12:00:00+00:00    11536
2016-04-17 13:00:00+00:00     9286
2016-04-17 14:00:00+00:00    97672
2016-04-17 15:00:00+00:00    78594
2016-04-17 16:00:00+00:00    43528
Freq: H, Name: created_at, dtype: int64

In [29]:
avg = tweets30s.mean()

vincent.core.initialize_notebook()
area = vincent.Area(tweets30s)
area.colors(brew='Spectral')
area.display()