In [2]:
%matplotlib inline
import folium
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator
import os
import pandas as pd
import plotly.plotly as py  # in plotly >= 4 this module moved to chart_studio.plotly
import pytz
import random
import re
import seaborn as sns
import string
import sys
import time
import vincent
from collections import Counter
from collections import defaultdict
from datetime import datetime
from matplotlib import dates
from matplotlib import rcParams
from matplotlib.ticker import MaxNLocator
# mpltools is unmaintained; matplotlib's own plt.style provides the same 'ggplot' style (used in the next cell)
from nltk import FreqDist
from nltk import bigrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from os import path
# pandas.tseries.resample.TimeGrouper was removed; use pd.Grouper(freq=...) instead
from pandas.tseries.offsets import DateOffset
from imageio import imread  # scipy.misc.imread was removed in SciPy >= 1.2
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
nltk.download('punkt')
nltk.download('mac_morpho')
nltk.download('stopwords')
Out[2]:
In [3]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
plt.style.use('ggplot')
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 7
# rcParams['font.serif'] = ['Computer Modern Roman']
rcParams['font.serif'] = ['Ubuntu']
rcParams['text.usetex'] = False
rcParams['figure.figsize'] = 20, 10
# pd.set_option('display.max_colwidth', 200)
# pd.options.display.mpl_style = 'default'
# matplotlib.style.use('ggplot')
# sns.set_context('talk')
# sns.set_style('whitegrid')
print('OK!')
In [4]:
tweets = pd.read_json("data/small-data-fixed.json")
print('OK!')
In [5]:
type(tweets)
Out[5]:
In [6]:
tweets.info()
In [7]:
# Twitter delivers coordinates in GeoJSON order (longitude, latitude);
# folium wants (latitude, longitude), so reverse each pair.
coordinate = []
for col in tweets['coordinates'][~tweets['coordinates'].isnull()]:
    coord = col['coordinates'][::-1]
    coordinate.append(coord)
print(coordinate[10])
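A minimal sanity check of the reversal on a hand-made GeoJSON point (the values here are illustrative, not from the dataset):

In [ ]:
# Hypothetical GeoJSON point roughly at São Paulo: GeoJSON order is (lon, lat).
sample = {'type': 'Point', 'coordinates': [-46.63, -23.55]}
lat_lon = sample['coordinates'][::-1]
print(lat_lon)  # [-23.55, -46.63], the (lat, lon) order folium expects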
In [8]:
# Keep the text of every geolocated tweet, aligned with the coordinate list above.
coord_text = []
for col in tweets['text'][~tweets['coordinates'].isnull()]:
    coord_text.append(col)  # str is already Unicode in Python 3; no .encode('utf-8') needed
print(coord_text[10])
In [9]:
tweets[['coordinates','text']][~tweets['coordinates'].isnull()].head(11)
Out[9]:
In [10]:
coords = tweets['coordinates']
coords = coords[~coords.isnull()]
coords = coords.apply(lambda d: d['coordinates'][::-1])
coords.head(20)
Out[10]:
In [11]:
# Center the map roughly on Brazil and add one marker per geolocated tweet,
# using the tweet text as the popup.
m = folium.Map([-14, -53.25], zoom_start=4)
for x, text in enumerate(coord_text):
    folium.Marker(coordinate[x], popup=text).add_to(m)
m
Out[11]:
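Rendering many markers inline can make the notebook sluggish; folium can also write the map to a standalone HTML file (the filename below is just an example):

In [ ]:
# Save the marker map for viewing outside the notebook.
m.save('tweet_map.html')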
In [14]:
tweets.text.head()
Out[14]:
In [15]:
tweets.user.head()
Out[15]:
In [16]:
df = pd.DataFrame()
df['text'] = tweets['text']
df['coordinates'] = tweets['coordinates']
df['user'] = tweets['user']
df.head()
# df['text'] = map(lambda df: df['text'].encode('utf-8'), tweets)
# df['user'] = map(lambda df: df['user']['screen_name'], tweets)
Out[16]:
In [ ]:
def datetimeify(df):
    # Parse created_at strings into real datetimes.
    df['created_at'] = pd.DatetimeIndex(df.created_at)
    return df
In [ ]:
def sentiment(df):
    # TextBlob scores each text with polarity in [-1, 1] and subjectivity in [0, 1].
    text = df.dropna(subset=['text']).text
    scores = text.apply(lambda t: TextBlob(t).sentiment)
    df['polarity'] = scores.apply(lambda s: s.polarity)
    df['subjectivity'] = scores.apply(lambda s: s.subjectivity)
    return df
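As a quick illustration of what TextBlob returns, here is a one-off call on a hand-written sentence (not from the dataset). Note that TextBlob's default analyzer is trained on English text, so scores on Portuguese tweets will be rough:

In [ ]:
# One-off TextBlob call mirroring what sentiment() does per row.
example = TextBlob("What a great day for politics!")
print(example.sentiment.polarity, example.sentiment.subjectivity)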
In [ ]:
def influence(df):
    # Geometric mean of follower count and retweet count (each +1 to avoid zeros):
    # reach grows with both audience size and how far the tweet travelled.
    internal = np.sqrt(df.user_followers_count + 1)
    external = np.sqrt(df.retweet_count + 1)
    df['influence'] = internal * external
    return df
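The square roots damp the huge dynamic range of follower counts: for a hypothetical tweet from an account with 9,999 followers retweeted 99 times, influence is sqrt(10000) * sqrt(100) = 1000, rather than a raw product of roughly a million:

In [ ]:
# Worked example of the influence formula with made-up counts.
followers, retweets = 9999, 99
print(np.sqrt(followers + 1) * np.sqrt(retweets + 1))  # 1000.0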
In [ ]:
def influenced_polarity(df):
    # Weight each tweet's polarity by its influence score.
    df['influenced_polarity'] = df.polarity * df['influence']
    return df
In [ ]:
def georeference(df):
    # The place field arrives as a string; pull out the first two decimal numbers.
    def place_to_coordinate(place_str, kind):
        if pd.isnull(place_str):
            return float('nan')
        number_matcher = r'(-?\d+\.\d+)[,\]]'
        coordinates = re.findall(number_matcher, place_str)
        coordinate = tuple(float(n) for n in coordinates[:2])
        if kind == 'longitude':
            return coordinate[0]
        elif kind == 'latitude':
            return coordinate[1]
    df['latitude'] = df.place.apply(place_to_coordinate, kind='latitude')
    df['longitude'] = df.place.apply(place_to_coordinate, kind='longitude')
    return df
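The regex simply grabs each decimal number that is followed by a comma or a closing bracket in the stringified place field. A quick check against a made-up bounding-box string (not a real record from the dataset):

In [ ]:
# Hypothetical stringified place field; findall returns numbers in order of appearance.
place_str = "[[-46.82, -24.01], [-46.36, -23.35]]"
print(re.findall(r'(-?\d+\.\d+)[,\]]', place_str))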
In [ ]:
def preprocess(df):
    # Note: the tokenizer cell below redefines preprocess() for strings;
    # rename one of the two if both are needed in the same session.
    return df.pipe(datetimeify)
In [ ]:
def preprocess_df(df):
    cleaned = df.pipe(set_hashtags)  # set_hashtags is assumed to be defined elsewhere in the notebook
    copy = cleaned.copy()
    return preprocess(copy)
In [ ]:
def load_df(input_filename):
    raw_df = pd.read_json(input_filename)
    return preprocess(raw_df)
print('OK')
In [19]:
tweets['created_at'] = pd.to_datetime(pd.Series(tweets['created_at']))
tweets.set_index('created_at', drop=False, inplace=True)
# Twitter timestamps are UTC; shift three hours back to Brasília time (UTC-3).
tweets.index = tweets.index.tz_localize('GMT')
tweets.index = tweets.index - DateOffset(hours=3)
tweets.index
tweets.head()
Out[19]:
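Subtracting a fixed DateOffset is fine within a single season, but it ignores daylight-saving changes; since pytz is already imported, a DST-aware alternative is to convert the localized index directly. A sketch on an arbitrary timestamp, not applied to the DataFrame above:

In [ ]:
# DST-aware alternative to the fixed -3h shift.
idx = pd.DatetimeIndex(['2016-04-17 18:00']).tz_localize('GMT')
print(idx.tz_convert('America/Sao_Paulo'))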
In [20]:
# resample(..., how='count') was removed in newer pandas; chain .count() instead.
tweets1h = tweets['created_at'].resample('1h').count()
tweets1h.head()
Out[20]:
In [21]:
avg = tweets1h.mean()
vincent.core.initialize_notebook()
area = vincent.Area(tweets1h)
area.colors(brew='Spectral')
area.display()
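vincent has been unmaintained for years and its notebook integration often breaks in current Jupyter; if the area chart fails to render, an equivalent plot with the matplotlib stack already imported above is a safe fallback:

In [ ]:
# Fallback: the same hourly tweet-count series as a matplotlib/pandas area chart.
fig, ax = plt.subplots()
tweets1h.plot(kind='area', alpha=0.5, ax=ax)
ax.axhline(avg, linestyle='--', color='k', label='mean')
ax.set_xlabel('time')
ax.set_ylabel('tweets per hour')
ax.legend()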
In [22]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=True):
    # Lowercase every token except emoticons, whose case matters (:D vs :d).
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

tweet = "RT @medeirosthiiago: testando exemplo TCC! :D http://example.com #ImpeachmentDay"
print(preprocess(tweet))
# ['rt', '@medeirosthiiago', ':', 'testando', 'exemplo', 'tcc', '!', ':D', 'http://example.com', '#impeachmentday']
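With the tokenizer in place, the Counter imported at the top makes a term-frequency pass over the corpus a few lines. A sketch over the DataFrame loaded above; filtering out stop words is left for a later step:

In [ ]:
# Rough token frequencies over all tweet texts, using the preprocess() defined above.
count_all = Counter()
for text in tweets['text'].dropna():
    count_all.update(preprocess(text))
print(count_all.most_common(10))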