In [20]:
import pandas as pd
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset


dataset = pd.read_csv('2pope.csv')
dataset['created_at'] = pd.to_datetime(pd.Series(dataset['created_at']))
dataset.set_index('created_at', drop=False, inplace=True)
dataset.index = dataset.index.tz_localize('GMT').tz_convert('CET')
dataset.index = dataset.index - DateOffset(hours = 12)
dataset.head()


Out[20]:
text created_at geo source
2014-04-26 20:30:05+02:00 Et autour du Vatican, des foules cherchent à a... 2014-04-27 06:30:05 NaN iOS
2014-04-26 20:30:05+02:00 RT @romereportsesp: ¡HOY es la Canonización de... 2014-04-27 06:30:05 NaN Twitter for Android
2014-04-26 20:30:06+02:00 RT @AchinchillaA: Últimas imágenes desde Plaza... 2014-04-27 06:30:06 NaN web
2014-04-26 20:30:06+02:00 RT @jackvalero: You get a great view of St Pet... 2014-04-27 06:30:06 NaN Twitter for iPad
2014-04-26 20:30:07+02:00 RT @fam_cristiana: #2popesaints, la lunga nott... 2014-04-27 06:30:07 NaN Twitter for iPhone

In [21]:
dataset.describe()


Out[21]:
text created_at geo source
count 44854 44854 726 44854
first NaN 2014-04-27 06:30:05 NaN NaN
freq 359 16 15 10920
last NaN 2014-04-27 14:30:04 NaN NaN
top RT @catolicos_es: ¡San Juan XXIII y San Juan P... 2014-04-27 10:00:06 { "type" : "Point", "coordinates" : [ 6.864163... Twitter for iPhone
unique 19106 19056 642 210

In [22]:
# created_at timeseries is in a per minute minute format.
dataset1m = dataset['created_at'].resample('1t', how='count')
dataset1m.head()


Out[22]:
2014-04-26 20:30:00+02:00    52
2014-04-26 20:31:00+02:00    57
2014-04-26 20:32:00+02:00    70
2014-04-26 20:33:00+02:00    43
2014-04-26 20:34:00+02:00    50
dtype: int64

In [23]:
avg = dataset1m.mean()
print "average amount of tweets per minute ", avg


average amount of tweets per minute  93.2515592516

In [24]:
import vincent

vincent.core.initialize_notebook()
area = vincent.Area(dataset1m)
area.colors(brew='Spectral')
area.display()



In [25]:
# most used devices.
dataset.source.value_counts()[:15]


Out[25]:
Twitter for iPhone             10920
Twitter for Android            10403
web                             8855
Twitter for iPad                3423
TweetDeck                       2029
Twitter for BlackBerry®         1861
Twitter for  Android            1397
Twitter for Android Tablets      918
Mobile Web                       585
HootSuite                        549
Twitter for Windows Phone        534
TweetCaster for Android          332
Twitter for BlackBerry           332
Facebook                         258
Twitter for Mac                  220
dtype: int64

In [26]:
geo = dataset.geo
geo = geo[geo.notnull()]
geo.head()


Out[26]:
2014-04-26 20:31:17+02:00    { "type" : "Point", "coordinates" : [ 43.82456...
2014-04-26 20:32:55+02:00    { "type" : "Point", "coordinates" : [ 41.90168...
2014-04-26 20:33:18+02:00    { "type" : "Point", "coordinates" : [ 44.17738...
2014-04-26 20:33:18+02:00    { "type" : "Point", "coordinates" : [ 41.90221...
2014-04-26 20:33:48+02:00    { "type" : "Point", "coordinates" : [ 30.33523...
Name: geo, dtype: object

In [27]:
import json
coordinates = []

for g in geo:
    coordinates.append(json.loads(g)['coordinates'])

In [28]:
# plotting the map of twitts
import folium

#Simple Markers
wmap = folium.Map(location=[32.4942772,-34.505193], zoom_start=2)
for c in coordinates:
    wmap.simple_marker(c, popup='twitt')

wmap.create_map(path='2pop_map.html')

In [29]:
from IPython.display import IFrame
# IFrame('file:///Users/uolter/src/pycode/mining_social_web/2popesaints/2pop_map.html', width=750, height=350)
IFrame('http://localhost:8000/2pop_map.html', width=800, height=950)


Out[29]:

In [2]:
# language detected
import matplotlib.pyplot as plt
from pandas import Series

lang = [('english', 6738), ('spanish', 6663), (None, 4700),
        ('italian', 2236), ('french', 1528), ('portuguese', 720), ('hungarian', 412), 
        ('swedish', 264), ('german', 251), ('danish', 198), ('dutch', 96), ('finnish', 47), 
        ('norwegian', 37), ('russian', 8), ('turkish', 4)]



tot = [l[1] for l in lang]
index = [l[0] for l in lang]

lang_series = Series(tot, index=index )

# make a histogram

Series.plot(lang_series, kind='bar', 
     title='2 Pope Saints tweets.'
     )

plt.xlabel('languages')
plt.ylabel('count')


Out[2]:
<matplotlib.text.Text at 0x105449c10>

In [41]:



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-41-f1f21921df37> in <module>()
      1 from numpy.random import randn
----> 2 from datetime import date_range
      3 
      4 ts = Series(randn(1000))
      5 

ImportError: cannot import name date_range

In [41]:


In [ ]: