notebook.community

Edit and run



In [20]:

    
import pandas as pd
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset


# Load the raw tweet export and parse the timestamp column into datetimes.
dataset = pd.read_csv('2pope.csv')
dataset['created_at'] = pd.to_datetime(dataset['created_at'])

# Index the frame by timestamp while keeping `created_at` as a regular column.
dataset = dataset.set_index('created_at', drop=False)

# Treat the naive timestamps as GMT, express them in CET, then move the index
# back by 12 hours.  Only the index is shifted; the `created_at` column keeps
# the parsed values.
# NOTE(review): the reason for the 12-hour shift is not visible here - confirm.
cet_index = dataset.index.tz_localize('GMT').tz_convert('CET')
dataset.index = cet_index - DateOffset(hours=12)

dataset.head()









    Out[20]:






  
    
      
      text
      created_at
      geo
      source
    
  
  
    
      2014-04-26 20:30:05+02:00
       Et autour du Vatican, des foules cherchent à a...
      2014-04-27 06:30:05
       NaN
                       iOS
    
    
      2014-04-26 20:30:05+02:00
       RT @romereportsesp: ¡HOY es la Canonización de...
      2014-04-27 06:30:05
       NaN
       Twitter for Android
    
    
      2014-04-26 20:30:06+02:00
       RT @AchinchillaA: Últimas imágenes desde Plaza...
      2014-04-27 06:30:06
       NaN
                       web
    
    
      2014-04-26 20:30:06+02:00
       RT @jackvalero: You get a great view of St Pet...
      2014-04-27 06:30:06
       NaN
          Twitter for iPad
    
    
      2014-04-26 20:30:07+02:00
       RT @fam_cristiana: #2popesaints, la lunga nott...
      2014-04-27 06:30:07
       NaN
        Twitter for iPhone



In [21]:

    
# Per-column summary of the tweets frame (count, unique, top, freq, ...).
dataset.describe()









    Out[21]:






  
    
      
      text
      created_at
      geo
      source
    
  
  
    
      count
                                                   44854
                     44854
                                                     726
                    44854
    
    
      first
                                                     NaN
       2014-04-27 06:30:05
                                                     NaN
                      NaN
    
    
      freq
                                                     359
                        16
                                                      15
                    10920
    
    
      last
                                                     NaN
       2014-04-27 14:30:04
                                                     NaN
                      NaN
    
    
      top
       RT @catolicos_es: ¡San Juan XXIII y San Juan P...
       2014-04-27 10:00:06
       { "type" : "Point", "coordinates" : [ 6.864163...
       Twitter for iPhone
    
    
      unique
                                                   19106
                     19056
                                                     642
                      210



In [22]:

    
# Count tweets per one-minute bin: resample the `created_at` column at the
# '1t' (one minute) frequency, using `count` as the aggregation.
# NOTE(review): the `how=` keyword belongs to the older pandas resample API;
# newer releases spell this `.resample(...).count()` - confirm the pinned
# pandas version before changing it.
dataset1m = dataset['created_at'].resample('1t', how='count')
dataset1m.head()









    Out[22]:





2014-04-26 20:30:00+02:00    52
2014-04-26 20:31:00+02:00    57
2014-04-26 20:32:00+02:00    70
2014-04-26 20:33:00+02:00    43
2014-04-26 20:34:00+02:00    50
dtype: int64



In [23]:

    
# Mean of the per-minute counts, i.e. the average tweet rate over the capture.
avg = dataset1m.mean()
# Build the message with %-formatting and call print with a single argument so
# the line parses under both Python 2 and Python 3.  The emitted text matches
# the `print "...", avg` statement form: label, two spaces, then the value.
print("average amount of tweets per minute  %s" % avg)









    



average amount of tweets per minute  93.2515592516



In [24]:

    
import vincent

# NOTE(review): presumably prepares the notebook front end for vincent charts;
# it is called once here before anything is displayed - confirm it is required.
vincent.core.initialize_notebook()
# Area chart built from the per-minute tweet counts in `dataset1m`.
area = vincent.Area(dataset1m)
# Colour the chart with the 'Spectral' brew.
area.colors(brew='Spectral')
# Render the chart as this cell's output.
area.display()



In [25]:

    
# Fifteen most frequent client applications ("source"), by number of tweets.
# NOTE(review): labels are counted verbatim, so near-duplicates such as
# "Twitter for Android" / "Twitter for  Android" (double space) appear as
# separate rows - normalise the strings first if they should be merged.
dataset['source'].value_counts().head(15)









    Out[25]:





Twitter for iPhone             10920
Twitter for Android            10403
web                             8855
Twitter for iPad                3423
TweetDeck                       2029
Twitter for BlackBerry®         1861
Twitter for  Android            1397
Twitter for Android Tablets      918
Mobile Web                       585
HootSuite                        549
Twitter for Windows Phone        534
TweetCaster for Android          332
Twitter for BlackBerry           332
Facebook                         258
Twitter for Mac                  220
dtype: int64



In [26]:

    
# Keep only the tweets that actually carry a geo payload; the rest are NaN.
geo = dataset['geo'].dropna()
geo.head()









    Out[26]:





2014-04-26 20:31:17+02:00    { "type" : "Point", "coordinates" : [ 43.82456...
2014-04-26 20:32:55+02:00    { "type" : "Point", "coordinates" : [ 41.90168...
2014-04-26 20:33:18+02:00    { "type" : "Point", "coordinates" : [ 44.17738...
2014-04-26 20:33:18+02:00    { "type" : "Point", "coordinates" : [ 41.90221...
2014-04-26 20:33:48+02:00    { "type" : "Point", "coordinates" : [ 30.33523...
Name: geo, dtype: object



In [27]:

    
import json
# Decode each geo JSON string and keep only its "coordinates" value.
coordinates = [json.loads(raw_point)['coordinates'] for raw_point in geo]



In [28]:

    
# Plot every geotagged tweet on a world map.
import folium

# Simple markers: start from a zoomed-out map and add one marker per tweet.
wmap = folium.Map(location=[32.4942772,-34.505193], zoom_start=2)
for c in coordinates:
    # NOTE(review): each marker receives the coordinate list exactly as stored
    # in the tweet's geo field - confirm its order matches what folium expects.
    wmap.simple_marker(c, popup='twitt')

# Write the map to '2pop_map.html' so it can be embedded in the notebook.
wmap.create_map(path='2pop_map.html')



In [29]:

    
from IPython.display import IFrame
# Embed the generated map page in the notebook output.
# NOTE(review): the URL assumes something is serving the directory that holds
# 2pop_map.html on localhost:8000; an earlier file:// variant was tied to one
# machine's absolute path. Confirm how the page is served before relying on it.
IFrame('http://localhost:8000/2pop_map.html', width=800, height=950)









    Out[29]:



In [2]:

    
# language detected
import matplotlib.pyplot as plt
from pandas import Series

# Tweet counts per detected language, as (language, count) pairs.
# NOTE(review): the `None` entry presumably holds tweets for which no language
# was detected - confirm against whatever produced these numbers.
lang = [('english', 6738), ('spanish', 6663), (None, 4700),
        ('italian', 2236), ('french', 1528), ('portuguese', 720), ('hungarian', 412),
        ('swedish', 264), ('german', 251), ('danish', 198), ('dutch', 96), ('finnish', 47),
        ('norwegian', 37), ('russian', 8), ('turkish', 4)]

# Split the pairs into parallel value / label lists for the Series.
tot = [count for label, count in lang]
index = [label for label, count in lang]

lang_series = Series(tot, index=index)

# Bar chart of the counts.  `.plot` is called on the instance rather than
# through the class (`Series.plot(series, ...)`): the class-level spelling
# only works while `plot` is a plain method and fails on pandas releases
# where it is an accessor, whereas the instance call works on both.
ax = lang_series.plot(kind='bar', title='2 Pope Saints tweets.')

# Label the axes on the object returned by the plot call instead of relying
# on pyplot's implicit "current axes" state.
ax.set_xlabel('languages')
ax.set_ylabel('count')









    Out[2]:





<matplotlib.text.Text at 0x105449c10>



In [41]:









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-41-f1f21921df37> in <module>()
      1 from numpy.random import randn
----> 2 from datetime import date_range
      3 
      4 ts = Series(randn(1000))
      5 

ImportError: cannot import name date_range



In [41]:



In [ ]:

	text	created_at	geo	source
2014-04-26 20:30:05+02:00	Et autour du Vatican, des foules cherchent à a...	2014-04-27 06:30:05	NaN	iOS
2014-04-26 20:30:05+02:00	RT @romereportsesp: ¡HOY es la Canonización de...	2014-04-27 06:30:05	NaN	Twitter for Android
2014-04-26 20:30:06+02:00	RT @AchinchillaA: Últimas imágenes desde Plaza...	2014-04-27 06:30:06	NaN	web
2014-04-26 20:30:06+02:00	RT @jackvalero: You get a great view of St Pet...	2014-04-27 06:30:06	NaN	Twitter for iPad
2014-04-26 20:30:07+02:00	RT @fam_cristiana: #2popesaints, la lunga nott...	2014-04-27 06:30:07	NaN	Twitter for iPhone

	text	created_at	geo	source
count	44854	44854	726	44854
first	NaN	2014-04-27 06:30:05	NaN	NaN
freq	359	16	15	10920
last	NaN	2014-04-27 14:30:04	NaN	NaN
top	RT @catolicos_es: ¡San Juan XXIII y San Juan P...	2014-04-27 10:00:06	{ "type" : "Point", "coordinates" : [ 6.864163...	Twitter for iPhone
unique	19106	19056	642	210