In [2]:
import json
import bz2

# Read the bz2-compressed dump in text mode; every line is later decoded with
# json.loads, so `data` ends up as a list of raw JSON strings (one per line).
# The context manager closes the archive as soon as the lines are in memory,
# and the explicit encoding avoids depending on the platform's locale default
# (JSON interchange text is UTF-8).
with bz2.open('./data/test.20140209-140126.json.bz2', 'rt', encoding='utf-8') as tweets_file:
    data = tweets_file.readlines()

In [3]:
# Peek at a single record (line index 101) to see which fields a tweet carries.
sample_line = data[101]
json.loads(sample_line)


Out[3]:
{'entities': {'symbols': [],
  'hashtags': [{'text': 'NowListening', 'indices': [0, 13]}],
  'user_mentions': [],
  'urls': []},
 'retweet_count': 0,
 'in_reply_to_status_id': None,
 'geo': None,
 'coordinates': None,
 'text': '#NowListening Ylvis - The Fox (What Does The Fox Say)',
 'contributors': None,
 'in_reply_to_user_id_str': None,
 'lang': 'en',
 'created_at': 'Sun Feb 09 14:34:35 +0000 2014',
 'favorited': False,
 'filter_level': 'medium',
 'in_reply_to_user_id': None,
 'truncated': False,
 'place': None,
 'id_str': '432522924503740416',
 'in_reply_to_status_id_str': None,
 'user': {'profile_use_background_image': True,
  'protected': False,
  'default_profile': True,
  'name': 'Ismail Iskandar',
  'statuses_count': 7213,
  'profile_text_color': '333333',
  'url': None,
  'follow_request_sent': None,
  'favourites_count': 46,
  'is_translator': False,
  'screen_name': 'RusaKurus',
  'is_translation_enabled': False,
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_link_color': '0084B4',
  'followers_count': 271,
  'id_str': '1479875263',
  'contributors_enabled': False,
  'utc_offset': None,
  'time_zone': None,
  'profile_sidebar_fill_color': 'DDEEF6',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/414770965331988480/bZvuU09G_normal.jpeg',
  'created_at': 'Mon Jun 03 14:37:20 +0000 2013',
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1479875263/1387873160',
  'location': 'EAST BORNEO - INDONESIA',
  'listed_count': 0,
  'verified': False,
  'notifications': None,
  'profile_image_url': 'http://pbs.twimg.com/profile_images/414770965331988480/bZvuU09G_normal.jpeg',
  'following': None,
  'description': 'Your Friend',
  'geo_enabled': True,
  'friends_count': 159,
  'default_profile_image': False,
  'profile_sidebar_border_color': 'C0DEED',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'lang': 'en',
  'profile_background_color': 'C0DEED',
  'profile_background_tile': False,
  'id': 1479875263},
 'source': '<a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry®</a>',
 'id': 432522924503740416,
 'favorite_count': 0,
 'retweeted': False,
 'in_reply_to_screen_name': None}

In [3]:
# Export one CSV row per tweet: id, creation timestamp, and the author's
# profile time zone.
# The `with` block guarantees the file is flushed and closed even if a record
# fails to decode or lacks an expected key part-way through the loop, so no
# per-row flush() or explicit close() is needed.
with open('./data/twitter.csv', 'w') as tw_data:
    for tw in data:
        dt = json.loads(tw)
        # Fields are joined with ', ' (comma + space), so the second and third
        # columns carry a leading space; a missing time zone (None) is written
        # out as the literal text 'None'.
        tw_data.write('%s, %s, %s\n' % (dt['id'], dt['created_at'], dt['user']['time_zone']))

In [4]:
import pandas as pd
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib as mpl

# Pandas display preferences: plain-text reprs instead of HTML tables, and
# caps on how many columns / rows are printed.
for option_name, option_value in (
    ('display.notebook_repr_html', False),
    ('display.max_columns', 20),
    ('display.max_rows', 25),
):
    pd.set_option(option_name, option_value)

# Default size (in inches) for every matplotlib figure created from here on.
mpl.rcParams['figure.figsize'] = (10, 8)

In [5]:
# Load the exported tweets. Column names are supplied explicitly, and the
# second column (position 1) is parsed as dates and used as the index.
df = pd.read_csv(
    './data/twitter.csv',
    names=['id', 'date', 'time-zone'],
    parse_dates=[1],
    index_col=[1],
)

In [6]:
df.head()


Out[6]:
                                     id                    time-zone
date                                                                
2014-02-09 14:02:01  432514725994176512                         None
2014-02-09 14:02:22  432514817526480896                       Alaska
2014-02-09 14:02:29  432514846022590465                 Kuala Lumpur
2014-02-09 14:02:56  432514959080050688                         None
2014-02-09 14:03:46  432515170112266240   Pacific Time (US & Canada)

[5 rows x 2 columns]

In [7]:
# Number of tweets per hour: bucket rows by their datetime index into hourly
# bins and count the ids in each bin.
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq=...) is its
# replacement and groups on the index in the same way.
hourly = df.groupby(pd.Grouper(freq='h'))['id'].count()

In [8]:
# Show the per-hour tweet counts.
hourly


Out[8]:
2014-02-09 14:00:00    156
2014-02-09 15:00:00    180
2014-02-09 16:00:00    105
2014-02-09 17:00:00     99
2014-02-09 18:00:00     70
2014-02-09 19:00:00     62
2014-02-09 20:00:00     70
2014-02-09 21:00:00     51
2014-02-09 22:00:00     69
2014-02-09 23:00:00    107
...
2014-02-11 12:00:00     98
2014-02-11 13:00:00    195
2014-02-11 14:00:00    210
2014-02-11 15:00:00    169
2014-02-11 16:00:00    131
2014-02-11 17:00:00     78
2014-02-11 18:00:00     63
2014-02-11 19:00:00     82
2014-02-11 20:00:00     51
2014-02-11 21:00:00     48
2014-02-11 22:00:00     18
Length: 57

In [9]:
# Note: All times are in GMT

# Restrict the hourly counts to 9-10 February 2014. Slicing a datetime index
# with date strings includes every timestamp on both end dates.
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based slice.
hourly.loc['2014-02-09':'2014-02-10']


Out[9]:
2014-02-09 14:00:00    156
2014-02-09 15:00:00    180
2014-02-09 16:00:00    105
2014-02-09 17:00:00     99
2014-02-09 18:00:00     70
2014-02-09 19:00:00     62
2014-02-09 20:00:00     70
2014-02-09 21:00:00     51
2014-02-09 22:00:00     69
2014-02-09 23:00:00    107
...
2014-02-10 13:00:00    145
2014-02-10 14:00:00    147
2014-02-10 15:00:00    204
2014-02-10 16:00:00    235
2014-02-10 17:00:00    151
2014-02-10 18:00:00    113
2014-02-10 19:00:00     94
2014-02-10 20:00:00     75
2014-02-10 21:00:00     69
2014-02-10 22:00:00     69
2014-02-10 23:00:00     85
Length: 34

In [10]:
# Line chart of the hourly tweet counts over the whole capture period.
hourly.plot(kind='line')


Out[10]:
<matplotlib.axes.AxesSubplot at 0x1069fbb90>

In [11]:
# Line chart of the hourly counts for 11 February 2014 only.
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based slice.
hourly.loc['2014-02-11':'2014-02-11'].plot()


Out[11]:
<matplotlib.axes.AxesSubplot at 0x106f7ac10>

In [12]:
# Bar chart of the hourly counts for 10 February 2014 only.
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based slice.
hourly.loc['2014-02-10':'2014-02-10'].plot(kind='bar')


Out[12]:
<matplotlib.axes.AxesSubplot at 0x106fa3210>

In [13]:
# Tweets per hour broken down by time zone: for each hourly bin, group that
# hour's rows by 'time-zone' and count the ids, giving a Series indexed by
# (hour, time-zone).
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq=...) replaces it.
a = df.groupby(pd.Grouper(freq='h')).apply(
    lambda hour_rows: hour_rows.groupby('time-zone')['id'].count()
)

In [14]:
# Show the per-hour, per-time-zone tweet counts.
a


Out[14]:
                     time-zone                  
2014-02-09 14:00:00   Alaska                         4
                      Amsterdam                      1
                      Arizona                        3
                      Athens                         1
                      Bangkok                        6
                      Beijing                       12
                      Buenos Aires                   1
                      Casablanca                     1
                      Central Time (US & Canada)     2
                      Eastern Time (US & Canada)     2
...
2014-02-11 22:00:00   Atlantic Time (Canada)        1
                      Brasilia                      1
                      Central Time (US & Canada)    1
                      Eastern Time (US & Canada)    2
                      Hawaii                        2
                      Istanbul                      1
                      Kuala Lumpur                  1
                      None                          3
                      Quito                         3
                      Singapore                     1
                      Tokyo                         2
Length: 1364, dtype: int64

In [15]:
# Total tweets across all time zones in the 16:00 hour of 10 February 2014.
# `.ix` was removed in pandas 1.0; `.loc` selects the same hour by label on
# the first index level.
a.loc['2014-02-10 16:00:00'].sum()


Out[15]:
235

In [16]:
# Bar chart of the per-time-zone counts for the 16:00 hour of 10 February 2014.
# `.ix` was removed in pandas 1.0; `.loc` selects the same hour by label on
# the first index level.
a.loc['2014-02-10 16:00:00'].plot(kind='bar')


Out[16]:
<matplotlib.axes.AxesSubplot at 0x1070a6f50>