In [1]:
import pandas as pd
import re
import json
import sqlite3 as lite
from datetime import datetime 

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
litecon = lite.connect('data/twitter.db')

df = pd.read_sql("SELECT tweet_id, old_screen_name, max_tweet_time, stratum, tweet, error, modified FROM sample", litecon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
df['user_id'] = df.tweet.apply(lambda x: int(x['user']['id']) if x else None)
df['max_tweet_time'] = pd.to_datetime(df.max_tweet_time)
df['modified'] = pd.to_datetime(df.modified)

user_df = pd.read_sql("SELECT user_id, screen_name, user_object, timeline, timeline_error, timeline_modified, user_modified FROM users", litecon, index_col='user_id')
user_df.index = user_df.index.astype(int)


---------------------------------------------------------------------------
DatabaseError                             Traceback (most recent call last)
<ipython-input-2-84365c73a291> in <module>()
      1 litecon = lite.connect('data/twitter.db')
      2 
----> 3 df = pd.read_sql("SELECT tweet_id, old_screen_name, max_tweet_time, stratum, tweet, error, modified FROM sample", litecon, index_col='tweet_id')
      4 df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
      5 df['user_id'] = df.tweet.apply(lambda x: int(x['user']['id']) if x else None)

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
    497             sql, index_col=index_col, params=params,
    498             coerce_float=coerce_float, parse_dates=parse_dates,
--> 499             chunksize=chunksize)
    500 
    501     try:

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
   1597 
   1598         args = _convert_params(sql, params)
-> 1599         cursor = self.execute(*args)
   1600         columns = [col_desc[0] for col_desc in cursor.description]
   1601 

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in execute(self, *args, **kwargs)
   1574             ex = DatabaseError(
   1575                 "Execution failed on sql '%s': %s" % (args[0], exc))
-> 1576             raise_with_traceback(ex)
   1577 
   1578     @staticmethod

/usr/local/lib/python2.7/site-packages/pandas/io/sql.pyc in execute(self, *args, **kwargs)
   1562                 cur.execute(*args, **kwargs)
   1563             else:
-> 1564                 cur.execute(*args)
   1565             return cur
   1566         except Exception as exc:

DatabaseError: Execution failed on sql 'SELECT tweet_id, old_screen_name, max_tweet_time, stratum, tweet, error, modified FROM sample': no such column: max_tweet_time

In [3]:
tweets = df.merge(user_df, left_on='user_id', right_index=True)

In [4]:
tweets['screen_name_changed'] = tweets.apply(lambda row: row['old_screen_name'].lower() != row['screen_name'].lower(), axis=1)

In [5]:
tweets['error'] = tweets.error.map(str) # convert None to 'None'

In [88]:
gb = df.set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'error'])
gb.size().unstack().plot()

plt.title('Errors trying to retrieve Tweet Over Tweet Age')
lgd = plt.legend(loc='upper center', bbox_to_anchor=(0.5, -.2))
plt.savefig('figures/errors.png', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [90]:
gb = tweets[tweets['screen_name_changed'] == True].set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'screen_name_changed'])
gb.size().unstack().plot(legend=False)

plt.title('Screen Name Changes Over Tweet Age')
plt.savefig('figures/screen_name_changes.png')



In [122]:
gb = tweets.set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'screen_name_changed'])['screen_name_changed']
gb = gb.size().multiply(100.0).divide(gb.size().sum(level=0))
df2 = gb.reset_index()
df2 = df2[df2['screen_name_changed'] == True]
df2 = df2.set_index('max_tweet_time')
df2[0].plot()

plt.title('Screen Name Changes Over Tweet Age\n(percent of valid tweets)')
plt.savefig('figures/screen_name_changes.png')



In [93]:
tweets['is_reply'] = tweets.tweet.map(lambda x: x['in_reply_to_status_id'] is not None)
gb = tweets.set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'is_reply'])['is_reply']
gb = gb.size().multiply(100.0).divide(gb.size().sum(level=0))
df2 = gb.reset_index()
df2 = df2[df2['is_reply'] == True]
df2 = df2.set_index('max_tweet_time')
df2[0].plot(legend=False)

plt.title('Percent of Tweets that are replies')
plt.savefig('figures/is_reply.png')



In [94]:
tweets['lang'] = tweets.tweet.map(lambda x: x['lang'])
gb = tweets.set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'lang'])
gb2 = gb.size().multiply(100.0).divide(gb.size().sum(level=0))
more_than_5p_langs = list(gb2[gb2 > 5].reset_index().lang.unique())
more_than_5p_langs.append('und')
more_than_5p_langs
all_langs = set(gb.size().unstack().columns)
gb2 = gb.size().unstack()[more_than_5p_langs]
gb2['other'] = gb.size().unstack()[list(all_langs.difference(more_than_5p_langs))].sum(axis=1)

gb2.plot.bar(stacked=True)
lgd = plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

plt.title('Number of Tweets by Language')
plt.savefig('figures/languages.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
# gb = tweets.set_index('max_tweet_time').groupby([pd.TimeGrouper(freq='M'), 'lang'])['lang']
# gb = gb.size().multiply(100.0).divide(gb.size().sum(level=0))
# df2 = gb.reset_index()
# df2 = df2[df2['is_retweet'] == True]
# df2 = df2.set_index('max_tweet_time')
# df2[0].plot(legend=False)

# plt.title('Percent of Tweets that are replies')
# plt.savefig('figures/is_reply.png')



In [123]:
ax = gb2.apply(lambda x: x*100/x.sum(), axis=1).plot.bar(stacked=True)
plt.ylim(0,100)
lgd = plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.title('Tweets by Language\n(as percent of valid tweets)')
plt.savefig('figures/languages.png', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [104]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-104-ffb9d72d02f5> in <module>()
----> 1 fig.xaxis.set_major_formatter(dates.DateFormatter('%m %Y'))

NameError: name 'dates' is not defined

In [79]:
gb2.plot.bar(stacked=True)


Out[79]:
<matplotlib.legend.Legend at 0x165661050>

In [177]:
tweets.ix['208388272223813632'].tweet


Out[177]:
{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Fri Jun 01 02:43:30 +0000 2012',
 u'entities': {u'hashtags': [{u'indices': [73, 82], u'text': u'onmylist'}],
  u'symbols': [],
  u'urls': [{u'display_url': u'annualreviews.org/doi/full/10.11\u2026',
    u'expanded_url': u'http://www.annualreviews.org/doi/full/10.1146/annurev-polisci-030810-110815',
    u'indices': [52, 72],
    u'url': u'http://t.co/rvxsjYzF'}],
  u'user_mentions': [{u'id': 6366042,
    u'id_str': u'6366042',
    u'indices': [3, 12],
    u'name': u'Vincent Raynauld',
    u'screen_name': u'VincentR'}]},
 u'favorite_count': 0,
 u'favorited': False,
 u'geo': None,
 u'id': 208388272223813632,
 u'id_str': u'208388272223813632',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'is_quote_status': False,
 u'lang': u'en',
 u'place': None,
 u'possibly_sensitive': False,
 u'retweet_count': 2,
 u'retweeted': False,
 u'retweeted_status': {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Thu May 31 21:12:33 +0000 2012',
  u'entities': {u'hashtags': [{u'indices': [59, 68], u'text': u'onmylist'}],
   u'symbols': [],
   u'urls': [{u'display_url': u'annualreviews.org/doi/full/10.11\u2026',
     u'expanded_url': u'http://www.annualreviews.org/doi/full/10.1146/annurev-polisci-030810-110815',
     u'indices': [38, 58],
     u'url': u'http://t.co/rvxsjYzF'}],
   u'user_mentions': []},
  u'favorite_count': 1,
  u'favorited': False,
  u'geo': None,
  u'id': 208304984213823490,
  u'id_str': u'208304984213823490',
  u'in_reply_to_screen_name': None,
  u'in_reply_to_status_id': None,
  u'in_reply_to_status_id_str': None,
  u'in_reply_to_user_id': None,
  u'in_reply_to_user_id_str': None,
  u'is_quote_status': False,
  u'lang': u'en',
  u'place': None,
  u'possibly_sensitive': False,
  u'retweet_count': 2,
  u'retweeted': False,
  u'source': u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
  u'text': u"The Internet's influence on politics: http://t.co/rvxsjYzF #onmylist",
  u'truncated': False,
  u'user': {u'contributors_enabled': False,
   u'created_at': u'Sun May 27 17:14:29 +0000 2007',
   u'default_profile': False,
   u'default_profile_image': False,
   u'description': u'Asst. Prof., @EmersonCollege; Research Associate, @GRCPlaval; Adviser, @SamaraCDA / E-politics, pol comm, journalism / RT \u2260 Endorsement / My views are my own',
   u'entities': {u'description': {u'urls': []},
    u'url': {u'urls': [{u'display_url': u'emerson.academia.edu/VincentRaynauld',
       u'expanded_url': u'https://emerson.academia.edu/VincentRaynauld',
       u'indices': [0, 23],
       u'url': u'https://t.co/BEt7ZM3xMI'}]}},
   u'favourites_count': 256,
   u'follow_request_sent': False,
   u'followers_count': 1385,
   u'following': False,
   u'friends_count': 896,
   u'geo_enabled': True,
   u'has_extended_profile': False,
   u'id': 6366042,
   u'id_str': u'6366042',
   u'is_translation_enabled': False,
   u'is_translator': False,
   u'lang': u'en',
   u'listed_count': 179,
   u'location': u'',
   u'name': u'Vincent Raynauld',
   u'notifications': False,
   u'profile_background_color': u'9AE4E8',
   u'profile_background_image_url': u'http://pbs.twimg.com/profile_background_images/2802935/New_York_Night_1.JPG',
   u'profile_background_image_url_https': u'https://pbs.twimg.com/profile_background_images/2802935/New_York_Night_1.JPG',
   u'profile_background_tile': False,
   u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/6366042/1427074663',
   u'profile_image_url': u'http://pbs.twimg.com/profile_images/423986145974681600/k59Rz254_normal.png',
   u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/423986145974681600/k59Rz254_normal.png',
   u'profile_link_color': u'0000FF',
   u'profile_sidebar_border_color': u'87BC44',
   u'profile_sidebar_fill_color': u'E0FF92',
   u'profile_text_color': u'000000',
   u'profile_use_background_image': True,
   u'protected': False,
   u'screen_name': u'VincentR',
   u'statuses_count': 18392,
   u'time_zone': u'Eastern Time (US & Canada)',
   u'url': u'https://t.co/BEt7ZM3xMI',
   u'utc_offset': -14400,
   u'verified': False}},
 u'source': u'<a href="http://tweetlogix.com" rel="nofollow">Tweetlogix</a>',
 u'text': u"RT @VincentR: The Internet's influence on politics: http://t.co/rvxsjYzF #onmylist",
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
  u'created_at': u'Fri Jan 02 23:08:52 +0000 2009',
  u'default_profile': False,
  u'default_profile_image': False,
  u'description': u'',
  u'entities': {u'description': {u'urls': []}},
  u'favourites_count': 602,
  u'follow_request_sent': False,
  u'followers_count': 4115,
  u'following': False,
  u'friends_count': 936,
  u'geo_enabled': False,
  u'has_extended_profile': False,
  u'id': 18566907,
  u'id_str': u'18566907',
  u'is_translation_enabled': False,
  u'is_translator': False,
  u'lang': u'en',
  u'listed_count': 175,
  u'location': u'FL',
  u'name': u'Toad',
  u'notifications': False,
  u'profile_background_color': u'2D6982',
  u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme2/bg.gif',
  u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme2/bg.gif',
  u'profile_background_tile': False,
  u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/18566907/1359594266',
  u'profile_image_url': u'http://pbs.twimg.com/profile_images/712259757055328258/8BOAlNr8_normal.jpg',
  u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/712259757055328258/8BOAlNr8_normal.jpg',
  u'profile_link_color': u'328BAB',
  u'profile_sidebar_border_color': u'FFFFFF',
  u'profile_sidebar_fill_color': u'95E8EC',
  u'profile_text_color': u'3C3940',
  u'profile_use_background_image': True,
  u'protected': False,
  u'screen_name': u'ToadonaWire',
  u'statuses_count': 79130,
  u'time_zone': u'Eastern Time (US & Canada)',
  u'url': None,
  u'utc_offset': -14400,
  u'verified': False}}