Descriptive Stats of Tweet File

Load the necessary libraries


In [3]:
%pylab inline
import matplotlib.pyplot as plt
import pandas as pd
import simplejson
from collections import Counter


Populating the interactive namespace from numpy and matplotlib

The following code loops through the file with Twitter JSON. Since pandas doesn't like to work with nested dictionaries, we pull out the nested metadata we want to analyze before creating the data frame.


In [76]:
# Build one flat record per tweet from a file holding one JSON document per
# line, and tally users, hashtags and mentions along the way. The nested
# fields we want to analyze are copied to top-level keys because the
# DataFrame is built from the top-level keys of each tweet dict.
tweets = []

# Create a counter of users, hashtags, mentions
users = Counter()
hashtags = Counter()
mentions = Counter()

# The context manager closes the file even if a line fails to parse.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open('/var/data/example_tweets.json', encoding='utf-8') as tweet_file:
    for line in tweet_file:
        # A blank line is not a JSON document; skip it instead of crashing
        if not line.strip():
            continue

        # Load each line of JSON
        tweet = simplejson.loads(line)

        # Only records that carry a 'user' key are analyzed
        if 'user' not in tweet:
            continue

        screen_name = tweet['user']['screen_name']
        users[screen_name] += 1
        tweet['screen_name'] = screen_name

        # Pull out the user's followers_count
        tweet['followers_count'] = tweet['user']['followers_count']

        # Convert the text timestamp to a datetime object
        tweet['created_at'] = pd.to_datetime(tweet['created_at'])

        entities = tweet['entities']

        # Hashtags are lowercased so that spellings differing only in case
        # (e.g. #Debates / #debates) are tallied as the same hashtag
        hashtag_texts = [ht['text'].lower() for ht in entities['hashtags']]
        hashtags.update(hashtag_texts)
        tweet['hashtag_count'] = len(hashtag_texts)

        # Create a list of mentions (screen names of the mentioned users)
        tweet['mentions'] = [m['screen_name'] for m in entities['user_mentions']]
        mentions.update(tweet['mentions'])
        tweet['mention_count'] = len(tweet['mentions'])

        # Count the URLs in the tweet
        tweet['url_count'] = len(entities['urls'])

        # Append the modified tweet to our array of tweets
        tweets.append(tweet)

df = pd.DataFrame(tweets)

Descriptive Stats

1. Graph the volume of tweets over time (tweet counts grouped by hour of day).


In [138]:
# Count tweets for each hour of day and draw the counts as a line chart
df['created_at'].groupby(df['created_at'].dt.hour).count().plot()


Out[138]:
<matplotlib.axes.AxesSubplot at 0x7fe925f311d0>

In [134]:
# Tweets per hour of day, most active hour first
df['created_at'].dt.hour.value_counts()


Out[134]:
1     1630
2     1626
3      507
0      218
4      204
5       78
14      61
23      58
6       57
13      56
11      46
15      45
12      44
18      40
17      39
16      37
21      32
20      29
22      28
10      27
7       26
19      21
9       15
8       14
dtype: int64

2. Total number of tweets


In [53]:
# Each row of df is one tweet, so the row count is the tweet count
df.shape[0]


Out[53]:
4938

3. Total number of unique users


In [54]:
# Number of distinct screen names among the tweets (missing values are not counted)
df['screen_name'].nunique(dropna=True)


Out[54]:
4856

4. Total number of unique hashtags


In [55]:
# hashtags is a Counter keyed by lowercased hashtag text, so its length is
# the number of distinct hashtags, ignoring case
len(hashtags)


Out[55]:
840

5. Graph of follower counts


In [152]:
# Distribution of follower counts as a histogram.
# value_counts() orders its result by frequency, and a line plot draws the
# points in that order, so the original line zig-zagged back and forth across
# the x-axis. A histogram bins the follower counts directly; the log-scaled
# y-axis keeps sparsely populated bins visible next to the tallest ones.
df['followers_count'].hist(bins=50, log=True)


Out[152]:
<matplotlib.axes.AxesSubplot at 0x7fe925bf5c18>

6. Number of URLs in each tweet with histogram


In [130]:
# Histogram of how many URLs each tweet contains
df['url_count'].hist()


Out[130]:
<matplotlib.axes.AxesSubplot at 0x7fe925d823c8>

In [149]:
# Number of tweets for each per-tweet URL count, most common first
df['url_count'].value_counts()


Out[149]:
0    4257
1     672
2       9
dtype: int64

7. Number of hashtags in each tweet with histogram


In [148]:
# Histogram of how many hashtags each tweet contains
df['hashtag_count'].hist()


Out[148]:
<matplotlib.axes.AxesSubplot at 0x7fe92549ff60>

In [129]:
# Number of tweets for each per-tweet hashtag count, most common first
df['hashtag_count'].value_counts()


Out[129]:
1     2437
0     1317
2      828
3      230
4       70
5       38
7        9
6        5
12       2
17       1
8        1
dtype: int64

8. Top 20 users mentioned


In [72]:
# The 20 most frequently mentioned screen names with their mention totals
mentions.most_common(n=20)


Out[72]:
[('chrisrockoz', 166),
 ('BarackObama', 132),
 ('MittRomney', 125),
 ('politifact', 88),
 ('HuffingtonPost', 88),
 ('KarlRove', 76),
 ('RomneyResponse', 63),
 ('DickMorrisTweet', 54),
 ('OnionPolitics', 54),
 ('RepubGrlProbs', 40),
 ('WayneRooney', 33),
 ('GOP', 31),
 ('indecision', 29),
 ('MichaelSkolnik', 29),
 ('gov', 29),
 ('darthvader', 27),
 ('TeamRomney', 27),
 ('ladygaga', 26),
 ('YahooNews', 22),
 ('cracked', 22)]

9. Top 20 hashtags besides #debates


In [75]:
# Using the counter of (lowercased) hashtags we created above.
# Exclude 'debates' by name instead of assuming it is ranked first, then keep
# the 20 most frequent remaining hashtags.
[(tag, count) for tag, count in hashtags.most_common() if tag != 'debates'][:20]


Out[75]:
[('lynndebate', 129),
 ('debate', 104),
 ('romney', 99),
 ('obama', 94),
 ('tcot', 81),
 ('cantafford4more', 72),
 ('stopndaa', 45),
 ('horsesandbayonets', 39),
 ('romneyryan2012', 38),
 ('debate2012', 38),
 ('mockthevote', 28),
 ('p2', 27),
 ('election2012', 26),
 ('de', 26),
 ('finaldebate', 25),
 ('debates2012', 24),
 ('hp2012', 22),
 ('bears', 19),
 ('horsesandba', 19),
 ('obama2012', 19)]

In [ ]: