In [47]:
    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import datetime
    
In [2]:
    
# Location of the JSON tweet dump analysed in this notebook.
MB_PROTESTS_URL = "https://s3.amazonaws.com/far-right/twitter/mb_protests.json"
df = pd.read_json(MB_PROTESTS_URL)
    
In [3]:
    
# Display the column labels of the loaded frame to see which fields are available.
df.columns
    
    Out[3]:
In [4]:
    
# Report how many rows the loaded frame holds (one row per tweet).
tweet_total = df.shape[0]
print("Total number of tweets = {}".format(tweet_total))
    
    
In [5]:
    
# Normalise case in the hashtag and tweet-body columns so that the
# substring searches performed later are effectively case-insensitive.
for text_column in ('hashtags', 'text'):
    df[text_column] = df[text_column].str.lower()
    
In [6]:
    
# Count the tweets whose hashtag string contains 'wall'.
# na=False makes rows with a missing hashtag value count as non-matches;
# without it the mask would contain NaN and could not be used to filter.
# regex=False treats 'wall' as a plain substring rather than a pattern.
hashtag_mentions_wall = df['hashtags'].str.contains('wall', na=False, regex=False)
print("Total number of tweets containing hashtag 'wall' = {}".format(int(hashtag_mentions_wall.sum())))
    
    
In [7]:
    
# Count the tweets whose body text contains 'wall'.
# na=False makes rows with missing text count as non-matches; without it
# the mask would contain NaN and could not be used to filter.
# regex=False treats 'wall' as a plain substring rather than a pattern.
text_mentions_wall = df['text'].str.contains('wall', na=False, regex=False)
print("Total number of tweets whose body contains 'wall' = {}".format(int(text_mentions_wall.sum())))
    
    
In [8]:
    
# Keep every tweet that mentions 'wall' in either its hashtags or its body.
# na=False turns missing values into non-matches so the combined mask is
# purely boolean; regex=False treats 'wall' as a plain substring.
hashtag_mentions_wall = df['hashtags'].str.contains('wall', na=False, regex=False)
text_mentions_wall = df['text'].str.contains('wall', na=False, regex=False)
# .copy() gives an independent frame so that columns can be added to it
# later without writing into a view of df.
wall_tweets = df[hashtag_mentions_wall | text_mentions_wall].copy()
    
In [9]:
    
# Report the size of the 'wall' subset.
wall_tweet_total = wall_tweets.shape[0]
print("Total number of tweets about the 'wall' = {}".format(wall_tweet_total))
    
    
In [11]:
    
def months_between(end, start):
    """Return the number of calendar months from ``start`` to ``end``.

    Only the ``year`` and ``month`` attributes of the two arguments are
    read, so the day of the month is ignored. The result is negative when
    ``end`` falls in an earlier month than ``start``.
    """
    year_gap = end.year - start.year
    month_gap = end.month - start.month
    return 12 * year_gap + month_gap
    
In [12]:
    
# Parse both timestamp columns into pandas datetimes so that their
# year/month components can be read afterwards.
for timestamp_column in ('created', 'user_created'):
    wall_tweets[timestamp_column] = pd.to_datetime(wall_tweets[timestamp_column])
    
In [13]:
    
# Account tenure at the time of each tweet: the calendar-month difference
# between the tweet's 'created' timestamp and the account's 'user_created'.
# The row values are read by column label; integer keys such as row[0] on a
# label-indexed Series rely on a positional fallback that pandas has deprecated.
wall_tweets['user_tenure'] = wall_tweets[['created', 'user_created']].apply(
    lambda row: months_between(row['created'], row['user_created']), axis=1)
    
In [17]:
    
# Share of the 'wall' tweets (in percent) for each account-tenure value.
tweets_per_tenure = wall_tweets.groupby('user_tenure').size()
tenure_grouping = tweets_per_tenure / len(wall_tweets) * 100

# Line plot of that share against tenure in months.
fig, ax = plt.subplots()
ax.plot(tenure_grouping.index, tenure_grouping.values)
ax.set(xlabel="Acct tenure in months", ylabel="% of tweets")
plt.show()
    
    
In [22]:
    
# Number of 'wall' tweets per user name, most prolific first.
tweets_per_user = wall_tweets.groupby('user_name').size().sort_values(ascending=False)

# Plot the counts in rank order; the x position is each user's rank in the
# sorted series, not an identifier.
fig, ax = plt.subplots()
ax.plot(tweets_per_user.values)
ax.set_xlabel("User rank (by number of tweets)")
ax.set_ylabel("# of tweets")
plt.show()
    
    
In [35]:
    
# Tweet counts per (user_name, user_description) pair; show the 20 largest
# as a one-column frame.
user_tweet_counts = wall_tweets.groupby(['user_name', 'user_description']).size()
top_users = user_tweet_counts.sort_values(ascending=False).head(20)
top_users.to_frame()
    
    Out[35]:
In [57]:
    
# Horizontal box plot of the friends_count column across the 'wall' tweets.
# Drawn on an explicit Axes, with the value axis labelled, so the figure can
# be read on its own.
fig, ax = plt.subplots()
ax.boxplot(wall_tweets['friends_count'].values, vert=False)
ax.set_xlabel("friends_count")
plt.show()
    
    
In [58]:
    
# Summary statistics for the friends_count column of the 'wall' subset.
friends_count = wall_tweets['friends_count']
friends_count.describe()
    
    Out[58]:
In [39]:
    
# Number of 'wall' tweets per user_location value, largest first.
tweets_by_location = wall_tweets.groupby('user_location').size()
tweets_by_location.sort_values(ascending=False)
    
    Out[39]:
In [ ]: