In [2]:
%matplotlib inline
#import envoy
import json
import pymongo
from bson import json_util # From pymongo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import twitter
import networkx as nx
import pickle
# Signal that the import cell ran to completion.
print("imported")
In [45]:
# Connect to the MongoDB host and select the `tweets_aug` collection of the
# `ferguson` database.
mdb = pymongo.MongoClient("mongodb://10.223.208.79")
db = mdb.ferguson
tweets = db.tweets_aug

# Time the document count.
_start = datetime.now()
print("Number of tweets")
print(tweets.count())
_end = datetime.now()

# Elapsed time is end minus start. The previous version subtracted the other
# way round (a negative timedelta) and read only `.microseconds`, which holds
# just the sub-second component, so the printed duration was wrong.
# total_seconds() reports the whole interval as a float.
_diff = _end - _start
print("Query took {} seconds".format(_diff.total_seconds()))
In [3]:
# Load the reduced August export into df. error_bad_lines=False makes pandas
# skip rows it cannot parse instead of raising, so df may contain fewer rows
# than the file has lines.
df = pd.read_csv('/home/data/august_addl_reduced.csv', error_bad_lines = False)
# Preview the first rows.
df.head()
Out[3]:
In [4]:
# Number of rows in df.
len(df)
Out[4]:
In [6]:
# Remove two columns from df in place. Because df itself is mutated, running
# this cell a second time fails: the labels are no longer present.
# NOTE(review): "favourities" is presumably the literal spelling of the CSV
# header — confirm against the file before "fixing" it.
df.drop(["retweeted_status.user.id.1", "retweeted_status.favourities_count"], axis=1, inplace=True)
In [7]:
# Column mapping (original name --> new name: meaning).
# _iso_created_at --> created: time the tweet was created
# id --> tweet_id: unique id of the tweet
# user.id --> user_id: unique id of the twitter user posting the tweet
# If the tweet is a retweet:
# retweeted_status.user.id --> rt_user_id: id of the user that created the tweet being retweeted
# retweeted_status.favorite_count --> rt_favorite_ct: favorite count of the tweet being retweeted
#   (NOTE(review): an earlier note described this as a retweet count; the field name says favorites — confirm)
# retweeted_status.user.followers_count --> rt_user_followers_ct: number of followers of the user that created the tweet being retweeted
# retweeted_status.user.friends_count --> rt_user_friends_ct: number of friends of the user that created the tweet being retweeted
# in_reply_to_status_id --> rp_tweet_id: the id of the tweet the current tweet is in reply to (is not a retweet)
# retweeted_status.in_reply_to_status_id --> rt_rp_tweet_id: tweet id of the tweet the retweeted tweet was originally in reply to
# retweeted_status.in_reply_to_user_id --> rt_rp_user_id: user id of the person the retweeted tweet was in reply to
#
# The assignment below is positional: it assumes df has exactly these ten
# columns in this order and raises if the column count differs.
df.columns = ["created", "tweet_id", "user_id", "rt_user_id", "rt_favorite_ct", "rt_user_followers_ct", "rt_user_friends_ct", "rp_tweet_id", "rt_rp_tweet_id", "rt_rp_user_id"]
# Preview the frame with its new column names.
df.head()
Out[7]:
In [8]:
# Write df to a pickle file. The path is relative, so the file lands in the
# kernel's current working directory.
df.to_pickle('august_addl_reduced.pkl')
In [24]:
# Order df in place by user_id ascending and, within each user, by `created`
# descending, so each user's latest `created` value comes first.
# DataFrame.sort was removed from pandas; sort_values is the replacement and
# takes the same keys and flags.
df.sort_values(["user_id", "created"], ascending=[True, False], inplace=True)
# Show the first 20 rows of the sorted frame.
df[:20]
Out[24]:
In [17]:
# One row per user_id, with user_id kept as an ordinary column.
# NOTE(review): first() yields the first non-null entry of each column within
# a group, so one output row can combine values from several input rows —
# confirm that is what is wanted.
fr_fl = df.groupby("user_id", as_index=False).first()
fr_fl.head(20)
Out[17]:
In [20]:
# (rows, columns) of df.
df.shape
Out[20]:
In [6]:
# Read the CSV at this absolute path into aug_red_all using pandas' default
# parsing options.
aug_red_all = pd.read_csv("/home/data/aug_reduced_all.csv")
In [4]:
# Keep only the rows whose `lang` column equals "en".
en = aug_red_all.loc[aug_red_all["lang"] == "en"]
In [7]:
# Count rows per screen name, then turn the counts into a two-column frame:
# the screen name (taken from the index) and the number of tweets.
user_tweets = en["user.screen_name"].value_counts().reset_index()
user_tweets.columns = ["screen_name", "tweets"]
In [9]:
# Summary statistics for user_tweets.
user_tweets.describe()
Out[9]:
In [8]:
# Histogram of tweets-per-user for users with more than 10 tweets, drawn on a
# logarithmic y axis.
cutoff = user_tweets[user_tweets["tweets"] > 10]
fig, ax = plt.subplots(figsize=(20, 4))
# bottom=1 starts every bar at y=1 instead of 0 — presumably so the bars have
# a valid baseline on the log axis; note this offsets bar tops by one.
ax.hist(cutoff["tweets"], bins=100, bottom=1)
ax.set_xlim(10, cutoff["tweets"].max())
ax.set_title("Distribution of Tweets by Factors of 10")
ax.set_xlabel("Number of Tweets")
ax.set_ylabel("Number of Users in Powers of 10")
ax.semilogy()
Out[8]:
In [10]:
# Bar chart of the users with at least 325 tweets, one bar per screen name.
cutoff = user_tweets[user_tweets["tweets"] >= 325]
x = np.arange(len(cutoff))
fig, ax = plt.subplots(figsize=(20, 4))
ax.bar(x, cutoff["tweets"])
# Label each bar position with its screen name, rotated to fit.
ax.set_xticks(x)
ax.set_xticklabels(cutoff["screen_name"], rotation=90, fontsize=10)
ax.set_title("Top Tweeters")
ax.set_xlabel("Username")
ax.set_ylabel("Number of Tweets")
plt.show()
In [11]:
# Sorted copy of `en`: screen name ascending, and within each screen name
# `_iso_created_at_x` descending. DataFrame.sort was removed from pandas;
# sort_values is the replacement and returns a new frame, as before.
by_user_date = en.sort_values(["user.screen_name", "_iso_created_at_x"], ascending=[True, False])
In [12]:
# Narrow by_user_date to the timestamp, screen name, friends count and
# followers count columns, in that order.
fr_fl = by_user_date[
    [
        "_iso_created_at_x",
        "user.screen_name",
        "user.friends_count",
        "user.followers_count",
    ]
]
fr_fl.head()
Out[12]:
In [13]:
# Collapse fr_fl to one row per screen name. as_index=False keeps
# 'user.screen_name' as a regular column rather than the index.
# NOTE(review): GroupBy.first() takes the first non-null entry of each column
# within a group, so values in one output row may come from different input
# rows when NaNs are present — confirm that is acceptable here.
total_fr_fl = fr_fl.groupby(['user.screen_name'], as_index=False).first()
In [16]:
# Drop the timestamp column from total_fr_fl in place. The frame is mutated,
# so re-running this cell fails once the column is gone.
total_fr_fl.drop("_iso_created_at_x", axis=1, inplace=True)
In [24]:
# Positional rename: assumes total_fr_fl has exactly three columns, in the
# order screen name, friends count, followers count.
total_fr_fl.columns = ["screen_name", "friends_count", "followers_count"]
In [25]:
# Preview the first rows of total_fr_fl.
total_fr_fl.head()
Out[25]:
In [26]:
# Histogram of friends_count across users, drawn on a logarithmic y axis.
fig, ax = plt.subplots(figsize=(20, 4))
# bottom=1 starts every bar at y=1 instead of 0 — presumably so the bars have
# a valid baseline on the log axis; note this offsets bar tops by one.
ax.hist(total_fr_fl["friends_count"], bins=100, bottom=1)
ax.set_xlim(0, total_fr_fl["friends_count"].max())
ax.set_title("Distribution of Friends by Factors of 10")
ax.set_xlabel("Number of Friends")
ax.set_ylabel("Number of Users in Powers of 10")
ax.semilogy()
Out[26]:
In [27]:
# Histogram of followers_count across users, drawn on a logarithmic y axis.
fig, ax = plt.subplots(figsize=(20, 4))
# bottom=1 starts every bar at y=1 instead of 0 — presumably so the bars have
# a valid baseline on the log axis; note this offsets bar tops by one.
ax.hist(total_fr_fl["followers_count"], bins=100, bottom=1)
ax.set_xlim(0, total_fr_fl["followers_count"].max())
ax.set_title("Distribution of Followers by Factors of 10")
ax.set_xlabel("Number of Followers")
ax.set_ylabel("Number of Users in Powers of 10")
ax.semilogy()
Out[27]:
In [41]:
# The 25 rows with the largest friends_count, highest first. DataFrame.sort
# was removed from pandas; sort_values is the replacement. The sort returns a
# new frame, so total_fr_fl itself is left in its existing order.
total_fr_fl.sort_values("friends_count", ascending=False)[:25]
Out[41]: