In [2]:
%matplotlib inline

#import envoy 
import json
import pymongo 
from bson import json_util # From  pymongo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import twitter
import networkx as nx
import pickle
print "imported"


imported

In [45]:
mdb = pymongo.MongoClient("mongodb://10.223.208.79")
db = mdb.ferguson
tweets = db.tweets_aug

_start = datetime.now()
print "Number of tweets"
print tweets.count()
_end = datetime.now()
_diff = _start - _end
print "Query took {} seconds".format((_diff.microseconds)/1000000.0)


Number of tweets
2876648
Query took 0.99913 seconds

In [3]:
df = pd.read_csv('/home/data/august_addl_reduced.csv', error_bad_lines = False)
df.head()


Out[3]:
_iso_created_at id user.id retweeted_status.user.id retweeted_status.favorite_count retweeted_status.favourities_count retweeted_status.user.id.1 retweeted_status.user.followers_count retweeted_status.user.friends_count in_reply_to_status_id retweeted_status.in_reply_to_status_id retweeted_status.in_reply_to_user_id
0 2014-08-10T22:45:46.000Z 4.986011e+17 41712041 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2014-08-10T22:45:54.000Z 4.986012e+17 1928268895 NaN NaN NaN NaN NaN NaN 4.985969e+17 NaN NaN
2 2014-08-10T22:46:20.000Z 4.986013e+17 462427721 53019309 56 NaN 53019309 23600 17179 NaN NaN NaN
3 2014-08-10T22:46:33.000Z 4.986013e+17 1952105298 51330975 506 NaN 51330975 4581 2448 NaN NaN NaN
4 2014-08-10T22:47:26.000Z 4.986015e+17 355918910 14090948 12 NaN 14090948 121280 1332 NaN NaN NaN

In [4]:
len(df)


Out[4]:
1819587

In [6]:
df.drop(["retweeted_status.user.id.1", "retweeted_status.favourities_count"], axis=1, inplace=True)

In [7]:
# _iso_created_at --> created: time tweet was created
# id -->              tweet_id: unique id of the tweet
# user.id -->         user_id: unique id of the twitter user posting the tweet
    
# If the tweet is a reteweet:
    
# retweeted_status.user.id --> rt_user_id: id of the user that created the tweet being retweeted
# retweeted_status.favorite_count --> rt_favorite_ct: number of times the original retweet was retweeted
# retweeted_status.user.followers_count --> rt_user_followers_ct: number of followers of the user that created the retweet
# retweeted_status.user.friends_count --> rt_user_friends_ct: number of friends of the user that created the retweet

# in_reply_to_status_id --> rp_tweet_id: the id of the tweet the current tweet is in reply to (is not a retweet)
# retweeted_status.in_reply_to_status_id --> rt_rp_tweet_id: tweet ID of the tweet the retweet was originally in reply to
# retweeted_status.in_reply_to_user_id' --> rt_rp_user_id: user id of the person the retweet was in reply to

df.columns = ["created", "tweet_id", "user_id", "rt_user_id", "rt_favorite_ct", "rt_user_followers_ct", "rt_user_friends_ct", "rp_tweet_id", "rt_rp_tweet_id", "rt_rp_user_id"]
df.head()


Out[7]:
created tweet_id user_id rt_user_id rt_favorite_ct rt_user_followers_ct rt_user_friends_ct rp_tweet_id rt_rp_tweet_id rt_rp_user_id
0 2014-08-10T22:45:46.000Z 4.986011e+17 41712041 NaN NaN NaN NaN NaN NaN NaN
1 2014-08-10T22:45:54.000Z 4.986012e+17 1928268895 NaN NaN NaN NaN 4.985969e+17 NaN NaN
2 2014-08-10T22:46:20.000Z 4.986013e+17 462427721 53019309 56 23600 17179 NaN NaN NaN
3 2014-08-10T22:46:33.000Z 4.986013e+17 1952105298 51330975 506 4581 2448 NaN NaN NaN
4 2014-08-10T22:47:26.000Z 4.986015e+17 355918910 14090948 12 121280 1332 NaN NaN NaN

In [8]:
df.to_pickle('august_addl_reduced.pkl')

In [24]:
df.sort(["user_id", "created"], ascending=[1,0], inplace=True)
df[:20]


Out[24]:
created tweet_id user_id rt_user_id rt_favorite_ct rt_user_followers_ct rt_user_friends_ct rp_tweet_id rt_rp_tweet_id rt_rp_user_id
1770670 2014-08-16T21:22:41.000Z 5.007545e+17 12 NaN NaN NaN NaN NaN NaN NaN
1702365 2014-08-16T18:05:41.000Z 5.007050e+17 12 NaN NaN NaN NaN NaN NaN NaN
1510856 2014-08-16T02:37:57.000Z 5.004715e+17 12 NaN NaN NaN NaN NaN NaN NaN
1501290 2014-08-16T01:52:26.000Z 5.004600e+17 12 13393052 2552 216681 390 NaN NaN NaN
1501059 2014-08-16T01:51:20.000Z 5.004598e+17 12 NaN NaN NaN NaN NaN NaN NaN
1495064 2014-08-16T01:24:37.000Z 5.004530e+17 12 14090948 123 121075 1329 NaN 5.004504e+17 163570705
1494060 2014-08-16T01:19:52.000Z 5.004518e+17 12 58 296 4396 367 NaN NaN NaN
1489066 2014-08-16T00:56:16.000Z 5.004459e+17 12 NaN NaN NaN NaN NaN NaN NaN
1477868 2014-08-16T00:05:05.000Z 5.004330e+17 12 NaN NaN NaN NaN NaN NaN NaN
1439855 2014-08-15T21:16:04.000Z 5.003905e+17 12 22990962 24 34893 614 NaN NaN NaN
1221897 2014-08-15T02:14:39.000Z 5.001032e+17 12 89887215 3559 25246 2335 NaN NaN NaN
794499 2014-08-14T04:25:34.000Z 4.997738e+17 12 16253142 234 16503 2815 NaN NaN NaN
1669639 2014-08-16T15:47:41.000Z 5.006702e+17 57 NaN NaN NaN NaN NaN NaN NaN
1630450 2014-08-16T13:05:25.000Z 5.006294e+17 57 119010935 10 3514 812 NaN NaN NaN
1627355 2014-08-16T12:47:18.000Z 5.006248e+17 57 65277959 27 5778 1143 NaN NaN NaN
1626731 2014-08-16T12:43:26.000Z 5.006239e+17 57 16326882 30 31627 1187 NaN NaN NaN
1423677 2014-08-15T20:28:38.000Z 5.003786e+17 57 NaN NaN NaN NaN NaN NaN NaN
1420373 2014-08-15T20:20:19.000Z 5.003765e+17 57 14173315 152 1936978 2894 NaN NaN NaN
1275620 2014-08-15T03:58:57.000Z 5.001295e+17 57 NaN NaN NaN NaN NaN NaN NaN
1267157 2014-08-15T03:38:11.000Z 5.001243e+17 57 36401728 26 19481 2342 NaN NaN NaN

In [17]:
fr_fl = df.groupby(['user_id'], as_index=False).first()
fr_fl[:20]


Out[17]:
user_id created tweet_id rt_user_id rt_favorite_ct rt_user_followers_ct rt_user_friends_ct rp_tweet_id rt_rp_tweet_id rt_rp_user_id
0 12 2014-08-16T21:22:41.000Z 5.007545e+17 13393052 2552 216681 390 NaN 5.004504e+17 163570705
1 57 2014-08-16T15:47:41.000Z 5.006702e+17 119010935 10 3514 812 NaN 4.997227e+17 18924291
2 58 2014-08-16T18:33:06.000Z 5.007119e+17 12 185 2740150 1278 NaN NaN NaN
3 76 2014-08-16T17:17:44.000Z 5.006929e+17 26225949 3 7248 2194 NaN 5.004426e+17 15125592
4 521 2014-08-15T22:26:41.000Z 5.004083e+17 11740172 388 4164 759 5.000310e+17 NaN NaN
5 556 2014-08-14T16:39:35.000Z 4.999585e+17 26004544 1 11186 401 NaN 4.997795e+17 14703036
6 586 2014-08-16T23:43:31.000Z 5.007900e+17 37570179 30 192393 1397 4.999611e+17 NaN NaN
7 767 2014-08-16T17:33:03.000Z 5.006968e+17 278901890 176 31662 1261 4.997343e+17 5.005121e+17 18031210
8 929 2014-08-15T02:57:39.000Z 5.001141e+17 19817778 69 28104 1826 NaN NaN NaN
9 988 2014-08-14T04:56:53.000Z 4.997817e+17 34988016 2287 3161 601 NaN NaN NaN
10 997 2014-08-15T03:34:59.000Z 5.001235e+17 46213956 24 32525 1081 4.997672e+17 NaN NaN
11 1378 2014-08-15T00:58:15.000Z 5.000840e+17 NaN NaN NaN NaN NaN NaN NaN
12 1497 2014-08-16T21:34:38.000Z 5.007575e+17 924157549 8 214 455 NaN NaN NaN
13 1720 2014-08-14T02:49:44.000Z 4.997497e+17 14348157 179 50039 382 NaN NaN NaN
14 1929 2014-08-15T19:50:54.000Z 5.003691e+17 15220768 944 172962 929 NaN NaN NaN
15 1942 2014-08-14T04:56:27.000Z 4.997816e+17 17530277 29 111704 2066 NaN NaN NaN
16 2092 2014-08-16T03:53:04.000Z 5.004904e+17 15816595 576 32936 1144 NaN NaN NaN
17 2140 2014-08-16T21:34:40.000Z 5.007576e+17 24165761 884 138151 110 NaN NaN NaN
18 2426 2014-08-16T07:57:01.000Z 5.005518e+17 765 183 11883 655 NaN NaN NaN
19 2737 2014-08-16T09:34:19.000Z 5.005763e+17 14606079 270 326988 206 NaN 4.997748e+17 24500377

In [20]:
df.shape


Out[20]:
(1819587, 10)

In [6]:
aug_red_all = pd.read_csv("/home/data/aug_reduced_all.csv")

In [4]:
en = aug_red_all[aug_red_all["lang"] == "en"]

In [7]:
user_tweets = pd.DataFrame(en["user.screen_name"].value_counts())
user_tweets.reset_index(inplace=True)
user_tweets.columns = ["screen_name", "tweets"]

In [9]:
user_tweets.describe()


Out[9]:
tweets
count 114894.000000
mean 13.550977
std 25.746768
min 1.000000
25% 4.000000
50% 7.000000
75% 13.000000
max 788.000000

In [8]:
cutoff = user_tweets[user_tweets["tweets"] > 10]

plt.figure(figsize=(20, 4))
plt.hist(cutoff["tweets"], bins=100, bottom=1)
plt.xlim(10, cutoff["tweets"].max())
plt.title("Distribution of Tweets by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Tweets")

plt.semilogy()


Out[8]:
[]

In [10]:
# cutoff = 10

cutoff = user_tweets[user_tweets["tweets"] >= 325]

x = np.arange(len(cutoff))

plt.figure(figsize=(20, 4))
plt.bar(x, cutoff["tweets"])
plt.xticks(x, cutoff["screen_name"], rotation=90, fontsize=10)
plt.title("Top Tweeters")
plt.xlabel("Username")
plt.ylabel("Number of Tweets")
plt.show()



In [11]:
by_user_date = en.sort(["user.screen_name", "_iso_created_at_x"], ascending=[1,0])

In [12]:
fr_fl = pd.concat([by_user_date["_iso_created_at_x"], by_user_date["user.screen_name"], by_user_date["user.friends_count"], by_user_date["user.followers_count"]], axis=1)
fr_fl.head()


Out[12]:
_iso_created_at_x user.screen_name user.friends_count user.followers_count
5135530 2014-08-16T06:34:18.000Z 000120o 444 1154
4824496 2014-08-15T12:27:08.000Z 0003orB 504 117
4754278 2014-08-15T02:48:05.000Z 0003orB 504 123
4434305 2014-08-14T05:56:40.000Z 0003orB 504 117
4391914 2014-08-14T05:44:57.000Z 0003orB 504 117

In [13]:
total_fr_fl = fr_fl.groupby(['user.screen_name'], as_index=False).first()

In [16]:
total_fr_fl.drop("_iso_created_at_x", axis=1, inplace=True)

In [24]:
total_fr_fl.columns = ["screen_name", "friends_count", "followers_count"]

In [25]:
total_fr_fl.head()


Out[25]:
screen_name friends_count followers_count
0 000120o 444 1154
1 0003orB 504 117
2 000Dillon000 2229 2181
3 000RowanPark000 1939 1490
4 007JamesBong_ 560 453

In [26]:
plt.figure(figsize=(20, 4))
plt.hist(total_fr_fl["friends_count"], bins=100, bottom=1)
plt.xlim(0, total_fr_fl["friends_count"].max())
plt.title("Distribution of Friends by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Friends")
plt.semilogy()


Out[26]:
[]

In [27]:
plt.figure(figsize=(20, 4))
plt.hist(total_fr_fl["followers_count"], bins=100, bottom=1)
plt.xlim(0, total_fr_fl["followers_count"].max())
plt.title("Distribution of Followers by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Followers")
plt.semilogy()


Out[27]:
[]

In [41]:
total_fr_fl.sort("friends_count", ascending=False)[:25]


Out[41]:
screen_name friends_count followers_count
40627 NesterTweets 845485 770631
45925 RWSurferGirl 404383 506159
109487 thehill 383821 505640
7504 BlackieCanela 380215 381469
55588 TheEyeOfControl 308201 295031
114604 zaibatsu 279289 505502
57947 TrueBornRecords 245904 275041
4454 AppSame 201048 978630
53388 SupermanHotMale 185552 208926
77430 egomezislas 184240 461563
53943 TIMENOUT 177518 175824
27007 JeffreyGuterman 175996 171188
65707 alhanda 174636 176647
47237 RevezNexus 166804 179400
103246 rowdytukgoon 162351 147928
59997 WashingtonDCTea 161736 165432
27815 JoeyCutless 158837 169837
81226 greensboro_nc 157899 189009
41482 NormanBuffong 154081 460424
44550 Politics_PR 152864 169513
21504 GoodMenProject 150478 148332
61623 Yubbie007 146561 148158
82888 iAmNateJames 145441 210947
71789 ceoMARS 141651 237733
82883 iAmMasonJames 139454 208222