notebook.community

Edit and run



In [2]:

    
%matplotlib inline

#import envoy 
import json
import pymongo 
from bson import json_util # From  pymongo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import twitter
import networkx as nx
import pickle
print "imported"









    



imported



In [45]:

    
mdb = pymongo.MongoClient("mongodb://10.223.208.79")
db = mdb.ferguson
tweets = db.tweets_aug

_start = datetime.now()
print "Number of tweets"
print tweets.count()
_end = datetime.now()
_diff = _start - _end
print "Query took {} seconds".format((_diff.microseconds)/1000000.0)









    



Number of tweets
2876648
Query took 0.99913 seconds



In [3]:

    
df = pd.read_csv('/home/data/august_addl_reduced.csv', error_bad_lines = False)
df.head()









    Out[3]:






  
    
      
      _iso_created_at
      id
      user.id
      retweeted_status.user.id
      retweeted_status.favorite_count
      retweeted_status.favourities_count
      retweeted_status.user.id.1
      retweeted_status.user.followers_count
      retweeted_status.user.friends_count
      in_reply_to_status_id
      retweeted_status.in_reply_to_status_id
      retweeted_status.in_reply_to_user_id
    
  
  
    
      0
       2014-08-10T22:45:46.000Z
       4.986011e+17
         41712041
            NaN
       NaN
      NaN
            NaN
          NaN
         NaN
                NaN
      NaN
      NaN
    
    
      1
       2014-08-10T22:45:54.000Z
       4.986012e+17
       1928268895
            NaN
       NaN
      NaN
            NaN
          NaN
         NaN
       4.985969e+17
      NaN
      NaN
    
    
      2
       2014-08-10T22:46:20.000Z
       4.986013e+17
        462427721
       53019309
        56
      NaN
       53019309
        23600
       17179
                NaN
      NaN
      NaN
    
    
      3
       2014-08-10T22:46:33.000Z
       4.986013e+17
       1952105298
       51330975
       506
      NaN
       51330975
         4581
        2448
                NaN
      NaN
      NaN
    
    
      4
       2014-08-10T22:47:26.000Z
       4.986015e+17
        355918910
       14090948
        12
      NaN
       14090948
       121280
        1332
                NaN
      NaN
      NaN



In [4]:

    
len(df)









    Out[4]:





1819587



In [6]:

    
df.drop(["retweeted_status.user.id.1", "retweeted_status.favourities_count"], axis=1, inplace=True)



In [7]:

    
# _iso_created_at --> created: time tweet was created
# id -->              tweet_id: unique id of the tweet
# user.id -->         user_id: unique id of the twitter user posting the tweet
    
# If the tweet is a reteweet:
    
# retweeted_status.user.id --> rt_user_id: id of the user that created the tweet being retweeted
# retweeted_status.favorite_count --> rt_favorite_ct: number of times the original retweet was retweeted
# retweeted_status.user.followers_count --> rt_user_followers_ct: number of followers of the user that created the retweet
# retweeted_status.user.friends_count --> rt_user_friends_ct: number of friends of the user that created the retweet

# in_reply_to_status_id --> rp_tweet_id: the id of the tweet the current tweet is in reply to (is not a retweet)
# retweeted_status.in_reply_to_status_id --> rt_rp_tweet_id: tweet ID of the tweet the retweet was originally in reply to
# retweeted_status.in_reply_to_user_id' --> rt_rp_user_id: user id of the person the retweet was in reply to

df.columns = ["created", "tweet_id", "user_id", "rt_user_id", "rt_favorite_ct", "rt_user_followers_ct", "rt_user_friends_ct", "rp_tweet_id", "rt_rp_tweet_id", "rt_rp_user_id"]
df.head()









    Out[7]:






  
    
      
      created
      tweet_id
      user_id
      rt_user_id
      rt_favorite_ct
      rt_user_followers_ct
      rt_user_friends_ct
      rp_tweet_id
      rt_rp_tweet_id
      rt_rp_user_id
    
  
  
    
      0
       2014-08-10T22:45:46.000Z
       4.986011e+17
         41712041
            NaN
       NaN
          NaN
         NaN
                NaN
      NaN
      NaN
    
    
      1
       2014-08-10T22:45:54.000Z
       4.986012e+17
       1928268895
            NaN
       NaN
          NaN
         NaN
       4.985969e+17
      NaN
      NaN
    
    
      2
       2014-08-10T22:46:20.000Z
       4.986013e+17
        462427721
       53019309
        56
        23600
       17179
                NaN
      NaN
      NaN
    
    
      3
       2014-08-10T22:46:33.000Z
       4.986013e+17
       1952105298
       51330975
       506
         4581
        2448
                NaN
      NaN
      NaN
    
    
      4
       2014-08-10T22:47:26.000Z
       4.986015e+17
        355918910
       14090948
        12
       121280
        1332
                NaN
      NaN
      NaN



In [8]:

    
df.to_pickle('august_addl_reduced.pkl')



In [24]:

    
df.sort(["user_id", "created"], ascending=[1,0], inplace=True)
df[:20]









    Out[24]:






  
    
      
      created
      tweet_id
      user_id
      rt_user_id
      rt_favorite_ct
      rt_user_followers_ct
      rt_user_friends_ct
      rp_tweet_id
      rt_rp_tweet_id
      rt_rp_user_id
    
  
  
    
      1770670
       2014-08-16T21:22:41.000Z
       5.007545e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1702365
       2014-08-16T18:05:41.000Z
       5.007050e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1510856
       2014-08-16T02:37:57.000Z
       5.004715e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1501290
       2014-08-16T01:52:26.000Z
       5.004600e+17
       12
        13393052
       2552
        216681
        390
      NaN
                NaN
             NaN
    
    
      1501059
       2014-08-16T01:51:20.000Z
       5.004598e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1495064
       2014-08-16T01:24:37.000Z
       5.004530e+17
       12
        14090948
        123
        121075
       1329
      NaN
       5.004504e+17
       163570705
    
    
      1494060
       2014-08-16T01:19:52.000Z
       5.004518e+17
       12
              58
        296
          4396
        367
      NaN
                NaN
             NaN
    
    
      1489066
       2014-08-16T00:56:16.000Z
       5.004459e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1477868
       2014-08-16T00:05:05.000Z
       5.004330e+17
       12
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1439855
       2014-08-15T21:16:04.000Z
       5.003905e+17
       12
        22990962
         24
         34893
        614
      NaN
                NaN
             NaN
    
    
      1221897
       2014-08-15T02:14:39.000Z
       5.001032e+17
       12
        89887215
       3559
         25246
       2335
      NaN
                NaN
             NaN
    
    
      794499 
       2014-08-14T04:25:34.000Z
       4.997738e+17
       12
        16253142
        234
         16503
       2815
      NaN
                NaN
             NaN
    
    
      1669639
       2014-08-16T15:47:41.000Z
       5.006702e+17
       57
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1630450
       2014-08-16T13:05:25.000Z
       5.006294e+17
       57
       119010935
         10
          3514
        812
      NaN
                NaN
             NaN
    
    
      1627355
       2014-08-16T12:47:18.000Z
       5.006248e+17
       57
        65277959
         27
          5778
       1143
      NaN
                NaN
             NaN
    
    
      1626731
       2014-08-16T12:43:26.000Z
       5.006239e+17
       57
        16326882
         30
         31627
       1187
      NaN
                NaN
             NaN
    
    
      1423677
       2014-08-15T20:28:38.000Z
       5.003786e+17
       57
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1420373
       2014-08-15T20:20:19.000Z
       5.003765e+17
       57
        14173315
        152
       1936978
       2894
      NaN
                NaN
             NaN
    
    
      1275620
       2014-08-15T03:58:57.000Z
       5.001295e+17
       57
             NaN
        NaN
           NaN
        NaN
      NaN
                NaN
             NaN
    
    
      1267157
       2014-08-15T03:38:11.000Z
       5.001243e+17
       57
        36401728
         26
         19481
       2342
      NaN
                NaN
             NaN



In [17]:

    
fr_fl = df.groupby(['user_id'], as_index=False).first()
fr_fl[:20]









    Out[17]:






  
    
      
      user_id
      created
      tweet_id
      rt_user_id
      rt_favorite_ct
      rt_user_followers_ct
      rt_user_friends_ct
      rp_tweet_id
      rt_rp_tweet_id
      rt_rp_user_id
    
  
  
    
      0 
         12
       2014-08-16T21:22:41.000Z
       5.007545e+17
        13393052
       2552
        216681
        390
                NaN
       5.004504e+17
       163570705
    
    
      1 
         57
       2014-08-16T15:47:41.000Z
       5.006702e+17
       119010935
         10
          3514
        812
                NaN
       4.997227e+17
        18924291
    
    
      2 
         58
       2014-08-16T18:33:06.000Z
       5.007119e+17
              12
        185
       2740150
       1278
                NaN
                NaN
             NaN
    
    
      3 
         76
       2014-08-16T17:17:44.000Z
       5.006929e+17
        26225949
          3
          7248
       2194
                NaN
       5.004426e+17
        15125592
    
    
      4 
        521
       2014-08-15T22:26:41.000Z
       5.004083e+17
        11740172
        388
          4164
        759
       5.000310e+17
                NaN
             NaN
    
    
      5 
        556
       2014-08-14T16:39:35.000Z
       4.999585e+17
        26004544
          1
         11186
        401
                NaN
       4.997795e+17
        14703036
    
    
      6 
        586
       2014-08-16T23:43:31.000Z
       5.007900e+17
        37570179
         30
        192393
       1397
       4.999611e+17
                NaN
             NaN
    
    
      7 
        767
       2014-08-16T17:33:03.000Z
       5.006968e+17
       278901890
        176
         31662
       1261
       4.997343e+17
       5.005121e+17
        18031210
    
    
      8 
        929
       2014-08-15T02:57:39.000Z
       5.001141e+17
        19817778
         69
         28104
       1826
                NaN
                NaN
             NaN
    
    
      9 
        988
       2014-08-14T04:56:53.000Z
       4.997817e+17
        34988016
       2287
          3161
        601
                NaN
                NaN
             NaN
    
    
      10
        997
       2014-08-15T03:34:59.000Z
       5.001235e+17
        46213956
         24
         32525
       1081
       4.997672e+17
                NaN
             NaN
    
    
      11
       1378
       2014-08-15T00:58:15.000Z
       5.000840e+17
             NaN
        NaN
           NaN
        NaN
                NaN
                NaN
             NaN
    
    
      12
       1497
       2014-08-16T21:34:38.000Z
       5.007575e+17
       924157549
          8
           214
        455
                NaN
                NaN
             NaN
    
    
      13
       1720
       2014-08-14T02:49:44.000Z
       4.997497e+17
        14348157
        179
         50039
        382
                NaN
                NaN
             NaN
    
    
      14
       1929
       2014-08-15T19:50:54.000Z
       5.003691e+17
        15220768
        944
        172962
        929
                NaN
                NaN
             NaN
    
    
      15
       1942
       2014-08-14T04:56:27.000Z
       4.997816e+17
        17530277
         29
        111704
       2066
                NaN
                NaN
             NaN
    
    
      16
       2092
       2014-08-16T03:53:04.000Z
       5.004904e+17
        15816595
        576
         32936
       1144
                NaN
                NaN
             NaN
    
    
      17
       2140
       2014-08-16T21:34:40.000Z
       5.007576e+17
        24165761
        884
        138151
        110
                NaN
                NaN
             NaN
    
    
      18
       2426
       2014-08-16T07:57:01.000Z
       5.005518e+17
             765
        183
         11883
        655
                NaN
                NaN
             NaN
    
    
      19
       2737
       2014-08-16T09:34:19.000Z
       5.005763e+17
        14606079
        270
        326988
        206
                NaN
       4.997748e+17
        24500377



In [20]:

    
df.shape









    Out[20]:





(1819587, 10)



In [6]:

    
aug_red_all = pd.read_csv("/home/data/aug_reduced_all.csv")



In [4]:

    
en = aug_red_all[aug_red_all["lang"] == "en"]



In [7]:

    
user_tweets = pd.DataFrame(en["user.screen_name"].value_counts())
user_tweets.reset_index(inplace=True)
user_tweets.columns = ["screen_name", "tweets"]



In [9]:

    
user_tweets.describe()









    Out[9]:






  
    
      
      tweets
    
  
  
    
      count
       114894.000000
    
    
      mean
           13.550977
    
    
      std
           25.746768
    
    
      min
            1.000000
    
    
      25%
            4.000000
    
    
      50%
            7.000000
    
    
      75%
           13.000000
    
    
      max
          788.000000



In [8]:

    
cutoff = user_tweets[user_tweets["tweets"] > 10]

plt.figure(figsize=(20, 4))
plt.hist(cutoff["tweets"], bins=100, bottom=1)
plt.xlim(10, cutoff["tweets"].max())
plt.title("Distribution of Tweets by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Tweets")

plt.semilogy()









    Out[8]:





[]



In [10]:

    
# cutoff = 10

cutoff = user_tweets[user_tweets["tweets"] >= 325]

x = np.arange(len(cutoff))

plt.figure(figsize=(20, 4))
plt.bar(x, cutoff["tweets"])
plt.xticks(x, cutoff["screen_name"], rotation=90, fontsize=10)
plt.title("Top Tweeters")
plt.xlabel("Username")
plt.ylabel("Number of Tweets")
plt.show()



In [11]:

    
by_user_date = en.sort(["user.screen_name", "_iso_created_at_x"], ascending=[1,0])



In [12]:

    
fr_fl = pd.concat([by_user_date["_iso_created_at_x"], by_user_date["user.screen_name"], by_user_date["user.friends_count"], by_user_date["user.followers_count"]], axis=1)
fr_fl.head()









    Out[12]:






  
    
      
      _iso_created_at_x
      user.screen_name
      user.friends_count
      user.followers_count
    
  
  
    
      5135530
       2014-08-16T06:34:18.000Z
       000120o
       444
       1154
    
    
      4824496
       2014-08-15T12:27:08.000Z
       0003orB
       504
        117
    
    
      4754278
       2014-08-15T02:48:05.000Z
       0003orB
       504
        123
    
    
      4434305
       2014-08-14T05:56:40.000Z
       0003orB
       504
        117
    
    
      4391914
       2014-08-14T05:44:57.000Z
       0003orB
       504
        117



In [13]:

    
total_fr_fl = fr_fl.groupby(['user.screen_name'], as_index=False).first()



In [16]:

    
total_fr_fl.drop("_iso_created_at_x", axis=1, inplace=True)



In [24]:

    
total_fr_fl.columns = ["screen_name", "friends_count", "followers_count"]



In [25]:

    
total_fr_fl.head()









    Out[25]:






  
    
      
      screen_name
      friends_count
      followers_count
    
  
  
    
      0
               000120o
        444
       1154
    
    
      1
               0003orB
        504
        117
    
    
      2
          000Dillon000
       2229
       2181
    
    
      3
       000RowanPark000
       1939
       1490
    
    
      4
         007JamesBong_
        560
        453



In [26]:

    
plt.figure(figsize=(20, 4))
plt.hist(total_fr_fl["friends_count"], bins=100, bottom=1)
plt.xlim(0, total_fr_fl["friends_count"].max())
plt.title("Distribution of Friends by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Friends")
plt.semilogy()









    Out[26]:





[]



In [27]:

    
plt.figure(figsize=(20, 4))
plt.hist(total_fr_fl["followers_count"], bins=100, bottom=1)
plt.xlim(0, total_fr_fl["followers_count"].max())
plt.title("Distribution of Followers by Factors of 10")
plt.ylabel("Number of Users in Powers of 10")
plt.xlabel("Number of Followers")
plt.semilogy()









    Out[27]:





[]



In [41]:

    
total_fr_fl.sort("friends_count", ascending=False)[:25]









    Out[41]:






  
    
      
      screen_name
      friends_count
      followers_count
    
  
  
    
      40627 
          NesterTweets
       845485
       770631
    
    
      45925 
          RWSurferGirl
       404383
       506159
    
    
      109487
               thehill
       383821
       505640
    
    
      7504  
         BlackieCanela
       380215
       381469
    
    
      55588 
       TheEyeOfControl
       308201
       295031
    
    
      114604
              zaibatsu
       279289
       505502
    
    
      57947 
       TrueBornRecords
       245904
       275041
    
    
      4454  
               AppSame
       201048
       978630
    
    
      53388 
       SupermanHotMale
       185552
       208926
    
    
      77430 
           egomezislas
       184240
       461563
    
    
      53943 
              TIMENOUT
       177518
       175824
    
    
      27007 
       JeffreyGuterman
       175996
       171188
    
    
      65707 
               alhanda
       174636
       176647
    
    
      47237 
            RevezNexus
       166804
       179400
    
    
      103246
          rowdytukgoon
       162351
       147928
    
    
      59997 
       WashingtonDCTea
       161736
       165432
    
    
      27815 
           JoeyCutless
       158837
       169837
    
    
      81226 
         greensboro_nc
       157899
       189009
    
    
      41482 
         NormanBuffong
       154081
       460424
    
    
      44550 
           Politics_PR
       152864
       169513
    
    
      21504 
        GoodMenProject
       150478
       148332
    
    
      61623 
             Yubbie007
       146561
       148158
    
    
      82888 
          iAmNateJames
       145441
       210947
    
    
      71789 
               ceoMARS
       141651
       237733
    
    
      82883 
         iAmMasonJames
       139454
       208222

	_iso_created_at	id	user.id	retweeted_status.user.id	retweeted_status.favorite_count	retweeted_status.favourities_count	retweeted_status.user.id.1	retweeted_status.user.followers_count	retweeted_status.user.friends_count	in_reply_to_status_id	retweeted_status.in_reply_to_status_id	retweeted_status.in_reply_to_user_id
0	2014-08-10T22:45:46.000Z	4.986011e+17	41712041	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2014-08-10T22:45:54.000Z	4.986012e+17	1928268895	NaN	NaN	NaN	NaN	NaN	NaN	4.985969e+17	NaN	NaN
2	2014-08-10T22:46:20.000Z	4.986013e+17	462427721	53019309	56	NaN	53019309	23600	17179	NaN	NaN	NaN
3	2014-08-10T22:46:33.000Z	4.986013e+17	1952105298	51330975	506	NaN	51330975	4581	2448	NaN	NaN	NaN
4	2014-08-10T22:47:26.000Z	4.986015e+17	355918910	14090948	12	NaN	14090948	121280	1332	NaN	NaN	NaN

	created	tweet_id	user_id	rt_user_id	rt_favorite_ct	rt_user_followers_ct	rt_user_friends_ct	rp_tweet_id	rt_rp_tweet_id	rt_rp_user_id
1770670	2014-08-16T21:22:41.000Z	5.007545e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1702365	2014-08-16T18:05:41.000Z	5.007050e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1510856	2014-08-16T02:37:57.000Z	5.004715e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1501290	2014-08-16T01:52:26.000Z	5.004600e+17	12	13393052	2552	216681	390	NaN	NaN	NaN
1501059	2014-08-16T01:51:20.000Z	5.004598e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1495064	2014-08-16T01:24:37.000Z	5.004530e+17	12	14090948	123	121075	1329	NaN	5.004504e+17	163570705
1494060	2014-08-16T01:19:52.000Z	5.004518e+17	12	58	296	4396	367	NaN	NaN	NaN
1489066	2014-08-16T00:56:16.000Z	5.004459e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1477868	2014-08-16T00:05:05.000Z	5.004330e+17	12	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1439855	2014-08-15T21:16:04.000Z	5.003905e+17	12	22990962	24	34893	614	NaN	NaN	NaN
1221897	2014-08-15T02:14:39.000Z	5.001032e+17	12	89887215	3559	25246	2335	NaN	NaN	NaN
794499	2014-08-14T04:25:34.000Z	4.997738e+17	12	16253142	234	16503	2815	NaN	NaN	NaN
1669639	2014-08-16T15:47:41.000Z	5.006702e+17	57	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1630450	2014-08-16T13:05:25.000Z	5.006294e+17	57	119010935	10	3514	812	NaN	NaN	NaN
1627355	2014-08-16T12:47:18.000Z	5.006248e+17	57	65277959	27	5778	1143	NaN	NaN	NaN
1626731	2014-08-16T12:43:26.000Z	5.006239e+17	57	16326882	30	31627	1187	NaN	NaN	NaN
1423677	2014-08-15T20:28:38.000Z	5.003786e+17	57	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1420373	2014-08-15T20:20:19.000Z	5.003765e+17	57	14173315	152	1936978	2894	NaN	NaN	NaN
1275620	2014-08-15T03:58:57.000Z	5.001295e+17	57	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1267157	2014-08-15T03:38:11.000Z	5.001243e+17	57	36401728	26	19481	2342	NaN	NaN	NaN

	tweets
count	114894.000000
mean	13.550977
std	25.746768
min	1.000000
25%	4.000000
50%	7.000000
75%	13.000000
max	788.000000

	_iso_created_at_x	user.screen_name	user.friends_count	user.followers_count
5135530	2014-08-16T06:34:18.000Z	000120o	444	1154
4824496	2014-08-15T12:27:08.000Z	0003orB	504	117
4754278	2014-08-15T02:48:05.000Z	0003orB	504	123
4434305	2014-08-14T05:56:40.000Z	0003orB	504	117
4391914	2014-08-14T05:44:57.000Z	0003orB	504	117

	screen_name	friends_count	followers_count
40627	NesterTweets	845485	770631
45925	RWSurferGirl	404383	506159
109487	thehill	383821	505640
7504	BlackieCanela	380215	381469
55588	TheEyeOfControl	308201	295031
114604	zaibatsu	279289	505502
57947	TrueBornRecords	245904	275041
4454	AppSame	201048	978630
53388	SupermanHotMale	185552	208926
77430	egomezislas	184240	461563
53943	TIMENOUT	177518	175824
27007	JeffreyGuterman	175996	171188
65707	alhanda	174636	176647
47237	RevezNexus	166804	179400
103246	rowdytukgoon	162351	147928
59997	WashingtonDCTea	161736	165432
27815	JoeyCutless	158837	169837
81226	greensboro_nc	157899	189009
41482	NormanBuffong	154081	460424
44550	Politics_PR	152864	169513
21504	GoodMenProject	150478	148332
61623	Yubbie007	146561	148158
82888	iAmNateJames	145441	210947
71789	ceoMARS	141651	237733
82883	iAmMasonJames	139454	208222