In [2]:
%matplotlib inline

#import envoy 
import json
import pymongo 
from bson import json_util # From  pymongo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import twitter
import networkx as nx
print "imported"


imported

In [4]:
htgs = pd.read_csv("../data/ferguson_hashtags.csv")
htgs.head()


/home/ubuntu/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1130: DtypeWarning: Columns (10,11,12) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
Out[4]:
id _iso_created_at user.screen_name entities.hashtags.0.text entities.hashtags.1.text entities.hashtags.2.text entities.hashtags.3.text entities.hashtags.4.text entities.hashtags.5.text entities.hashtags.6.text entities.hashtags.7.text entities.hashtags.8.text entities.hashtags.9.text
0 5.343279e+17 2014-11-17T12:51:02.000Z AmirahOna NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 5.343279e+17 2014-11-17T12:51:04.000Z NewsyBarbara NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN { "$date" : { "$numberLong" : "-92233720368547... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 5.343279e+17 2014-11-17T12:51:12.000Z muhdfikhri_ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5.343279e+17 2014-11-17T12:51:14.000Z MykeBusch Ferguson NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [5]:
htgs.columns = ["id", "_iso_created_at", "user.screen_name", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9", "h10" ]
htgs[:10]


Out[5]:
id _iso_created_at user.screen_name h1 h2 h3 h4 h5 h6 h7 h8 h9 h10
0 5.343279e+17 2014-11-17T12:51:02.000Z AmirahOna NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 5.343279e+17 2014-11-17T12:51:04.000Z NewsyBarbara NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN { "$date" : { "$numberLong" : "-92233720368547... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 5.343279e+17 2014-11-17T12:51:12.000Z muhdfikhri_ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5.343279e+17 2014-11-17T12:51:14.000Z MykeBusch Ferguson NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 5.343279e+17 2014-11-17T12:51:15.000Z alexvdl0 Ferguson NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 5.343279e+17 2014-11-17T12:51:18.000Z NubianQueenIAm MichaelBrown NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 5.343279e+17 2014-11-17T12:51:18.000Z YourAnonGlobal Anonymous Ferguson OpKKK NaN NaN NaN NaN NaN NaN NaN
8 5.343279e+17 2014-11-17T12:51:19.000Z syukrimanutd NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 5.343279e+17 2014-11-17T12:51:23.000Z maverckmr NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [7]:
# htgs[["h1", "h2"]].head()
# htgs.loc[:, "h1":"h10"].head()

all_htgs = np.array((htgs["h1"].dropna()))

for column in htgs.loc[:, "h2":"h10"].columns:
    np.append(all_htgs, htgs[column], axis=0)

# all_htgs.append(htgs[column])

# all_htgs.head()
# all_htgs[:10]

In [9]:
# default dict info from http://evanmuehlhausen.com/simple-counters-in-python-with-benchmarks/
from collections import defaultdict

# orginal recipe tags
tags = defaultdict(int)

# dict for retweet tags counts
# rtags = defaultdict(int)

# counter for the number of orginal tweets from thie user
original_count = 0

# go through tweets of the user
for htg in all_htgs:
    tags[htg] +=1
    # get the tags
#     tags = [hashtags["text"] for hashtags in tweet["entities"]["hashtags"]]
    
    # update the dict that is keeping count of the tag fq.
#     for tag in tags:
#         if "retweeted_status" in tweet:
#             rtags[tag] += 1
#         else:
#             otags[tag] += 1
#             original_count += 1

print "Go on..."


Go on...

In [12]:
# operator info: http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator

sorted_tags = sorted(tags.items(), key = operator.itemgetter(1))

sorted_tags.reverse()

for tag in sorted_tags[45:100]:
    print tag[0], ": ", tag[1]


MUFC :  7197
FoxNews :  7149
EEUU :  6935
AA :  6448
soracist :  6403
HandsUpWalkOut :  6352
TamirRice :  6325
Rams :  6281
Boston :  6245
dontsellshots :  6093
BreakingNews :  6085
Chicago :  5973
JusticeForZemir :  5848
FergusonProud :  5724
stoptheparade :  5584
Yeremiito21 :  5440
RIPMikeBrown :  5424
JusticeforMichael :  5359
WhitePrivilege :  5295
NYPD :  5146
inners :  5118
FergusonShooting :  5057
LAPD :  5041
Ayotzinapa :  5028
PoliceState :  4911
android :  4904
mufc :  4845
opKKK :  4777
TIMEPOY :  4730
stl :  4704
StLouis :  4637
supportdarrenwilson :  4551
HandsUp :  4508
blacklivesmatter :  4440
LondonToFerguson :  4383
PALESTINE :  4364
Walmart :  4205
ViolenceWillNotBeTolerated :  4199
Breaking :  4180
WhereisJustice :  4125
ICantBreathe :  4026
oakland :  3942
Portland :  3939
Police :  3937
Iraq :  3920
Shaw :  3847
America :  3834
MLK :  3783
Justice :  3742
TCOT :  3730
police :  3685
obstinate :  3674
usa :  3552
LosAngeles :  3543
RunForJustice :  3514

In [27]:
hashtags = pd.DataFrame(sorted_tags, columns = ["hashtag", "num"])
hashtags.sort("num", inplace=True)
hashtags.head()


Out[27]:
hashtag num
86012 VouPraBelieve 1
114678 CurvesBeautiful 1
114679 depeche 1
114680 ThisIsUsParty 1
114681 ImMad 1

In [28]:
cutoff = hashtags[(hashtags["hashtag"] != "Ferguson") & (hashtags["hashtag"] != "ferguson") & (hashtags["num"] > 2500)]
len(cutoff)
cutoff.reset_index(drop=True).head()


Out[28]:
hashtag num
0 FBI 2516
1 Sharpton 2545
2 unarmed 2558
3 PoliceBrutality 2563
4 Cuba 2605

In [30]:
h = np.array(cutoff["hashtag"])

In [31]:
plt.figure(figsize=(5, 20))

y_pos = np.arange(len(cutoff))
plt.barh(y_pos, cutoff["num"])
plt.yticks(y_pos, h)
plt.xlabel('Ammount')
plt.ylabel('Hashtags')

plt.title('Overall Hashtag Usage After #Ferguson')
plt.show()