In [2]:
%matplotlib inline
#import envoy
import json
import pymongo
from bson import json_util # From pymongo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import twitter
import networkx as nx
# Confirms that every import above succeeded. The parenthesised
# single-argument form prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("imported")
In [4]:
# Read the hashtag CSV (path is relative to the notebook's working directory)
# into a DataFrame and preview its first five rows.
htgs = pd.read_csv(filepath_or_buffer="../data/ferguson_hashtags.csv")
htgs.head(5)
Out[4]:
In [5]:
# Rename the 13 columns: three leading fields followed by ten columns named
# h1..h10 (presumably one hashtag slot each -- confirm against the CSV).
htgs.columns = ["id", "_iso_created_at", "user.screen_name"] + list(
    "h%d" % slot for slot in range(1, 11)
)
# Preview the first ten rows under the new column names.
htgs.head(10)
Out[5]:
In [7]:
# Flatten columns h1..h10 into a single 1-D array of hashtag values.
#
# np.append returns a NEW array and never modifies its first argument, so
# calling it in a loop without keeping the result leaves the array holding
# only the h1 values. The per-column pieces are therefore gathered and joined
# with a single np.concatenate call.
#
# Missing values are dropped from every column (not just h1) so that NaN is
# not later counted as if it were a hashtag.
hashtag_columns = htgs.loc[:, "h1":"h10"].columns
all_htgs = np.concatenate(
    [np.array(htgs[column].dropna()) for column in hashtag_columns]
)
In [9]:
# Counter pattern built on defaultdict, as described at
# http://evanmuehlhausen.com/simple-counters-in-python-with-benchmarks/
from collections import defaultdict

# Maps each value found in all_htgs to the number of times it occurs there.
tags = defaultdict(int)

# Initialised to zero and not updated anywhere in this cell. An earlier,
# commented-out variant of this cell incremented it while tallying original
# and retweeted tags separately.
original_count = 0

# Tally every entry of all_htgs; a key seen for the first time starts at 0.
for hashtag in all_htgs:
    tags[hashtag] = tags[hashtag] + 1

print("Go on...")
In [12]:
# Sorting a dict by value, see:
# http://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator

# (hashtag, count) pairs ordered from highest count to lowest: sort ascending
# by count, then flip the whole list (this also flips the order of ties).
sorted_tags = sorted(tags.items(), key=operator.itemgetter(1))[::-1]

# Print the entries at positions 45..99 of the ranking, one per line, in the
# form "<hashtag> :  <count>".
for name, count in sorted_tags[45:100]:
    print("%s %s %s" % (name, ": ", count))
In [27]:
# Tabulate the (hashtag, count) pairs and order the rows by ascending count.
#
# DataFrame.sort was deprecated in pandas 0.17 and later removed;
# sort_values is its replacement (requires pandas >= 0.17). The sorted frame
# is assigned directly rather than sorted in place, so re-running this cell
# always yields the same result.
hashtags = pd.DataFrame(sorted_tags, columns=["hashtag", "num"]).sort_values("num")
hashtags.head()
Out[27]:
In [28]:
# Keep the rows whose count exceeds 2500, leaving out the two spellings
# "Ferguson" and "ferguson" of the headline tag itself.
cutoff = hashtags[
    ~hashtags["hashtag"].isin(["Ferguson", "ferguson"]) & (hashtags["num"] > 2500)
]
# Preview with a fresh 0-based index; cutoff itself keeps its original index.
cutoff.reset_index(drop=True).head()
Out[28]:
In [30]:
# Hashtag names of the filtered rows as a NumPy array, in cutoff's row order.
h = np.array(cutoff.loc[:, "hashtag"])
In [31]:
# Horizontal bar chart of the usage count of each hashtag in cutoff, one bar
# per row, labelled with the hashtag names held in h.
fig, ax = plt.subplots(figsize=(5, 20))
y_pos = np.arange(len(cutoff))
# align="center" is given explicitly: Matplotlib releases before 2.0 anchored
# bars at their edge by default, which put the tick labels at the bottom of
# each bar instead of at its middle.
ax.barh(y_pos, cutoff["num"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(h)
ax.set_xlabel('Amount')  # spelling corrected from 'Ammount'
ax.set_ylabel('Hashtags')
ax.set_title('Overall Hashtag Usage After #Ferguson')
plt.show()