In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pickle
import boto
import pandas as pd
import seaborn as sns
import mpld3
from bokeh import *
from ggplot import *
%matplotlib inline
In [4]:
# %%timeit
# import pickle
# inFile=open('DUMP_stuff.dat','r')
# mentionCounter=pickle.load(inFile)
# hashTagCounter=pickle.load(inFile)
# taggedHashTagCounter=pickle.load(inFile)
# domainCounter=pickle.load(inFile)
# topicCounter=pickle.load(inFile)
# timeCounter=pickle.load(inFile)
# rawDomainCounter=pickle.load(inFile)
In [2]:
# cPickle currently approx 3 times faster than pickle
import cPickle
inFile=open('DUMP_stuff.dat','r')
mentionCounter=cPickle.load(inFile)
hashTagCounter=cPickle.load(inFile)
domainCounter=cPickle.load(inFile)
topicCounter=cPickle.load(inFile)
timeCounter=cPickle.load(inFile)
rawDomainCounter=cPickle.load(inFile)
In [3]:
mentionCounterFrame = pd.DataFrame(mentionCounter)
hashTagCounterFrame= pd.DataFrame(hashTagCounter)
domainCounterFrame= pd.DataFrame(domainCounter)
topicCounterFrame= pd.DataFrame(topicCounter)
timeCounterFrame= pd.DataFrame(timeCounter)
rawDomainCounterFrame= pd.DataFrame(rawDomainCounter)
In [5]:
mentionCounterFrame.columns = ['Account', 'Number of tweets']
hashTagCounterFrame.columns = ['Hashtag', 'Number of tweets']
domainCounterFrame.columns = ['Domain', 'Number of tweets']
topicCounterFrame.columns = ['Topic', 'Number of tweets']
timeCounterFrame.columns = ['Time', 'Number of tweets']
rawDomainCounterFrame.columns = ['URL', 'Number of tweets']
In [6]:
rawDomainCounterFrame = rawDomainCounterFrame.replace(to_replace='', value=np.nan)
rawDomainCounterFrame = rawDomainCounterFrame.dropna()
#rawDomainCounterFrame = rawDomainCounterFrame[pd.notnull(rawDomainCounterFrame['URL'])]
In [7]:
rawDomainCounterFrame.sort(columns='Number of tweets', ascending=False, inplace=True)
In [8]:
rawDomainCounterFrame.head()
Out[8]:
In [9]:
from matplotlib.ticker import ScalarFormatter
formatter = ScalarFormatter()
formatter.set_scientific(False)
In [10]:
sns.set_context("poster")
sns.set_style("darkgrid", {'font.size': 14, 'axes.labelsize': 16, 'legend.fontsize': 14.0, 'axes.titlesize': 12, 'xtick.labelsize': 14,
'ytick.labelsize': 14})
In [11]:
fig, ax = plt.subplots()
hashTagCounter.reverse()
ax.barh(range(10),[v[1] for v in hashTagCounter[-10:]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(10)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels(['#'+v[0] for v in hashTagCounter[-10:]]);
plt.savefig('../web/charts/hashtags.png', bbox_inches='tight',dpi=200)
In [14]:
fig, ax = plt.subplots()
rawDomainCounter.reverse()
ax.barh(range(10),[v[1] for v in rawDomainCounter[-12:-2]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(10)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels([v[0] for v in rawDomainCounter[-12:-2]]);
plt.savefig('../web/charts/rawdomains.png', bbox_inches='tight',dpi=200)
In [16]:
rawDomainFig, ax = plt.subplots()
rawDomainCounter.reverse()
ax.barh(range(10),[v[1] for v in rawDomainCounter[-12:-2]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(10)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels([v[0] for v in rawDomainCounter[-12:-2]]);
plt.savefig('../web/charts/rawdomains.png', bbox_inches='tight',dpi=200)
mpld3.save_html(rawDomainFig, '../charts/rawdomains.php', figid="taggedHashtagsFig")
mpld3.display(rawDomainFig)
Out[16]:
In [17]:
fig, ax = plt.subplots()
domainCounter.reverse()
ax.barh(range(10),[v[1] for v in domainCounter[-10:]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(10)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels([v[0] for v in domainCounter[-10:]]);
plt.savefig('../web/charts/domains.png', bbox_inches='tight',dpi=200)
In [18]:
topicCounter
Out[18]:
In [19]:
len(topicCounter[-10:])
range(9)
[v[1] for v in topicCounter[-10:]]
Out[19]:
In [30]:
fig, ax = plt.subplots()
topicCounter.reverse()
ax.barh(range(4),[v[1] for v in topicCounter[-4:]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(4)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels([v[0] for v in topicCounter[-4:]]);
plt.savefig('../web/charts/topics.png', bbox_inches='tight',dpi=200)
In [32]:
sns.set_context("poster")
sns.despine()
sns.set(style="whitegrid")
sns.barplot(topicCounterFrame["Topic"],topicCounterFrame["Number of tweets"], color="#00aeef")
plt.savefig('../charts/topics_seaborn.png')
In [33]:
# ggplot
ggplot(aes(x="Topic", weight="Number of tweets"), topicCounterFrame) + geom_bar(fill='#00aeef')
Out[33]:
In [34]:
fig, ax = plt.subplots()
mentionCounter.reverse()
ax.barh(range(10),[v[1] for v in mentionCounter[-10:]],log=False,linewidth=0,alpha=0.7,color="#00aeef")
ax.set_axis_bgcolor('#efefef')
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.set_yticks([i+0.5 for i in range(10)])
ax.set_xlabel('Number of Tweets')
ax.set_yticklabels(['@'+v[0] for v in mentionCounter[-10:]]);
plt.savefig('../web/charts/mentions.png', bbox_inches='tight',dpi=200)
In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)
Out[1]:
In [ ]: