In [1]:
import pandas as pd
import json,csv,re,os,sys,glob,dateutil,collections,operator,time,traceback
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import mpld3
# mpl.rcParams['axes.titlesize'] = 18
# mpl.rcParams['axes.labelsize'] = 14
# mpl.rcParams['legend.fontsize'] = 12
%matplotlib inline
In [2]:
import sys
sys.path.append('/mnt/home/ubuntu/projects/tools/')
from gp_colours import *
In [3]:
# Get emojis
stem=''

def _loadEmojiPatterns(fileNames):
    """Read emoji symbols from tab-delimited files and return them regex-escaped.

    For every non-empty row the first column is taken as the symbol, decoded
    from UTF-8 and passed through re.escape, so every regex metacharacter
    (including * ^ ? + $ [ ] and the backslash) is matched literally.
    Files are read in the order given; each is closed once it has been read.
    """
    patterns=[]
    for fileName in fileNames:
        with open(stem+fileName,'r') as emojiFile:
            for line in csv.reader(emojiFile,delimiter='\t'):
                if len(line)>0:
                    patterns.append(re.escape(line[0].decode('utf-8')))
    return patterns

# Both lists are now escaped by the same rule
posEmojiSymbols=_loadEmojiPatterns(['pos_emojis.txt','pos_emojis_unicode.txt'])
negEmojiSymbols=_loadEmojiPatterns(['neg_emojis.txt','neg_emojis_unicode.txt'])
In [4]:
#plotColours=[gpBlue,gpDarkBlue,gpRed,'black']
#plotColours=['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff53','#a65628','#f781bf','#9999f9']
basePalette=['#1B6E44','#6D1431','#FF5500','#5D6263','#009D97','#c84699','#71be45','#3F8CBC','#FFC200']
# We need a whole bunch of colours, so repeat them
plotColours=basePalette+basePalette

def colours():
    """Generate the entries of plotColours one at a time, in list order."""
    for colour in plotColours:
        yield colour

# Give me a new colour, one at a time
coloursClass=colours()
In [ ]:
# Run count_stuff_RCN.py with the system python through IPython's shell escape.
# NOTE(review): nothing visible in this notebook shows what the script produces
# or whether later cells depend on it - confirm before removing.
!python count_stuff_RCN.py;
In [6]:
# Evaluate dateList; the trailing ';' suppresses the cell output.
# NOTE(review): the only visible assignment to dateList is the pickle.load in
# the DUMP_sentiments.dat cell further down, so that cell has to run first.
dateList;
In [8]:
reducedSeriesPos=None
reducedSeriesNeg=None
reducedSeriesNet=None
reducedSeriesTotal=None
reducedSeriesPosEmojis=None
reducedSeriesNegEmojis=None
totalSeriesPositive=None
totalSeriesNegative=None
totalSeriesNet=None
totalSeriesTotal=None
totalSeriesPosEmojis=None
totalSeriesNegEmojis=None
# Keep hourly/daily series sentiments
sentimentCounter=collections.defaultdict(int)
sentimentHist=np.zeros(shape=(41,len(dateList)))
accountCounter=collections.defaultdict(int)
nError=0
nTotal=0
for dir in glob.glob('../data/2014-0[0-9]'):
print dir
for f in glob.glob(dir+'/Data*json'):
times=[]
positives=[]
negatives=[]
posEmojis=[]
negEmojis=[]
net=[]
totals=[]
# Keep times and sentiment in memory
# Only one file at a time
for newline in open(f,'r').read().decode('utf-8').split('\n'):
tweet=json.loads(newline)
nTotal+=1
try:
accountCounter[tweet['interaction']['author']['username']]+=1
timeCreated=dateutil.parser.parse(tweet['interaction']['created_at'])
tweetContent=tweet['interaction']['content'].encode('utf-8')
sentiment=tweet['interaction']['tag_tree']['sentiment'].keys()[0]
sentimentValue=int(tweet['interaction']['tag_tree']['sentiment'].values()[0][0])
times.append(timeCreated)
totals.append(1)
net.append(sentimentValue)
sentimentCounter[sentimentValue]+=1
sentimentHist[sentimentValue+20][dateList.index(timeCreated.date())]+=1
'''
if re.search(r':-\)|:\)',tweetContent):
posEmojis.append(1)
else:
posEmojis.append(0)
if re.search(r':-\(|:\(',tweetContent):
negEmojis.append(1)
else:
negEmojis.append(0)
'''
negEmojis.append(0)
posEmojis.append(0)
if sentiment=='Positive':
positives.append(1)
# positives.append(sentimentValue)
negatives.append(np.nan)
elif sentiment=='Negative':
positives.append(np.nan)
negatives.append(1)
# negatives.append(-1.0*sentimentValue)
elif sentiment=='Neutral':
positives.append(0)
negatives.append(0)
except:
# print traceback.print_exc()
# time.sleep(100)
nError+=1
# for t,p in zip(times,positives):
# print t,p
# time.sleep(10000)
tempDf=pd.DataFrame(data={'positive':positives,'negative':negatives,'total':totals,'net':net,'posEmojis':posEmojis,'negEmojis':negEmojis},index=times)
# Make a dataframe with contents of this file
if not type(totalSeriesPositive)==pd.Series:
totalSeriesPositive=tempDf.resample('D',how='sum')['positive']
totalSeriesNegative=tempDf.resample('D',how='sum')['negative']
totalSeriesPosEmojis=tempDf.resample('D',how='mean')['posEmojis']
totalSeriesNegEmojis=tempDf.resample('D',how='mean')['negEmojis']
totalSeriesNet=tempDf.resample('D',how='mean')['net']
totalSeriesTotal=tempDf.resample('D',how='sum')['total']
# print totalSeriesNegative
# time.sleep(100)
# First time through, add downsampled series
else:
totalSeriesPositive=totalSeriesPositive.add(tempDf.resample('D',how='sum')['positive'],fill_value=0)
totalSeriesNegative=totalSeriesNegative.add(tempDf.resample('D',how='sum')['negative'],fill_value=0)
totalSeriesPosEmojis=totalSeriesPosEmojis.add(tempDf.resample('D',how='sum')['posEmojis'],fill_value=0)
totalSeriesNegEmojis=totalSeriesNegEmojis.add(tempDf.resample('D',how='sum')['negEmojis'],fill_value=0)
totalSeriesNet=totalSeriesNet.add(tempDf.resample('D',how='mean')['net'],fill_value=0)
totalSeriesTotal=totalSeriesTotal.add(tempDf.resample('D',how='sum')['total'],fill_value=0)
print 'ERRORS',nError,'TOTAL',nTotal
In [31]:
# Heat map of the log of the sentiment histogram; the 0.001 offset keeps
# log() finite for bins that are still zero
logHist=np.log(sentimentHist+0.001)
plt.imshow(logHist,interpolation='none')
#plt.colorbar()
Out[31]:
In [ ]:
for n in range(40):print n-20,np.max(sentimentHist[n])
In [24]:
# Positive emojis above the axis, negative emojis mirrored below it, and the
# total volume divided by 1000 for comparison
negEmojisMirrored=-1.0*totalSeriesNegEmojis
totalScaled=totalSeriesTotal/1000.0
totalSeriesPosEmojis.plot(label='Positive',legend=True)
negEmojisMirrored.plot(label='Negative',legend=True)
totalScaled.plot(label='Total (x1000)',legend=True)
# Dashed vertical lines at the day each emoji series peaks
posPeakDay=totalSeriesPosEmojis.idxmax()
negPeakDay=totalSeriesNegEmojis.idxmax()
plt.axvline(posPeakDay,linestyle='--',color='b')
plt.axvline(negPeakDay,linestyle='--',color='g')
plt.savefig('../web/charts/PLOT_Sentiment.png',dpi=60)
In [34]:
print totalSeriesPosEmojis.idxmax() # Most positive emojis day
print totalSeriesNegEmojis.idxmax() # Most negative emojis day
In [15]:
def _standardise(series):
    """Return the series with its mean subtracted, divided by its standard deviation."""
    centred=series-series.mean()
    return centred/series.std()

totalSeriesNegEmojisNorm=_standardise(totalSeriesNegEmojis)
totalSeriesPosEmojisNorm=_standardise(totalSeriesPosEmojis)
In [16]:
# Overlay the two standardised emoji series on the same axes
normAxes=totalSeriesNegEmojisNorm.plot(label='Neg_norm',legend=True)
totalSeriesPosEmojisNorm.plot(ax=normAxes,label='Pos_norm',legend=True)
Out[16]:
In [20]:
# 22/4 Earth Day
# 5/6 World Environment Day
# ----------------
# 14/4 climate report https://twitter.com/Independent/status/455612789805944832/photo/1
# 17/5 Arsenal trophy drought jokes
# 20/6 Bit unclear
# 17/7 Australia repeals carbon tax law
In [19]:
print totalSeriesPosEmojisNorm[totalSeriesPosEmojisNorm.abs()>2]
print '-------------'
print totalSeriesNegEmojisNorm[totalSeriesNegEmojisNorm.abs()>2]
In [36]:
print totalSeriesPosEmojis[totalSeriesPosEmojis>80]
In [33]:
# Number of tweets per sentiment score, scores in ascending order, log y-axis
scoreCounts=sorted(sentimentCounter.iteritems(),key=operator.itemgetter(0))
scores=[score for score,count in scoreCounts]
counts=[count for score,count in scoreCounts]
plt.semilogy(scores,counts)
plt.savefig('dist.png',dpi=400)
In [17]:
print totalSeriesPositive.idxmax() # Most number positive day
print totalSeriesNegative.idxmax() # Most number negative day
print totalSeriesTotal.idxmax() # Most everything day
print totalSeriesNet.idxmax() # Most overall positive day
print totalSeriesNet.idxmin() # Most overall negative day
In [37]:
# Net sentiment alongside the total volume divided by 1000
totalSeriesNet.plot(label='Net',legend=True)
# Divide by a float, as the other volume plots in this notebook do: under
# Python 2 an integer-typed series divided by the int 1000 would be truncated
(totalSeriesTotal/1000.0).plot(label='Total (x1000)',legend=True)
plt.savefig('net_sentiment.png',dpi=400)
In [39]:
# Show the days on which the net series is above 50
totalSeriesNet[totalSeriesNet>50]
# Where series goes above 50
Out[39]:
In [42]:
# Show the days on which the net series is below -100
totalSeriesNet[totalSeriesNet<-100]
# Where series goes below -100
Out[42]:
In [32]:
# Positive volume above the axis, negative volume mirrored below it, and the
# total volume divided by 1000
negativeMirrored=-1.0*totalSeriesNegative
volumeScaled=totalSeriesTotal/1000.0
totalSeriesPositive.plot(label='Vol Pos',legend=True)
negativeMirrored.plot(label='Vol Neg',legend=True)
volumeScaled.plot(label='Volume',legend=True)
plt.savefig('number.png',dpi=400)
In [5]:
import pickle
# True: write the results currently in memory to DUMP_sentiments.dat.
# False: restore them from that file instead.
writeDump=False
# The objects are pickled one after another, so the load order below must
# match the dump order exactly.
if writeDump:
    # Binary mode is the portable way to store a pickle stream; 'with' closes
    # the file even if one of the dumps raises
    with open('DUMP_sentiments.dat','wb') as dumpFile:
        pickle.dump(totalSeriesPositive,dumpFile)
        pickle.dump(totalSeriesNegative,dumpFile)
        pickle.dump(totalSeriesTotal,dumpFile)
        pickle.dump(totalSeriesNet,dumpFile)
        pickle.dump(totalSeriesPosEmojis,dumpFile)
        pickle.dump(totalSeriesNegEmojis,dumpFile)
        pickle.dump(sentimentCounter,dumpFile)
        pickle.dump(dateList,dumpFile)
        pickle.dump(sentimentHist,dumpFile)
else:
    # WARNING: pickle.load can execute arbitrary code - only load a dump file
    # that this notebook wrote itself
    with open('DUMP_sentiments.dat','rb') as dumpFile:
        totalSeriesPositive=pickle.load(dumpFile)
        totalSeriesNegative=pickle.load(dumpFile)
        totalSeriesTotal=pickle.load(dumpFile)
        totalSeriesNet=pickle.load(dumpFile)
        totalSeriesPosEmojis=pickle.load(dumpFile)
        totalSeriesNegEmojis=pickle.load(dumpFile)
        sentimentCounter=pickle.load(dumpFile)
        dateList=pickle.load(dumpFile)
        sentimentHist=pickle.load(dumpFile)
In [9]:
# Save the per-account tweet counts. The file is opened for writing in binary
# mode and closed by 'with' even if the dump raises.
with open('DUMP_accounts.dat','wb') as dumpFile:
    pickle.dump(accountCounter,dumpFile)
In [1]:
from IPython.core.display import HTML
# Read the stylesheet inside 'with' so the file handle is closed straight away
with open("../css/custom.css", "r") as cssFile:
    styles = cssFile.read()
# Last expression of the cell: display the file contents as HTML
HTML(styles)
Out[1]:
In [ ]: