In [1]:
# Core analysis stack: pandas for tabular work, matplotlib for plotting,
# plus stdlib helpers (json/csv/re/glob/collections/bisect/...) used to
# parse and rank the tweet stream below. `dateutil` and `fuzzy` are
# third-party packages.
import pandas as pd
import json,csv,re,os,sys,glob,dateutil,collections,operator,time,itertools,fuzzy,difflib,bisect
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
# Bump default font sizes so figures stay readable when embedded elsewhere
mpl.rcParams['axes.titlesize'] = 18
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['legend.fontsize'] = 12
%matplotlib inline
In [2]:
# Make the shared project tools importable.
# NOTE(review): hardcoded absolute path + star import -- consider a
# configurable path and importing the gp* palette names explicitly.
import sys
sys.path.append('/mnt/home/ubuntu/projects/tools/')
from gp_colours import *
In [3]:
# Qualitative palette for all plots (ColorBrewer "Set1"-style).
# The earlier gp_colours-based assignment was dead code (immediately
# overwritten), as was the commented-out colour-generator class; both removed.
# NOTE(review): '#ffff53' and '#9999f9' look like typos of Set1's
# '#ffff33' and '#999999' -- confirm against the intended palette.
plotColours=['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff53','#a65628','#f781bf','#9999f9']
# We need a whole bunch of colours, so cycle through the palette forever
colors=itertools.cycle(plotColours)
In [4]:
# Tag-tree topics used to bucket tweets throughout the notebook
topics=[u'Discrimination',u'Prevention',u'Campaign',u'Testing']
In [99]:
nTotal=0
nErrors=0
nTopicErrors=0
nFollowerError=0
nInRange=0
tweetCounter=collections.defaultdict(int)
# Count each tweet in one big bucket
tweetTopicCounter={}
for t in topics:
tweetTopicCounter[t]=collections.defaultdict(int)
# Count each tweet by topic
topFollowers=[(-1,-1) for i in range(10)]
# This is a list, it holds 10 tweet id's with
# most number of followers
currentTopFollower=-1
# Keep track of minimum value in top 10
# Optimises loop
topTopicFollowers={t:[(-1,-1) for i in range(10)] for t in topics}
# Dictionary of lists
currentTopTopicFollowers={t:-1 for t in topics}
# Likewise keep a record of current largest
# follower count
nRetweet=0
for dir in glob.glob('../data/stream'):
print dir
for f in glob.glob(dir+'/Data*json'):
for line in open(f,'r').read().decode('utf-8').split('\n'):
nTotal+=1
isRetweet=False
tweet=json.loads(line)
# tweetTime=dt.datetime.strptime(tweet['interaction']['created_at'],'%a, %d %b %Y %H:%M:%S +0000')
# TODO try twitter.retweeted.created_at first, if not use interaction.created_at
# dayDiff=(tweetTime-dt.datetime.now()).days
# Tue, 20 May 2014 23:57:56 +0000
if True:
# Filter here by language/date
nInRange+=1
try:
try:
id=tweet['twitter']['retweeted']['id'] # TODO: check if this is a bug, should it not be "retweet" (instead of retweeted)?
isRetweet=True
except:
id=tweet['twitter']['id']
content=tweet['interaction']['content'].encode('utf-8').replace('\n',' ')
if re.search(r'\bRT ',content):
nRetweet+=1
print content
tweetCounter[(content,id)]+=1
except:
nErrors+=1
##########################################################
try:
tweetTopics=tweet['interaction']['tag_tree']['topic']
for topic in tweetTopics:
tweetTopicCounter[topic][(content,id)]+=1
except:
tweetTopicCounter[u'None'][(content,id)]+=1
nTopicErrors+=1
# Count tweets by topic
##########################################################
try:
try:
nFollowers=tweet['twitter']['user']['followers_count']
except:
nFollowers=tweet['twitter']['retweet']['user']['followers_count']
if nFollowers>currentTopFollower and not isRetweet:
bisect.insort(topFollowers,(nFollowers,id))
currentTopFollower=topFollowers[0][0]
# Insert tweet to maintain order
# if new number of followers is larger than lowest value
if len(topFollowers)>10:topFollowers=topFollowers[-10:]
# Allow lower values to drop out
for topic in tweetTopics:
if nFollowers>currentTopTopicFollowers[topic] and not isRetweet:
# print 'INSERTING TO',topic
# print nFollowers,currentTopTopicFollowers[topic]
currentTopTopicFollowers[topic]=topTopicFollowers[topic][0][0]
bisect.insort(topTopicFollowers[topic],(nFollowers,id))
if len(topTopicFollowers[topic])>10:topTopicFollowers[topic]=topTopicFollowers[topic][-10:]
# print topTopicFollowers[topic]
# print ''
except:
nFollowerError+=1
# Get tweets with top followers
##########################################################
print '\t',nErrors,nTopicErrors,nFollowerError,nInRange,nTotal
#print 'ERRORS',nErrors
In [98]:
# How many tweets still contained a literal "RT " marker in their text
nRetweet
Out[98]:
In [86]:
# Inspect the per-topic (followers, id) top-10 lists built above
topTopicFollowers
Out[86]:
In [53]:
# Scratch check: dict items can be sorted by the final element of the
# value tuple (the pattern used for the tweet counters below)
test = {'a': (1, 2), 'b': (3, 1)}
sorted(test.iteritems(), key=lambda kv: kv[1][-1])
Out[53]:
In [87]:
# Peek at a handful of (content, id) -> count entries from the big counter
test = tweetCounter.items()[:5]
test
Out[87]:
In [91]:
sortedTweets=sorted(tweetCounter.iteritems(),key=lambda x:x[1])
#sortedTweets.reverse()
for t in sortedTweets[0:20]:
print t[1],'\t',t[0][0],t[0][1]
In [61]:
sortedTweets=sorted(tweetCounter.iteritems(), key=operator.itemgetter(1))
sortedTweets.reverse()
outFile=csv.writer(open('../web/data/top-tweets/portuguese.all.top.retweet','w'),delimiter='\t')
for t in sortedTweets[0:10]:
print t[1],'\t',t[0]
outFile.writerow([t[0][1]])
# Write out top tweet ids for all tweets
In [62]:
outFile=csv.writer(open('../web/data/top-tweets/portuguese.all.top.followers','w'),delimiter='\t')
for t in reversed(topFollowers[0:10]):
outFile.writerow([t[1]])
print t
# Write out top follower count tweet id's for all tweets
In [63]:
test=range(10)
for t in reversed(test[0:10]):print t
In [64]:
for k,v in reversed(topTopicFollowers.items()[0:10]):
print k
if not k=='None':
fileName='../web/data/top-tweets/portuguese.'+k+'.top.followers'
outFile=csv.writer(open(fileName,'w'),delimiter='\t')
for id in reversed(v):
outFile.writerow([id[1]])
print '\t',id
# Write out top follow count tweet ids for tweets, by topic
outFile=None
# Need to flush out file handle
# Sometimes last file doesn't get written otherwise
In [65]:
for k,v in tweetTopicCounter.items():
if not k=='None':
sortedTweets=sorted(v.iteritems(),key=lambda x:x[1])
sortedTweets.reverse()
fileName='../web/data/top-tweets/portuguese.'+k+'.top.retweet'
outFile=csv.writer(open(fileName,'w'),delimiter='\t')
# print ''
print k
# print '------------'
for t in sortedTweets[0:10]:
print '\t',t[1],t[0][0],t[0][1]
outFile.writerow([t[0][1]])
print 'line',fileName
# Write out top retweeted tweet id's by topic
outFile=None
In [41]:
# Persist both counters so downstream analysis doesn't need to re-scan
# the raw stream. Pickle files must be opened in BINARY mode ('wb');
# text mode can corrupt the stream on some platforms. `with` replaces
# the manual close().
import pickle
with open('top_tweets.dat','wb') as outFile:
    pickle.dump(tweetCounter,outFile)
    pickle.dump(tweetTopicCounter,outFile)
In [42]:
# NOTE(review): `sortedTweets` here is hidden state -- whatever the last
# per-topic iteration of the export loop above left behind, NOT a global
# ranking. Confirm this is the intended input for the similarity check below.
topTweets=sortedTweets[0:20]
#topTweets=[(re.sub(r'http:\/\/[a-zA-Z0-9\.\/]+','',t[0]),t[1]) for t in topTweets]
In [17]:
def filterUrl(t):
    """Strip URLs from tweet text so they are not included in the
    similarity calculation.

    Generalised from the original pattern, which only matched plain
    http:// URLs made of [a-zA-Z0-9./] -- it missed https, hyphens,
    query strings, fragments, etc. Any run of non-whitespace after the
    scheme is now removed.
    """
    return re.sub(r'https?:\/\/\S+', '', t)
In [18]:
for i in itertools.combinations(topTweets,2):
similarity=difflib.SequenceMatcher(None,filterUrl(i[0][0]),filterUrl(i[1][0])).ratio()
if similarity>0.7:print similarity,'\n',i[0][0],'\n',i[1][0],'\n\n'
In [1]:
# Apply the project's custom notebook stylesheet.
# `with` closes the file handle (previously leaked by open(...).read()).
from IPython.core.display import HTML
with open("../css/custom.css", "r") as cssFile:
    styles = cssFile.read()
HTML(styles)
Out[1]:
In [ ]: