In [2]:
import sys
In [3]:
sys.path.append('/mnt/home/ubuntu/projects/tools/')
In [10]:
import sys, re, json, os, csv, glob
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
import time, random, traceback
from geopy import distance
# Local geolocation tool, found on the tools path added to sys.path above
import geolocator
geo = geolocator.Geolocator()
geo.init()
In [2]:
# Create data folder
# !mkdir -p ../data
In [ ]:
# Grab the (JSON) files from S3; this takes a while
# !s3cmd sync s3://plny-protectthegoal/tweets ../data
In [11]:
files = glob.glob('../data/2014-06/DataSift*json')
files.sort()
# Spot-check the first file name
print files[0]
In [12]:
# Find Number of JSON Files
print('We have a total of %d files' % len(files))
In [13]:
tweets = []
for file in files:
    # Read each file as one long string and decode to unicode
    fileString = open(file, 'r').read().decode('utf-8')
    # Each line is a separate JSON document; skip blank lines (e.g. a trailing newline)
    fileTweets = [json.loads(line) for line in fileString.split('\n') if line.strip()]
    # Add this file's tweets to the global list
    tweets.extend(fileTweets)
print('We have %d tweets' % len(tweets))
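In [ ]:
# Alternative sketch (not used in the original run): the same load can be done line by
# line without holding each whole file in memory first. As in the cell above, this
# assumes one JSON document per line in every DataSift file.
tweets_alt = []
for fname in files:
    with open(fname, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line:
                tweets_alt.append(json.loads(line.decode('utf-8')))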
In [14]:
# Geolocate each tweet from the (re)tweeted user's free-text profile location
geoError = 0
for tweet in tweets:
    try:
        # For retweets, prefer the original tweeter's profile location
        tweet['geolocated'] = geo.geoLocate(tweet['twitter']['retweet']['user']['location'])[0][3]
    except:
        try:
            # Fall back to the author's own profile location
            tweet['geolocated'] = geo.geoLocate(tweet['twitter']['user']['location'])[0][3]
        except:
            geoError += 1
            tweet['geolocated'] = None
print("Couldn't geolocate %d tweets" % geoError)
print('Managed to geolocate %.1f%% of tweets' % (100.0 * (1.0 - float(geoError) / len(tweets))))
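In [ ]:
# Refactor sketch (not part of the original run): the nested try/except above can be
# collapsed into one helper. It assumes, as the cell above does, that geo.geoLocate(text)
# returns a list of candidate matches and that index [0][3] holds the place name we keep;
# both the dictionary keys and that index are copied from the code above.
def locate(tweet):
    for keys in (('twitter', 'retweet', 'user', 'location'),
                 ('twitter', 'user', 'location')):
        try:
            node = tweet
            for k in keys:
                node = node[k]
            return geo.geoLocate(node)[0][3]
        except Exception:
            continue
    return None

# Usage: tweet['geolocated'] = locate(tweet)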
In [9]:
# Testing that it worked
tweets[0]['geolocated']
In [15]:
# Infer gender from the author's display name
import gender
g = gender.Gender()
g.gender(tweets[2]['interaction']['author']['name'])  # Testing that it works
Out[15]:
In [16]:
# Gender of the tweeter or retweeter, inferred from the author's name
genderError = 0
for tweet in tweets:
    try:
        tweet['gender'] = g.gender(tweet['interaction']['author']['name'])
    except:
        genderError += 1
        tweet['gender'] = None
print("Couldn't add gender probability for %d tweets" % genderError)
print('Managed to add gender to %.1f%% of tweets' % (100.0 * (1.0 - float(genderError) / len(tweets))))
In [18]:
# Testing that it worked
tweets[0]['gender'].values()[0]['gender']
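In [ ]:
# Convenience sketch (not from the original notebook): the test above and the CSV export
# below read tweet['gender'].values()[0]['gender'] and ...['probability'], i.e. they
# assume g.gender() returns a dict whose first value holds those two keys. This helper
# wraps that pattern so a missing or empty result comes back as (None, None).
def gender_of(tweet):
    try:
        info = tweet['gender'].values()[0]
        return info['gender'], info['probability']
    except Exception:
        return None, None

# Usage: label, prob = gender_of(tweets[0])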
In [19]:
# Save the enriched tweets so later steps can reload them without re-running the above
with open('../data/protectthegoal.json', 'wb') as f:
    f.write(json.dumps(tweets))
In [20]:
# Tab-separated export, one row per tweet
outFile = csv.writer(open('../data/protectthegoal.csv', 'wb'), delimiter='\t')
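In [ ]:
# Optional sketch: write a header row first. These column names are not in the original
# notebook; they simply label, in order, the fields appended per tweet in the cell below.
header = ['id', 'created_at', 'content', 'type', 'datasift_language', 'twitter_language',
          'twitter_location', 'latitude', 'longitude', 'ungp_location',
          'gender', 'ungp_gender', 'ungp_gender_probability',
          'followers_count', 'friends_count', 'sentiment', 'topic', 'sub_topic']
outFile.writerow(header)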
In [21]:
nIdError=0
nDateError=0
nContentError=0
nTypeError=0
nLanguageError=0
nTwitterLanguageError=0
nLocationError=0
nLatError=0
nLongError=0
nUngpLocationError=0
nGenderError=0
nUngpGenderError=0
nUngpGenderProbError=0
nFollowersError=0
nFriendsError=0
nSentimentError=0
nTopicError=0
nSubTopicError=0
documents=[]
# Build one CSV row per tweet; any field that can't be read is written as 'NaN'
# and its error counter incremented
for tweet in tweets:
    outList = []
    try:
        outList.append(tweet['interaction']['id'])
        documents.append(tweet['interaction']['id'])
    except:
        outList.append('NaN')
        nIdError += 1
    try:
        outList.append(tweet['interaction']['created_at'])
        documents.append(tweet['interaction']['created_at'])
    except:
        outList.append('NaN')
        nDateError += 1
    try:
        outList.append(tweet['interaction']['content'].encode('utf-8'))
        documents.append(tweet['interaction']['content'].encode('utf-8'))
    except:
        #print traceback.print_exc()
        #print tweet['interaction']['content']
        outList.append('NaN')
        nContentError += 1
    try:
        outList.append(tweet['interaction']['type'].encode('utf-8'))
        documents.append(tweet['interaction']['type'].encode('utf-8'))
    except:
        outList.append('NaN')
        nTypeError += 1
    try:
        outList.append(tweet['language']['tag'].encode('utf-8'))
        documents.append(tweet['language']['tag'].encode('utf-8'))
    except:
        outList.append('NaN')
        nLanguageError += 1
    try:
        outList.append(tweet['twitter']['lang'].encode('utf-8'))
        documents.append(tweet['twitter']['lang'].encode('utf-8'))
    except:
        outList.append('NaN')
        nTwitterLanguageError += 1
    try:
        outList.append(tweet['twitter']['user']['location'].encode('utf-8'))
        documents.append(tweet['twitter']['user']['location'].encode('utf-8'))
    except:
        outList.append('NaN')
        nLocationError += 1
    try:
        outList.append(tweet['twitter']['geo']['latitude'])
        documents.append(tweet['twitter']['geo']['latitude'])
    except:
        outList.append('NaN')
        nLatError += 1
    try:
        outList.append(tweet['twitter']['geo']['longitude'])
        documents.append(tweet['twitter']['geo']['longitude'])
    except:
        outList.append('NaN')
        nLongError += 1
    try:
        outList.append(tweet['geolocated'].encode('utf-8'))
        documents.append(tweet['geolocated'].encode('utf-8'))
    except:
        outList.append('NaN')
        nUngpLocationError += 1
    try:
        outList.append(tweet['demographic']['gender'].encode('utf-8'))
        documents.append(tweet['demographic']['gender'].encode('utf-8'))
    except:
        outList.append('NaN')
        nGenderError += 1
    try:
        outList.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
        documents.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
    except:
        outList.append('NaN')
        nUngpGenderError += 1
    try:
        outList.append(tweet['gender'].values()[0]['probability'])
        documents.append(tweet['gender'].values()[0]['probability'])
    except:
        outList.append('NaN')
        nUngpGenderProbError += 1
    try:
        outList.append(tweet['twitter']['user']['followers_count'])
        documents.append(tweet['twitter']['user']['followers_count'])
    except:
        outList.append('NaN')
        nFollowersError += 1
    try:
        outList.append(tweet['twitter']['user']['friends_count'])
        documents.append(tweet['twitter']['user']['friends_count'])
    except:
        outList.append('NaN')
        nFriendsError += 1
    try:
        outList.append(tweet['interaction']['tag_tree']['sentiment'].values()[0])
        documents.append(tweet['interaction']['tag_tree']['sentiment'].values()[0])
    except:
        outList.append('NaN')
        nSentimentError += 1
    try:
        outList.append(tweet['interaction']['tag_tree']['topic'])
        documents.append(tweet['interaction']['tag_tree']['topic'])
    except:
        outList.append('NaN')
        nTopicError += 1
    try:
        outList.append(tweet['interaction']['tag_tree']['topic'].values()[0])
        documents.append(tweet['interaction']['tag_tree']['topic'].values()[0])
    except:
        outList.append('NaN')
        nSubTopicError += 1
    outFile.writerow(outList)
print "%d ID errors." % nIdError
print "%d Date errors." % nDateError
print "%d Content errors." % nContentError
print "%d Type errors." % nTypeError
print "%d DataSift language errors." % nLanguageError
print "%d Twitter language errors." % nTwitterLanguageError
print "%d Twitter Location errors." % nLocationError
print "%d Twitter Latitude errors." % nLatError
print "%d Twitter Longitude errors." % nLongError
print "%d UNGP Location errors." % nUngpLocationError
print "%d Gender errors." % nGenderError
print "%d UNGP gender errors." % nUngpGenderError
print "%d UNGP gender probability errors." % nUngpGenderProbError
print "%d Follower errors." % nFollowersError
print "%d Friends errors." % nFriendsError
print "%d Sentiment errors." % nSentimentError
print "%d Topic errors." % nTopicError
print "%d Sub-topic errors." % nSubTopicError
In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)
Out[1]:
In [ ]: