In [2]:
from IPython.core.display import HTML

# Read the notebook's custom stylesheet. The context manager closes the file
# as soon as its contents are in memory instead of leaking the handle.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()

# Last expression of the cell: IPython renders the returned HTML object.
HTML(styles)
Out[2]:
In [3]:
import sys,re,json,os,csv
import numpy as np
import cPickle as pickle
import uuid
from IPython.display import display_javascript, display_html, display
In [4]:
# Location of the pickled tweets that the next cell loads.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
picklepath = '/Users/rcn/Desktop/twitter-analysis/data/raw/tweets.p'
In [16]:
# Load the pickled tweets. The `with` block guarantees the file is closed
# even if unpickling fails.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(picklepath, "rb") as pickle_file:
    tweets = pickle.load(pickle_file)
In [17]:
# Report how many tweets were loaded.
tweet_total = len(tweets)
print('We have %d tweets in total' % tweet_total)
Let's make the JSON look nice (with thanks to the [renderjson](https://github.com/caldwell/renderjson) library).
In [18]:
class RenderJSON(object):
    """Display JSON data in the notebook through the renderjson JavaScript library.

    Parameters
    ----------
    json_data : dict, list or str
        A dict or list is serialised with ``json.dumps``. Any other value is
        taken to already be a JSON string and is embedded unchanged.

    Attributes
    ----------
    json_str : the JSON text that gets embedded in the generated JavaScript.
    uuid : id of the ``<div>`` the rendered JSON is attached to.
    """

    def __init__(self, json_data):
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # Fixed: this branch used to store the `json` *module* instead of
            # the value passed in, so string input could never be rendered.
            self.json_str = json_data
        # Unique id linking the placeholder <div> to the script that fills it.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """Emit an empty ``<div>`` and the JavaScript that renders the JSON into it."""
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): rawgit.com has been retired as a hosting service -
        # confirm this URL still resolves or point it at another copy of
        # renderjson.js.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)
In [19]:
# Render the first tweet with RenderJSON to inspect its fields.
first_tweet = tweets[0]
RenderJSON(first_tweet)
In [20]:
# Flatten every tweet's text onto a single line: '\n' becomes a space and
# '\r' is dropped. The tweet dicts are modified in place.
tweetLinebreakError = 0
for tweet in tweets:
    try:
        tweet['text'] = tweet['text'].replace('\n', ' ').replace('\r', '')
    except (KeyError, TypeError, AttributeError):
        # Only the failures this statement can produce are handled (missing
        # 'text' key, or a value/tweet of the wrong type). A bare `except:`
        # would also swallow KeyboardInterrupt and make the loop impossible
        # to interrupt.
        tweetLinebreakError += 1
        tweet['text'] = 'NaN'
print('Failed removing line breaks in %d tweets' % tweetLinebreakError)
In [ ]:
# Output locations used by the JSON, pickle and TSV export cells.
# NOTE(review): jsonpath and picklepath are empty placeholders - open('') fails,
# so fill them in before running the export cells. Assigning picklepath here
# also replaces the input path set near the top of the notebook.
# NOTE(review): tsvpath is an absolute, machine-specific path.
jsonpath = '' # Path to JSON file
picklepath = '' # Path to pickle file
tsvpath = '/Users/rcn/Desktop/twitter-analysis/data/tweets.tsv' # Path to tsv file
In [11]:
# Serialise the whole tweet list to the JSON file at `jsonpath`.
with open(jsonpath, 'wb') as json_out:
    json.dump(tweets, json_out)
In [12]:
# Serialise the whole tweet list to the pickle file at `picklepath`.
with open(picklepath, "wb") as pickle_out:
    pickle.dump(tweets, pickle_out)
In [22]:
# Column titles of the TSV export.
header = [
    'Tweet ID', 'Time', 'User', 'Username', 'Text', 'Language', 'User Location',
    'Geo', 'Place', 'Likes', 'Retweets', 'Followers', 'Friends', 'Listed',
    'Favourites', 'Hashtags', 'Mentions', 'Links', 'User Description',
]

# Keep the file handle under its own name. When it was created inline inside
# csv.writer(...) nothing could flush or close it, so buffered rows were only
# written whenever the interpreter happened to release the object.
# Call tsvFile.close() once every row has been written.
tsvFile = open(tsvpath, 'wb')  # binary mode, as the Python 2 csv module expects
outFile = csv.writer(tsvFile, delimiter='\t')
outFile.writerow(header)
In [23]:
# Write one TSV row per tweet through `outFile` and count, per column, how
# often a value could not be extracted (such cells are written as 'NaN').
#
# The field table below replaces nineteen copy-pasted try/except stanzas and
# fixes extractions that could never succeed:
#   * 'geo' and 'place' are nested objects (or null), not strings, so calling
#     .encode() on them failed for every tweet;
#   * hashtag / mention / url entities are dicts, so .lower() on the items
#     failed whenever a tweet had any;
#   * the joined entity text was passed to .decode('utf-8') although it is
#     already decoded text - it has to be *encoded*, like the other columns.
# NOTE(review): the nested keys used here ('coordinates', 'full_name', 'text',
# 'screen_name', 'expanded_url') assume Twitter REST API v1.1 tweet dicts -
# confirm against your data.

MISSING_VALUE = 'NaN'

# Only the failures an extraction can legitimately produce are treated as
# "value missing"; a bare `except:` would also swallow KeyboardInterrupt.
EXTRACTION_ERRORS = (KeyError, IndexError, TypeError, AttributeError, UnicodeError)


def _utf8(value):
    """Encode a text value as UTF-8, the form the Python 2 csv writer needs."""
    return value.encode('utf-8')


def _joinEntities(entities, key, lowercase=True):
    """Comma-join ``entity[key]`` over a list of entity dicts, UTF-8 encoded."""
    values = [entity[key] for entity in entities]
    if lowercase:
        values = [value.lower() for value in values]
    return _utf8(','.join(values))


# (label used in the error report, extractor) - one entry per TSV column, in
# the same order as the header row.
TWEET_FIELDS = [
    ('ID',            lambda t: t['id']),
    ('date',          lambda t: t['created_at']),
    ('name',          lambda t: _utf8(t['user']['name'])),
    ('screen name',   lambda t: t['user']['screen_name']),
    ('text',          lambda t: _utf8(t['text'])),
    ('language',      lambda t: t['lang']),
    ('user location', lambda t: _utf8(t['user']['location'])),
    ('tweet geo',     lambda t: ','.join(repr(c) for c in t['geo']['coordinates'])),
    ('tweet place',   lambda t: _utf8(t['place']['full_name'])),
    ('likes',         lambda t: t['favorite_count']),
    ('retweets',      lambda t: t['retweet_count']),
    ('followers',     lambda t: t['user']['followers_count']),
    ('friends',       lambda t: t['user']['friends_count']),
    ('listed',        lambda t: t['user']['listed_count']),
    ('favourites',    lambda t: t['user']['favourites_count']),
    ('hashtag',       lambda t: _joinEntities(t['entities']['hashtags'], 'text')),
    ('mention',       lambda t: _joinEntities(t['entities']['user_mentions'], 'screen_name')),
    # URLs are not lower-cased: their paths are case-sensitive.
    ('link',          lambda t: _joinEntities(t['entities']['urls'], 'expanded_url', lowercase=False)),
    ('Description',   lambda t: _utf8(t['user']['description'])),
]

errorCounts = [0] * len(TWEET_FIELDS)

# `documents` stays a flat list: every field value of every tweet, in row order.
documents = []
for tweet in tweets:
    outList = []
    for column, field in enumerate(TWEET_FIELDS):
        try:
            value = field[1](tweet)
        except EXTRACTION_ERRORS:
            value = MISSING_VALUE
            errorCounts[column] += 1
        outList.append(value)
    documents.extend(outList)
    outFile.writerow(outList)

# Keep the individual counter names available for any cell that reads them.
(nIdError, nDateError, nNameError, nScreenNameError, nTextError,
 nLanguageError, nLocationError, nGeoError, nPlaceError, nLikesError,
 nRetweetsError, nFollowersError, nFriendsError, nListedError,
 nFavouritesError, nTagsError, nMentionsError, nLinksError,
 nDescriptionError) = errorCounts

# Same report lines as before, e.g. "3 ID errors."
for field, count in zip(TWEET_FIELDS, errorCounts):
    print('%d %s errors.' % (count, field[0]))
In [ ]: