Parse Twitter Data

  1. Import retrieved JSON files (from S3)
  2. Read in individual tweets
  3. Geolocate tweets from the author's location field
  4. Add gender, inferred from the author's name
  5. Create a CSV file (dropping unwanted data)

Get Data and Enrich It


In [2]:
import sys

In [3]:
sys.path.append('/mnt/home/ubuntu/projects/tools/')

In [10]:
import sys,re,json,os,csv,glob
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
import time,random,traceback
from geopy import distance
import geolocator
# Initialise the geolocator and load its world place data
geo=geolocator.Geolocator()
geo.init()


WARNING:geopy:BeautifulSoup was not found. The SemanticMediaWiki geocoder will not work.
Loading the world...
Oh, the world is already out there...

In [2]:
# Create data folder
# !mkdir -p ../data

In [ ]:
# Grab the (JSON) files from S3; this takes a while
# !s3cmd sync s3://plny-protectthegoal/tweets ../data

In [11]:
files=glob.glob('../data/2014-06/DataSift*json')
files.sort()
print files[0]


../data/2014-06/DataSift-5feb4f2f6ae45ec94ee52ef01eb40052-1405010687.json

In [12]:
# Find Number of JSON Files
print('We have a total of %d files' % len(files))


We have a total of 69 files

Number of tweets


In [13]:
tweets=[]
for fileName in files:
    # Read each file as one long string and convert to unicode
    fileString=open(fileName,'r').read().decode('utf-8')
    # Split into lines and parse each non-empty line as JSON
    # (skipping blanks avoids a ValueError on a trailing newline)
    fileTweets=[json.loads(line) for line in fileString.split('\n') if line.strip()]
    # Add this file's tweets to the global list
    tweets.extend(fileTweets)
print('We have %d tweets' % len(tweets))


We have 85 tweets

Geolocate From User Location


In [14]:
geoError=0
for tweet in tweets:
  try:
    # Prefer the original author's location if this is a retweet
    tweet['geolocated']=geo.geoLocate(tweet['twitter']['retweet']['user']['location'])[0][3]
  except:
    try:
      # Fall back to the tweeting user's own location field
      tweet['geolocated']=geo.geoLocate(tweet['twitter']['user']['location'])[0][3]
    except:
      geoError+=1
      tweet['geolocated']=None
print('Couldn\'t geolocate %d tweets' % geoError)
print('Managed to geolocate %d p.c.' % (100.0*(1.0-(float(geoError)/len(tweets)))))


Couldn't geolocate 10 tweets
Managed to geolocate 88 p.c.
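
The try/except cascade above can be wrapped in a small helper if the same lookup is needed elsewhere. A minimal sketch (locateTweet is a hypothetical name; it assumes, as the [0][3] indexing above implies, that geo.geoLocate() returns a ranked list of candidate matches whose fourth field is the resolved place name):

def locateTweet(tweet):
  # Prefer the retweeted author's location, then fall back to the
  # tweeting user's own free-text location field.
  for path in (('retweet','user','location'),('user','location')):
    try:
      node=tweet['twitter']
      for key in path:
        node=node[key]
      return geo.geoLocate(node)[0][3]
    except:
      continue
  return None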

In [9]:
# Testing that it worked
tweets[0]['geolocated']

Insert Gender


In [15]:
import gender
g=gender.Gender()
g.gender(tweets[2]['interaction']['author']['name']) # Testing that it works


Out[15]:
{u'MIGUEL LUCERO': {'gender': 'male',
  'probability': 0.9940737157197714,
  'volume_female': 980.0,
  'volume_male': 164385.0}}

In [16]:
# Gender of tweeter or retweeter
genderError=0
for tweet in tweets:
  try:
    tweet['gender']=g.gender(tweet['interaction']['author']['name'])
  except:
    genderError+=1
    tweet['gender']=None
print('Couldn\'t add gender probability for %d tweets' % genderError)
print('Managed to add gender to %d p.c.' % (100.0*(1.0-(float(genderError)/len(tweets)))))


Couldn't add gender probability for 0 tweets
Managed to add gender to 100 p.c.

In [18]:
# Testing that it worked
tweets[0]['gender'].values()[0]['gender']


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-18-894e52995262> in <module>()
      1 # Testing that it worked
----> 2 tweets[0]['gender'].values()[0]['gender']

IndexError: list index out of range
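
This IndexError is the gender lookup's failure mode rather than a bug in the loop above: g.gender() returns an empty dict when it can't match a name, so .values()[0] has nothing to index (such tweets are why the UNGP gender columns fall back to NaN in the CSV export below). A guarded accessor avoids the crash; a minimal sketch (genderLabel is a hypothetical helper):

def genderLabel(tweet):
  # The gender dict is keyed by the matched name and is empty when
  # the name isn't recognised; it is None if g.gender() raised earlier.
  try:
    return tweet['gender'].values()[0]['gender']
  except (AttributeError,IndexError):
    return None

genderLabel(tweets[0])  # returns None instead of raising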

Save Data to Disk

Save as JSON


In [19]:
with open('../data/protectthegoal.json','wb') as f: f.write(json.dumps(tweets))
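
By default json.dumps() escapes all non-ASCII characters, which is safe but makes the file hard to read by eye. If readable UTF-8 output is preferred, a minor variant (a sketch, not what this notebook ran):

import codecs
with codecs.open('../data/protectthegoal.json','w',encoding='utf-8') as f:
  f.write(json.dumps(tweets,ensure_ascii=False))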

Save as CSV


In [20]:
# Keep a handle on the file so it can be closed (and flushed) later
outHandle=open('../data/protectthegoal.csv','wb')
outFile=csv.writer(outHandle,delimiter='\t')

In [21]:
nIdError=0
nDateError=0
nContentError=0
nTypeError=0
nLanguageError=0
nTwitterLanguageError=0
nLocationError=0
nLatError=0
nLongError=0
nUngpLocationError=0
nGenderError=0
nUngpGenderError=0
nUngpGenderProbError=0
nFollowersError=0
nFriendsError=0
nSentimentError=0
nTopicError=0
nSubTopicError=0

documents=[]

for tweet in tweets:
  outList=[]
  try:
    outList.append(tweet['interaction']['id'])
    documents.append(tweet['interaction']['id'])
  except:
    outList.append('NaN')
    nIdError+=1
  try:
    outList.append(tweet['interaction']['created_at'])
    documents.append(tweet['interaction']['created_at'])
  except:
    outList.append('NaN')
    nDateError+=1
  try:
    outList.append(tweet['interaction']['content'].encode('utf-8'))
    documents.append(tweet['interaction']['content'].encode('utf-8'))
  except:
    #print traceback.print_exc()
    #print tweet['interaction']['content']
    outList.append('NaN')
    nContentError+=1
  try:
    outList.append(tweet['interaction']['type'].encode('utf-8'))
    documents.append(tweet['interaction']['type'].encode('utf-8'))
  except:
    outList.append('NaN')
    nTypeError+=1
  try:
    outList.append(tweet['language']['tag'].encode('utf-8'))
    documents.append(tweet['language']['tag'].encode('utf-8'))
  except:
    outList.append('NaN')
    nLanguageError+=1
  try:
    outList.append(tweet['twitter']['lang'].encode('utf-8'))
    documents.append(tweet['twitter']['lang'].encode('utf-8'))
  except:
    outList.append('NaN')
    nTwitterLanguageError+=1
  try:
    outList.append(tweet['twitter']['user']['location'].encode('utf-8'))
    documents.append(tweet['twitter']['user']['location'].encode('utf-8'))
  except:
    outList.append('NaN')
    nLocationError+=1
  try:
    outList.append(tweet['twitter']['geo']['latitude'])
    documents.append(tweet['twitter']['geo']['latitude'])
  except:
    outList.append('NaN')
    nLatError+=1
  try:
    outList.append(tweet['twitter']['geo']['longitude'])
    documents.append(tweet['twitter']['geo']['longitude'])
  except:
    outList.append('NaN')
    nLongError+=1
  try:
    outList.append(tweet['geolocated'].encode('utf-8'))
    documents.append(tweet['geolocated'].encode('utf-8'))
  except:
    outList.append('NaN')
    nUngpLocationError+=1
  try:
    outList.append(tweet['demographic']['gender'].encode('utf-8'))
    documents.append(tweet['demographic']['gender'].encode('utf-8'))
  except:
    outList.append('NaN')
    nGenderError+=1
  try:
    outList.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
    documents.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
  except:
    outList.append('NaN')
    nUngpGenderError+=1
  try:
    outList.append(tweet['gender'].values()[0]['probability'])
    documents.append(tweet['gender'].values()[0]['probability'])
  except:
    outList.append('NaN')
    nUngpGenderProbError+=1
  try:
    outList.append(tweet['twitter']['user']['followers_count'])
    documents.append(tweet['twitter']['user']['followers_count'])
  except:
    outList.append('NaN')
    nFollowersError+=1
  try:
    outList.append(tweet['twitter']['user']['friends_count'])
    documents.append(tweet['twitter']['user']['friends_count'])
  except:
    outList.append('NaN')
    nFriendsError+=1
  try:
    outList.append(tweet['interaction']['tag_tree']['sentiment'].values()[0])
    documents.append(tweet['interaction']['tag_tree']['sentiment'].values()[0])
  except:
    outList.append('NaN')
    nSentimentError+=1
  try:
    outList.append(tweet['interaction']['tag_tree']['topic'])
    documents.append(tweet['interaction']['tag_tree']['topic'])
  except:
    outList.append('NaN')
    nTopicError+=1
  try:
    outList.append(tweet['interaction']['tag_tree']['topic'].values()[0])
    documents.append(tweet['interaction']['tag_tree']['topic'].values()[0])
  except:
    outList.append('NaN')
    nSubTopicError+=1

  outFile.writerow(outList)

outHandle.close()

print "%d ID errors." % nIdError
print "%d Date errors." % nDateError
print "%d Content errors." % nContentError
print "%d Type errors." % nTypeError
print "%d DataSift language errors." % nLanguageError
print "%d Twitter language errors." % nTwitterLanguageError
print "%d Twitter Location errors." % nLocationError
print "%d Twitter Latitude errors." % nLatError
print "%d Twitter Longitude errors." % nLongError
print "%d UNGP Location errors." % nUngpLocationError
print "%d Gender errors." % nGenderError
print "%d UNGP gender errors." % nUngpGenderError
print "%d UNGP gender probability errors." % nUngpGenderProbError
print "%d Follower errors." % nFollowersError
print "%d Friends errors." % nFriendsError
print "%d Sentiment errors." % nSentimentError
print "%d Topic errors." % nTopicError
print "%d Sub-topic errors." % nSubTopicError


0 ID errors.
0 Date errors.
0 Content errors.
0 Type errors.
2 DataSift language errors.
0 Twitter language errors.
2 Twitter Location errors.
0 Twitter Latitude errors.
0 Twitter Longitude errors.
10 UNGP Location errors.
29 Gender errors.
23 UNGP gender errors.
23 UNGP gender probability errors.
0 Follower errors.
0 Friends errors.
85 Sentiment errors.
0 Topic errors.
0 Sub-topic errors.
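
The eighteen near-identical try/except blocks above can be collapsed into one table-driven loop. A minimal sketch of an equivalent restructuring (not what this notebook ran; the field list is abbreviated and would need all eighteen (name, getter) pairs, in column order, to reproduce the CSV exactly):

from collections import OrderedDict

FIELDS=OrderedDict([
  ('ID',      lambda t: t['interaction']['id']),
  ('Date',    lambda t: t['interaction']['created_at']),
  ('Content', lambda t: t['interaction']['content'].encode('utf-8')),
  ('Location',lambda t: t['twitter']['user']['location'].encode('utf-8')),
  # ...one (name, getter) entry per CSV column...
])

errorCounts=dict((name,0) for name in FIELDS)
for tweet in tweets:
  outList=[]
  for name,getter in FIELDS.items():
    try:
      value=getter(tweet)
      outList.append(value)
      documents.append(value)  # keep the parallel documents list in sync
    except:
      outList.append('NaN')
      errorCounts[name]+=1
  outFile.writerow(outList)

for name in FIELDS:
  print "%d %s errors." % (errorCounts[name],name)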

In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)

