In [1]:
import json
# Directory that holds the input file. It must end with a path separator because
# the full path is later built by plain string concatenation (path + tweetFile).
# NOTE(review): absolute, user-specific location — adjust before running elsewhere.
path='/home/sriram/Downloads/'
# Name of the input file inside `path`.
tweetFile='Twitter_data1.txt'
import pandas as pd
In [2]:
# Read the input file line by line, parsing each line as one JSON document.
tweets_data = []
tweets_data_path = path + tweetFile
# `with` closes the file handle even if reading fails part-way through.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Lines that are not valid JSON (blank ones included) are skipped.
            # json signals them with ValueError / its subclass JSONDecodeError;
            # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        tweets_data.append(tweet)
In [3]:
# Number of successfully parsed records. The parenthesised form prints the same
# text on Python 2 and is valid syntax on Python 3.
print(len(tweets_data))
In [33]:
from textblob import TextBlob
count = 0  # not read in this cell


def _dig(record, keys, default):
    """Follow `keys` one by one into `record`; return `default` if any step fails.

    A step fails when a key is missing (KeyError), an index is out of range
    (IndexError), or the current value cannot be subscripted that way, e.g. it
    is None (TypeError).
    """
    value = record
    try:
        for key in keys:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return default
    return value


# (column name, lookup path inside one parsed record, fallback value)
_TWEET_FIELDS = [
    ('text', ('text',), ""),
    ('lang', ('lang',), 'NA'),
    # The fallback for a missing 'retweeted' belongs in the 'retweeted' column;
    # it must not overwrite 'lang'.
    ('retweeted', ('retweeted',), 'NA'),
    ('location', ('user', 'location'), 'NA'),
    ('country_code', ('place', 'country_code'), ''),
    # lon / lat are elements [0][0][0] and [0][0][1] of the place bounding box.
    ('lon', ('place', 'bounding_box', 'coordinates', 0, 0, 0), 'NA'),
    ('lat', ('place', 'bounding_box', 'coordinates', 0, 0, 1), 'NA'),
]

# 'state' and 'sentiment' are created empty here and filled in later.
tweets = pd.DataFrame(index=range(len(tweets_data)), columns=['text','lang','retweeted','location','state','sentiment','country_code','lat','lon'])

# Fill each column in one assignment instead of cell-by-cell chained indexing
# (tweets[col][i] = ...), which may write to a temporary copy.
for column, keys, default in _TWEET_FIELDS:
    tweets[column] = pd.Series(
        [_dig(tweet, keys, default) for tweet in tweets_data],
        index=tweets.index,
        dtype=object  # keep mixed values (numbers alongside 'NA') untouched
    )
In [67]:
import time
import zipcode
start_time = time.time()
count = 0
for i in range(len(tweets)):
    # Sentiment: polarity of the first sentence only; 0 when it cannot be
    # computed (no sentences, non-string text, ...). `except Exception` keeps
    # the best-effort fallback but lets KeyboardInterrupt stop this long loop.
    try:
        sentence = TextBlob(tweets.at[i, 'text']).sentences[0]
        tweets.at[i, 'sentiment'] = sentence.sentiment.polarity
    except Exception:
        tweets.at[i, 'sentiment'] = 0

    # State, first choice: the part after the first comma of the free-text
    # location. Strip whitespace so that "City, ST" yields "ST" (length 2)
    # rather than " ST" (length 3), which would never pass the check below.
    try:
        stateFromData = tweets.at[i, 'location'].split(',')[1].strip()
    except (AttributeError, IndexError):
        # Location is not a string, or contains no comma.
        stateFromData = ''

    if len(stateFromData) == 2:
        tweets.at[i, 'state'] = stateFromData
    elif tweets.at[i, 'lat'] != 'NA':
        # State, second choice: look up zip codes around the coordinates,
        # widening the radius (10, 20, 40, 70, ...) until something is found.
        # NOTE(review): there is no upper bound on the radius; the loop only
        # ends once isinradius returns a non-empty result.
        point = (tweets.at[i, 'lat'], tweets.at[i, 'lon'])
        radius = 10
        incre = 10
        zips = zipcode.isinradius(point, radius)
        while len(zips) == 0:
            radius = radius + incre
            zips = zipcode.isinradius(point, radius)
            incre = incre + 10
        myzip = zipcode.isequal(str(zips[0].zip))
        tweets.at[i, 'state'] = myzip.state
    else:
        tweets.at[i, 'state'] = 'NA'

    count += 1
    if count % 1000 == 0:
        # Formatted string so the message is the same on Python 2 and 3
        # (print (a, b) prints a tuple on Python 2).
        print("%d Tweets processed" % count)
print("--- %s seconds ---" % (time.time() - start_time))
In [63]:
# Scratch check: length of a two-character unicode string.
len(u'IL')
Out[63]:
In [18]:
import time
# Micro-benchmark: one fixed-point radius lookup per row of `tweets`.
start_time = time.time()
for i in range(len(tweets)):
    zipcode.isinradius((39.98, -87.29), 20)
# Report the elapsed time; otherwise the timer started above is never read.
print("--- %s seconds ---" % (time.time() - start_time))
Out[18]:
In [53]:
# Preview the first rows of the tweets frame.
tweets.head()
Out[53]:
In [13]:
# Radius lookup around a fixed sample point. Bound to `nearby_zips` rather than
# `zip` so the builtin zip() stays usable for the rest of the kernel session.
nearby_zips = zipcode.isinradius((39.98, -87.29), 20)
Out[13]:
In [ ]: