In [88]:
import json
import pandas as pd

path = 'C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\set 1\\'
tweetFile = 'Twitter_data2.txt'
statedata = pd.read_csv('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\states.csv')
In [89]:
tweets_data = []
tweets_data_path = path + tweetFile
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except ValueError:
            # skip truncated/malformed lines from the streaming capture
            continue
In [90]:
from textblob import TextBlob

tweets = pd.DataFrame(index=range(len(tweets_data)),
                      columns=['text', 'created_at', 'lang', 'retweeted', 'location', 'state',
                               'sentiment', 'sentiment_cat', 'country_code', 'lat', 'lon'])
# Pull each field out of the raw JSON; 'place' can be missing or null,
# hence the broader except clauses on the place-derived fields.
for i in range(len(tweets_data)):
    try:
        tweets['text'][i] = tweets_data[i]['text']
    except KeyError:
        tweets['text'][i] = ""
    try:
        tweets['lang'][i] = tweets_data[i]['lang']
    except KeyError:
        tweets['lang'][i] = 'NA'
    try:
        tweets['retweeted'][i] = tweets_data[i]['retweeted']
    except KeyError:
        tweets['retweeted'][i] = 'NA'
    try:
        tweets['location'][i] = tweets_data[i]['user']['location']
    except (KeyError, TypeError):
        tweets['location'][i] = 'NA'
    try:
        tweets['country_code'][i] = tweets_data[i]['place']['country_code']
    except (KeyError, TypeError):
        tweets['country_code'][i] = ''
    try:
        tweets['lon'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][0]
    except (KeyError, TypeError):
        tweets['lon'][i] = 'NA'
    try:
        tweets['lat'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][1]
    except (KeyError, TypeError):
        tweets['lat'][i] = 'NA'
    try:
        tweets['created_at'][i] = tweets_data[i]['created_at']
    except KeyError:
        tweets['created_at'][i] = 'NA'
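If the per-field try/except blocks grow unwieldy, the same extraction can be expressed with a small helper that walks a key path and returns a default on any miss. This is a sketch, not part of the original notebook; the helper name safe_get is ours.

def safe_get(d, keys, default='NA'):
    # Follow a nested key/index path, returning `default` on any miss.
    for k in keys:
        try:
            d = d[k]
        except (KeyError, TypeError, IndexError):
            return default
    return d

# e.g. tweets['location'][i] = safe_get(tweets_data[i], ['user', 'location'])
#      tweets['lon'][i] = safe_get(tweets_data[i], ['place', 'bounding_box', 'coordinates', 0, 0, 0])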
In [91]:
import time
import zipcode
start_time = time.time()
count = 0
for i in range(len(tweets)):
    # Polarity of the first sentence stands in for the whole tweet.
    blob = TextBlob(tweets['text'][i])
    try:
        sentence = blob.sentences[0]
        tweets['sentiment'][i] = sentence.sentiment.polarity
    except IndexError:
        tweets['sentiment'][i] = 0
    if tweets['sentiment'][i] < 0:
        tweets['sentiment_cat'][i] = 'Neg'
    elif tweets['sentiment'][i] > 0:
        tweets['sentiment_cat'][i] = 'Pos'
    else:
        tweets['sentiment_cat'][i] = 'Neu'
    # Prefer a two-letter state code from the user-supplied location string.
    try:
        stateFromData = tweets['location'][i].split(',')[1].strip()
    except (AttributeError, IndexError):
        stateFromData = ''
    if len(stateFromData) == 2:
        tweets['state'][i] = stateFromData.upper()
    elif tweets['lat'][i] != 'NA':
        # Fall back to the nearest zip code, widening the search radius until a hit.
        radius = 10
        incre = 10
        zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
        while len(zips) == 0:
            radius = radius + incre
            zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
            incre = incre + 10
        myzip = zipcode.isequal(str(zips[0].zip))
        tweets['state'][i] = myzip.state
    else:
        tweets['state'][i] = 'NA'
    count += 1
    if count % 1000 == 0:
        print(count, " Tweets processed")
print("--- %s seconds ---" % (time.time() - start_time))
tweetsFinal = pd.merge(tweets, statedata, how='left', left_on="state", right_on="Abbreviation")
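The per-row Neg/Pos/Neu assignment above can also be done in one vectorized step. A minimal sketch, assuming the sentiment column has already been filled and can be cast to float:

import numpy as np
s = tweets['sentiment'].astype(float)
# Nested np.where mirrors the if/elif/else above: <0 -> Neg, >0 -> Pos, else Neu.
tweets['sentiment_cat'] = np.where(s < 0, 'Neg', np.where(s > 0, 'Pos', 'Neu'))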
In [ ]:
import json
#a = tweets.groupby(['state'],as_index=False).count()
output = pd.DataFrame({'value': tweetsFinal.groupby(["State"]).size()}).reset_index()
outputJsonList = output.to_json(orient="records")
# Trim the wrapper, uppercase the state names, and rename the keys to the
# unquoted ucName/value form the visualization expects.
finalOutput = outputJsonList[33:len(outputJsonList) - 1].upper().replace("\"STATE\"", "ucName").replace("\"VALUE\"", "value")
print(finalOutput)
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\usStates-tweetCount.json', 'w') as outfile:
    outfile.write(finalOutput)
#-------------------------------------
# Spot-check the zipcode lookup on the first geotagged tweet.
zips = zipcode.isinradius((tweets['lat'][0], tweets['lon'][0]), radius)
print(zips[0].zip)
myzip = zipcode.isequal(str(81507))
print(myzip.statename)
#-------------------------------------------
output2 = pd.DataFrame({'value': tweetsFinal.groupby(["State", "sentiment_cat"]).size()}).reset_index()
output2.head()
#-------------------------------------------
import numpy as np
outData = pd.pivot_table(output2, values='value', index=['State'], columns=['sentiment_cat'], aggfunc=np.sum)
outData = outData.fillna(0)
outData.head()
#-------------------------------------------
outData['sum'] = outData[['Neg', 'Neu', 'Pos']].sum(axis=1)
outData['max'] = outData[['Neg', 'Neu', 'Pos']].idxmax(axis=1)
outData['maxFinal'] = outData['max']
#-------------------------------------------
# Encode the dominant sentiment as a numeric code for the map colouring.
for i in range(len(outData)):
    if outData['max'][i] == "Pos":
        outData['maxFinal'][i] = '1'
    elif outData['max'][i] == "Neu":
        outData['maxFinal'][i] = '-1'
    else:
        outData['maxFinal'][i] = '2'
outData
#-------------------------------------------
outData['state'] = outData.index
outData = outData.reset_index(drop=True)
#-------------------------------------------
d = "var data =[\n"
for i in range(len(outData)):
    row = outData.iloc[i]
    d += "['" + row['state'] + "'," + ",".join([str(x) for x in row[:5]]) + "],\n"
d += ']'
#-------------------------------------------
In [92]:
from time import strptime

td1 = pd.DataFrame({'value': tweets.groupby(["created_at"]).size()}).reset_index()
timedata = td1[td1.created_at != 'NA'].reset_index(drop=True)
data = ["var data=["]
# created_at looks like "Wed Mar 29 05:12:34 +0000 2017"; slice out the pieces
# and map the month abbreviation to its number via strptime.
for i in range(len(timedata)):
    created = timedata['created_at'][i]
    year = created[-4:]
    mon = strptime(created[4:7], '%b').tm_mon
    date = created[7:10]
    hour = created[10:13]
    minu = created[14:16]
    sec = created[17:20]
    value = timedata['value'][i]
    data.append("[Date.UTC(" + str(year) + "," + str(mon) + "," + str(date) + "," +
                str(hour) + "," + str(minu) + "," + str(sec) + ")," + str(value) + "]")
data = ",\n".join(data) + "\n]"
data = data.replace("[,", "[")
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\tweet_cnt.json', 'w') as outfile:
    outfile.write(data)
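Since Twitter's created_at strings follow one fixed layout ("Wed Mar 29 05:12:34 +0000 2017"), the slicing above could also be replaced wholesale by a pandas parse. A sketch, assuming a pandas version whose to_datetime supports %z:

ts = pd.to_datetime(timedata['created_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
# NaT marks any row that failed to parse (e.g. a leftover 'NA' placeholder).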
In [93]:
# Tokenizer:
import re
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english') + ['and']
emoticons_str = r"""
    (?:
        [:=;]              # Eyes
        [oO\-]?            # Nose (optional)
        [D\)\]\(\]/\\OpP]  # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    tokens = tokens_re.findall(s)
    # drop URLs, single characters, and stopwords
    return [x for x in tokens if 'http' not in x and len(x) > 1 and x.lower() not in stop]

def preprocess(s, lowercase=True):
    tokens = tokenize(s)
    if lowercase:
        # lowercase everything except emoticons, whose characters are meaningful as-is
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
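A quick sanity check of the tokenizer on a made-up tweet (the sample text and expected output are ours, assuming NLTK's English stopword list):

sample = "RT @user: Loving the new #dataviz tools :) http://example.com 100%"
print(preprocess(sample))
# roughly: ['rt', '@user', 'loving', 'new', '#dataviz', 'tools', ':)', '100']
# the URL, single characters, and stopwords ('the') are filtered out;
# the emoticon survives un-lowercased.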
In [94]:
#Co-occurrence:
from itertools import combinations
from collections import Counter

def collect_pairs(lines):
    pair_counter = Counter()
    for line in lines:
        # Dedupe within a line and sort so each pair has one canonical order.
        unique_tokens = sorted(set(line))
        combos = combinations(unique_tokens, 2)
        pair_counter += Counter(combos)
    return pair_counter

t2 = []
t1 = tweets['text']
for t in range(len(t1)):
    t2.append(preprocess(t1[t]))
pairs = collect_pairs(t2)
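collect_pairs on a toy input, to make the counting concrete (example data is ours):

demo = [['apple', 'banana', 'apple'], ['banana', 'apple']]
print(collect_pairs(demo))
# Counter({('apple', 'banana'): 2}) -- duplicates inside a line are dropped first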
In [97]:
import numpy as np
# Top 200 co-occurring pairs; link weights are scaled by the largest count.
top_pairs = pairs.most_common(200)
nptp = np.array(top_pairs)
maxtp = np.max(nptp[:, 1])

nodes = {}
links = ["\"links\":["]
count = 0
for p in range(len(top_pairs)):
    # Assign each distinct word an integer node id on first sight.
    for i in range(2):
        if top_pairs[p][0][i] not in nodes:
            nodes[top_pairs[p][0][i]] = count
            count += 1
    link = ("{ \"source\":" + str(nodes[top_pairs[p][0][0]]) +
            ",\"target\":" + str(nodes[top_pairs[p][0][1]]) +
            ",\"value\":" + str(round(top_pairs[p][1] * 10 / maxtp)) + "}")
    links.append(link)
links = ",\n".join(links) + "\n]"
links = links.replace("[,", "[")

nodes = sorted(nodes.items(), key=lambda x: x[1])
nodes1 = ["\"nodes\":["]
for p in range(len(nodes)):
    nodes1.append("{ \"name\":\"" + nodes[p][0] + "\",\"group\":0}")
nodes1 = ",\n".join(nodes1) + "\n]"
nodes1 = nodes1.replace("[,", "[")
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\cooccur_word.json', 'w') as outfile:
    outfile.write("{\n" + nodes1 + ",\n" + links + "}\n")
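For reference, the file written above follows the nodes/links shape used by d3 force-directed graph examples; illustratively (the word names and numbers here are made up):

# {
#   "nodes":[ { "name":"wordA","group":0}, ... ],
#   "links":[ { "source":0,"target":1,"value":7}, ... ]
# }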
In [152]:
#Wordcloud:
from os import path
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
#d = path.dirname(__file__)
# Read the whole text.
#text = open(path.join(d, 'constitution.txt')).read()
textpos = tweets[tweets.sentiment_cat == 'Pos']
textneg = tweets[tweets.sentiment_cat == 'Neg']
# WordCloud.generate expects a single string, so preprocess each tweet and join.
textp = " ".join(" ".join(preprocess(t)) for t in textpos['text'])
textn = " ".join(" ".join(preprocess(t)) for t in textneg['text'])
wordcloudp = WordCloud(font_path='/Users/kunal/Library/Fonts/sans-serif.ttf',
                       stopwords=STOPWORDS,
                       background_color='white',
                       width=1200,
                       height=1000).generate(textp)
wordcloudn = WordCloud(font_path='/Users/kunal/Library/Fonts/sans-serif.ttf',
                       stopwords=STOPWORDS,
                       background_color='white',
                       width=1200,
                       height=1000).generate(textn)
plt.imshow(wordcloudp, interpolation='bilinear')
plt.axis("off")
plt.show()
plt.imshow(wordcloudn, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
import json
import sys
import pandas as pd
import zipcode
from textblob import TextBlob
from geopy.geocoders import Nominatim
#from random import shuffle

geolocator = Nominatim()
path = 'C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\'
tweetFile = 'WorldTweets-'
tweets_data = []
fileno = 1
tweets_data_path = path + tweetFile + str(fileno) + ".txt"
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except ValueError:
            continue
tweets = pd.DataFrame(index=range(len(tweets_data)),
                      columns=['text', 'lang', 'created_at', 'hour', 'retweet',
                               'country_code', 'state', 'sentiment', 'lat', 'lon'])
print("dataset initialized")
#count=0
#shuffle(data)
#tweets_data=data[:200]
for i in range(len(tweets_data)):
    try:
        tweets['text'][i] = tweets_data[i]['text']
    except KeyError:
        tweets['text'][i] = ""
    try:
        tweets['lang'][i] = tweets_data[i]['lang']
    except KeyError:
        tweets['lang'][i] = 'NA'
    try:
        tweets['retweet'][i] = '0' if "RT @" not in tweets['text'][i] else '1'
    except TypeError:
        tweets['retweet'][i] = 'NA'
    try:
        tweets['country_code'][i] = str(tweets_data[i]['place']['country_code']).upper()
    except (KeyError, TypeError):
        tweets['country_code'][i] = ''
    try:
        tweets['lon'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][0]
    except (KeyError, TypeError):
        tweets['lon'][i] = 'NA'
    try:
        tweets['lat'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][1]
    except (KeyError, TypeError):
        tweets['lat'][i] = 'NA'
    try:
        tweets['hour'][i] = tweets_data[i]['created_at'][11:13]
    except KeyError:
        tweets['hour'][i] = 'NA'
    try:
        tweets['created_at'][i] = tweets_data[i]['created_at']
    except KeyError:
        tweets['created_at'][i] = 'NA'
    blob = TextBlob(tweets['text'][i])
    try:
        sentence = blob.sentences[0]
        tweets['sentiment'][i] = sentence.sentiment.polarity
    except IndexError:
        tweets['sentiment'][i] = 0
    try:
        location = tweets_data[i]['user']['location']
    except (KeyError, TypeError):
        location = "NA"
    # Geocode the free-text location, then reverse-geocode to a country code;
    # geopy can fail in many ways (timeouts, empty results), hence the broad except.
    try:
        coor = geolocator.geocode(location).raw
        address = geolocator.reverse(coor['lat'] + "," + coor['lon']).raw
        country_code = str(address['address']['country_code']).upper()
    except:
        country_code = ""
    if len(tweets['country_code'][i]) <= 1:
        tweets['country_code'][i] = country_code
    try:
        stateFromData = location.split(',')[1].strip()
    except (AttributeError, IndexError):
        stateFromData = ''
    if len(stateFromData) == 2:
        tweets['state'][i] = stateFromData
    elif tweets['lat'][i] != 'NA':
        radius = 10
        incre = 10
        zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
        while len(zips) == 0:
            radius = radius + incre
            zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
            incre = incre + 10
        myzip = zipcode.isequal(str(zips[0].zip))
        tweets['state'][i] = myzip.state
    else:
        tweets['state'][i] = 'NA'
tweets.to_csv(path + "tweets-output/" + tweetFile + str(fileno) + ".csv", encoding='utf-8')
In [ ]:
#Sentiment matrix:
td1 = pd.DataFrame({'value': tweets.groupby(["created_at", "country_code"]).size()}).reset_index()
timedata = td1[td1.created_at != 'NA']