In [88]:
import json
import pandas as pd

path = 'C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\set 1\\'
tweetFile = 'Twitter_data2.txt'
statedata = pd.read_csv('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\states.csv')
In [89]:
tweets_data = []
tweets_data_path = path + tweetFile
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except ValueError:
            # skip truncated/malformed lines from the streaming capture
            continue
In [90]:
from textblob import TextBlob

tweets = pd.DataFrame(index=range(len(tweets_data)),
                      columns=['text', 'created_at', 'lang', 'retweeted', 'location', 'state',
                               'sentiment', 'sentiment_cat', 'country_code', 'lat', 'lon'])
# Pull each field out of the raw JSON; 'place' can be missing or null,
# hence the broader except clauses on the place-derived fields.
for i in range(len(tweets_data)):
    try:
        tweets['text'][i] = tweets_data[i]['text']
    except KeyError:
        tweets['text'][i] = ""
    try:
        tweets['lang'][i] = tweets_data[i]['lang']
    except KeyError:
        tweets['lang'][i] = 'NA'
    try:
        tweets['retweeted'][i] = tweets_data[i]['retweeted']
    except KeyError:
        tweets['retweeted'][i] = 'NA'
    try:
        tweets['location'][i] = tweets_data[i]['user']['location']
    except (KeyError, TypeError):
        tweets['location'][i] = 'NA'
    try:
        tweets['country_code'][i] = tweets_data[i]['place']['country_code']
    except (KeyError, TypeError):
        tweets['country_code'][i] = ''
    try:
        tweets['lon'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][0]
    except (KeyError, TypeError):
        tweets['lon'][i] = 'NA'
    try:
        tweets['lat'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][1]
    except (KeyError, TypeError):
        tweets['lat'][i] = 'NA'
    try:
        tweets['created_at'][i] = tweets_data[i]['created_at']
    except KeyError:
        tweets['created_at'][i] = 'NA'
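If the per-field try/except blocks grow unwieldy, the same extraction can be expressed with a small helper that walks a key path and returns a default on any miss. This is a sketch, not part of the original notebook; the helper name safe_get is ours.

def safe_get(d, keys, default='NA'):
    # Follow a nested key/index path, returning `default` on any miss.
    for k in keys:
        try:
            d = d[k]
        except (KeyError, TypeError, IndexError):
            return default
    return d

# e.g. tweets['location'][i] = safe_get(tweets_data[i], ['user', 'location'])
#      tweets['lon'][i] = safe_get(tweets_data[i], ['place', 'bounding_box', 'coordinates', 0, 0, 0])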
In [91]:
import time
import zipcode
start_time = time.time()
count = 0
for i in range(len(tweets)):
    # Polarity of the first sentence stands in for the whole tweet.
    blob = TextBlob(tweets['text'][i])
    try:
        sentence = blob.sentences[0]
        tweets['sentiment'][i] = sentence.sentiment.polarity
    except IndexError:
        tweets['sentiment'][i] = 0
    if tweets['sentiment'][i] < 0:
        tweets['sentiment_cat'][i] = 'Neg'
    elif tweets['sentiment'][i] > 0:
        tweets['sentiment_cat'][i] = 'Pos'
    else:
        tweets['sentiment_cat'][i] = 'Neu'
    # Prefer a two-letter state code from the user-supplied location string.
    try:
        stateFromData = tweets['location'][i].split(',')[1].strip()
    except (AttributeError, IndexError):
        stateFromData = ''
    if len(stateFromData) == 2:
        tweets['state'][i] = stateFromData.upper()
    elif tweets['lat'][i] != 'NA':
        # Fall back to the nearest zip code, widening the search radius until a hit.
        radius = 10
        incre = 10
        zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
        while len(zips) == 0:
            radius = radius + incre
            zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
            incre = incre + 10
        myzip = zipcode.isequal(str(zips[0].zip))
        tweets['state'][i] = myzip.state
    else:
        tweets['state'][i] = 'NA'
    count += 1
    if count % 1000 == 0:
        print(count, " Tweets processed")
print("--- %s seconds ---" % (time.time() - start_time))
tweetsFinal = pd.merge(tweets, statedata, how='left', left_on="state", right_on="Abbreviation")
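The per-row Neg/Pos/Neu assignment above can also be done in one vectorized step. A minimal sketch, assuming the sentiment column has already been filled and can be cast to float:

import numpy as np
s = tweets['sentiment'].astype(float)
# Nested np.where mirrors the if/elif/else above: <0 -> Neg, >0 -> Pos, else Neu.
tweets['sentiment_cat'] = np.where(s < 0, 'Neg', np.where(s > 0, 'Pos', 'Neu'))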
In [ ]:
import json
#a = tweets.groupby(['state'],as_index=False).count()
output = pd.DataFrame({'value': tweetsFinal.groupby(["State"]).size()}).reset_index()
outputJsonList = output.to_json(orient="records")
# Trim the wrapper, uppercase the state names, and rename the keys to the
# unquoted ucName/value form the visualization expects.
finalOutput = outputJsonList[33:len(outputJsonList) - 1].upper().replace("\"STATE\"", "ucName").replace("\"VALUE\"", "value")
print(finalOutput)
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\usStates-tweetCount.json', 'w') as outfile:
    outfile.write(finalOutput)
#-------------------------------------
# Spot-check the zipcode lookup on the first geotagged tweet.
zips = zipcode.isinradius((tweets['lat'][0], tweets['lon'][0]), radius)
print(zips[0].zip)
myzip = zipcode.isequal(str(81507))
print(myzip.statename)
#-------------------------------------------
output2 = pd.DataFrame({'value': tweetsFinal.groupby(["State", "sentiment_cat"]).size()}).reset_index()
output2.head()
#-------------------------------------------
import numpy as np
outData = pd.pivot_table(output2, values='value', index=['State'], columns=['sentiment_cat'], aggfunc=np.sum)
outData = outData.fillna(0)
outData.head()
#-------------------------------------------
outData['sum'] = outData[['Neg', 'Neu', 'Pos']].sum(axis=1)
outData['max'] = outData[['Neg', 'Neu', 'Pos']].idxmax(axis=1)
outData['maxFinal'] = outData['max']
#-------------------------------------------
# Encode the dominant sentiment as a numeric code for the map colouring.
for i in range(len(outData)):
    if outData['max'][i] == "Pos":
        outData['maxFinal'][i] = '1'
    elif outData['max'][i] == "Neu":
        outData['maxFinal'][i] = '-1'
    else:
        outData['maxFinal'][i] = '2'
outData
#-------------------------------------------
outData['state'] = outData.index
outData = outData.reset_index(drop=True)
#-------------------------------------------
d = "var data =[\n"
for i in range(len(outData)):
    row = outData.iloc[i]
    d += "['" + row['state'] + "'," + ",".join([str(x) for x in row[:5]]) + "],\n"
d += ']'
#-------------------------------------------
In [92]:
from time import strptime

td1 = pd.DataFrame({'value': tweets.groupby(["created_at"]).size()}).reset_index()
timedata = td1[td1.created_at != 'NA'].reset_index(drop=True)
data = ["var data=["]
# created_at looks like "Wed Mar 29 05:12:34 +0000 2017"; slice out the pieces
# and map the month abbreviation to its number via strptime.
for i in range(len(timedata)):
    created = timedata['created_at'][i]
    year = created[-4:]
    mon = strptime(created[4:7], '%b').tm_mon
    date = created[7:10]
    hour = created[10:13]
    minu = created[14:16]
    sec = created[17:20]
    value = timedata['value'][i]
    data.append("[Date.UTC(" + str(year) + "," + str(mon) + "," + str(date) + "," +
                str(hour) + "," + str(minu) + "," + str(sec) + ")," + str(value) + "]")
data = ",\n".join(data) + "\n]"
data = data.replace("[,", "[")
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\tweet_cnt.json', 'w') as outfile:
    outfile.write(data)
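Since Twitter's created_at strings follow one fixed layout ("Wed Mar 29 05:12:34 +0000 2017"), the slicing above could also be replaced wholesale by a pandas parse. A sketch, assuming a pandas version whose to_datetime supports %z:

ts = pd.to_datetime(timedata['created_at'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
# NaT marks any row that failed to parse (e.g. a leftover 'NA' placeholder).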
In [93]:
# Tokenizer:
import re
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english') + ['and']
emoticons_str = r"""
    (?:
        [:=;]              # Eyes
        [oO\-]?            # Nose (optional)
        [D\)\]\(\]/\\OpP]  # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    tokens = tokens_re.findall(s)
    # drop URLs, single characters, and stopwords
    return [x for x in tokens if 'http' not in x and len(x) > 1 and x.lower() not in stop]

def preprocess(s, lowercase=True):
    tokens = tokenize(s)
    if lowercase:
        # lowercase everything except emoticons, whose characters are meaningful as-is
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
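A quick sanity check of the tokenizer on a made-up tweet (the sample text and expected output are ours, assuming NLTK's English stopword list):

sample = "RT @user: Loving the new #dataviz tools :) http://example.com 100%"
print(preprocess(sample))
# roughly: ['rt', '@user', 'loving', 'new', '#dataviz', 'tools', ':)', '100']
# the URL, single characters, and stopwords ('the') are filtered out;
# the emoticon survives un-lowercased.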
In [94]:
#Co-occurrence:
from itertools import combinations
from collections import Counter

def collect_pairs(lines):
    pair_counter = Counter()
    for line in lines:
        # Dedupe within a line and sort so each pair has one canonical order.
        unique_tokens = sorted(set(line))
        combos = combinations(unique_tokens, 2)
        pair_counter += Counter(combos)
    return pair_counter

t2 = []
t1 = tweets['text']
for t in range(len(t1)):
    t2.append(preprocess(t1[t]))
pairs = collect_pairs(t2)
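collect_pairs on a toy input, to make the counting concrete (example data is ours):

demo = [['apple', 'banana', 'apple'], ['banana', 'apple']]
print(collect_pairs(demo))
# Counter({('apple', 'banana'): 2}) -- duplicates inside a line are dropped first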
In [97]:
import numpy as np
# Top 200 co-occurring pairs; link weights are scaled by the largest count.
top_pairs = pairs.most_common(200)
nptp = np.array(top_pairs)
maxtp = np.max(nptp[:, 1])

nodes = {}
links = ["\"links\":["]
count = 0
for p in range(len(top_pairs)):
    # Assign each distinct word an integer node id on first sight.
    for i in range(2):
        if top_pairs[p][0][i] not in nodes:
            nodes[top_pairs[p][0][i]] = count
            count += 1
    link = ("{ \"source\":" + str(nodes[top_pairs[p][0][0]]) +
            ",\"target\":" + str(nodes[top_pairs[p][0][1]]) +
            ",\"value\":" + str(round(top_pairs[p][1] * 10 / maxtp)) + "}")
    links.append(link)
links = ",\n".join(links) + "\n]"
links = links.replace("[,", "[")

nodes = sorted(nodes.items(), key=lambda x: x[1])
nodes1 = ["\"nodes\":["]
for p in range(len(nodes)):
    nodes1.append("{ \"name\":\"" + nodes[p][0] + "\",\"group\":0}")
nodes1 = ",\n".join(nodes1) + "\n]"
nodes1 = nodes1.replace("[,", "[")
with open('C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\3. Twitter Streaming\\cooccur_word.json', 'w') as outfile:
    outfile.write("{\n" + nodes1 + ",\n" + links + "}\n")
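For reference, the file written above follows the nodes/links shape used by d3 force-directed graph examples; illustratively (the word names and numbers here are made up):

# {
#   "nodes":[ { "name":"wordA","group":0}, ... ],
#   "links":[ { "source":0,"target":1,"value":7}, ... ]
# }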
In [152]:
#Wordcloud:
from os import path
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
#d = path.dirname(__file__)
# Read the whole text.
#text = open(path.join(d, 'constitution.txt')).read()
textpos = tweets[tweets.sentiment_cat == 'Pos']
textneg = tweets[tweets.sentiment_cat == 'Neg']
# WordCloud.generate expects a single string, so preprocess each tweet and join.
textp = " ".join(" ".join(preprocess(t)) for t in textpos['text'])
textn = " ".join(" ".join(preprocess(t)) for t in textneg['text'])
wordcloudp = WordCloud(font_path='/Users/kunal/Library/Fonts/sans-serif.ttf',
                       stopwords=STOPWORDS,
                       background_color='white',
                       width=1200,
                       height=1000).generate(textp)
wordcloudn = WordCloud(font_path='/Users/kunal/Library/Fonts/sans-serif.ttf',
                       stopwords=STOPWORDS,
                       background_color='white',
                       width=1200,
                       height=1000).generate(textn)
plt.imshow(wordcloudp, interpolation='bilinear')
plt.axis("off")
plt.show()
plt.imshow(wordcloudn, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
import json
import sys
import pandas as pd
import zipcode
from textblob import TextBlob
from geopy.geocoders import Nominatim
#from random import shuffle

geolocator = Nominatim()
path = 'C:\\Users\\Shahidhya\\OneDrive\\I524 Project\\'
tweetFile = 'WorldTweets-'
tweets_data = []
fileno = 1
tweets_data_path = path + tweetFile + str(fileno) + ".txt"
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except ValueError:
            continue
tweets = pd.DataFrame(index=range(len(tweets_data)),
                      columns=['text', 'lang', 'created_at', 'hour', 'retweet',
                               'country_code', 'state', 'sentiment', 'lat', 'lon'])
print("dataset initialized")
#count=0
#shuffle(data)
#tweets_data=data[:200]
for i in range(len(tweets_data)):
    try:
        tweets['text'][i] = tweets_data[i]['text']
    except KeyError:
        tweets['text'][i] = ""
    try:
        tweets['lang'][i] = tweets_data[i]['lang']
    except KeyError:
        tweets['lang'][i] = 'NA'
    try:
        tweets['retweet'][i] = '0' if "RT @" not in tweets['text'][i] else '1'
    except TypeError:
        tweets['retweet'][i] = 'NA'
    try:
        tweets['country_code'][i] = str(tweets_data[i]['place']['country_code']).upper()
    except (KeyError, TypeError):
        tweets['country_code'][i] = ''
    try:
        tweets['lon'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][0]
    except (KeyError, TypeError):
        tweets['lon'][i] = 'NA'
    try:
        tweets['lat'][i] = tweets_data[i]['place']['bounding_box']['coordinates'][0][0][1]
    except (KeyError, TypeError):
        tweets['lat'][i] = 'NA'
    try:
        tweets['hour'][i] = tweets_data[i]['created_at'][11:13]
    except KeyError:
        tweets['hour'][i] = 'NA'
    try:
        tweets['created_at'][i] = tweets_data[i]['created_at']
    except KeyError:
        tweets['created_at'][i] = 'NA'
    blob = TextBlob(tweets['text'][i])
    try:
        sentence = blob.sentences[0]
        tweets['sentiment'][i] = sentence.sentiment.polarity
    except IndexError:
        tweets['sentiment'][i] = 0
    try:
        location = tweets_data[i]['user']['location']
    except (KeyError, TypeError):
        location = "NA"
    # Geocode the free-text location, then reverse-geocode to a country code;
    # geopy can fail in many ways (timeouts, empty results), hence the broad except.
    try:
        coor = geolocator.geocode(location).raw
        address = geolocator.reverse(coor['lat'] + "," + coor['lon']).raw
        country_code = str(address['address']['country_code']).upper()
    except:
        country_code = ""
    if len(tweets['country_code'][i]) <= 1:
        tweets['country_code'][i] = country_code
    try:
        stateFromData = location.split(',')[1].strip()
    except (AttributeError, IndexError):
        stateFromData = ''
    if len(stateFromData) == 2:
        tweets['state'][i] = stateFromData
    elif tweets['lat'][i] != 'NA':
        radius = 10
        incre = 10
        zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
        while len(zips) == 0:
            radius = radius + incre
            zips = zipcode.isinradius((tweets['lat'][i], tweets['lon'][i]), radius)
            incre = incre + 10
        myzip = zipcode.isequal(str(zips[0].zip))
        tweets['state'][i] = myzip.state
    else:
        tweets['state'][i] = 'NA'
tweets.to_csv(path + "tweets-output/" + tweetFile + str(fileno) + ".csv", encoding='utf-8')
In [ ]:
#Sentiment matrix:
td1 = pd.DataFrame({'value': tweets.groupby(["created_at", "country_code"]).size()}).reset_index()
timedata = td1[td1.created_at != 'NA']