In [2]:
import csv
import datetime

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import thirdparty.tsearch.TwitterScraper as ts
In [7]:
# Twitter search terms (used to build queries) and broader keyword lists
# for each 2016 presidential-primary candidate.
bernie_search = ['Bernie', 'Sanders', 'BernieSanders']
rubio_search = ['Marco', 'Rubio', 'marcorubio']
cruz_search = ['Ted', 'Cruz', 'tedcruz']
clinton_search = ['Hillary', 'Clinton', 'HillaryClinton']
trump_search = ['Donald', 'Trump', 'DonaldTrump']

# Keyword variants include campaign slogans (quoted for exact-phrase search).
bernie_keywords = ['Bernie', 'Sanders', 'BernieSanders', 'Feel the Bern', 'Bern', 'feelthebern']
rubio_keywords = ['Marco', 'Rubio', 'marcorubio', '"A New American Century"']
cruz_keywords = ['Ted', 'Cruz', 'tedcruz', 'CruzCrew', '"Together, we will win"']
clinton_keywords = ['Hillary', 'Clinton', 'HillaryClinton', '"I\'m with her"', '"Stronger Together"', 'imwithher']
trump_keywords = ['Donald', 'Trump', 'DonaldTrump', 'NeverTrump', '"Make America Great Again"', 'maga']

# Candidates grouped by primary: list of (display name, search-term list) pairs.
primaries = {
    'Democrat': [
        ('Clinton', clinton_search),
        ('Sanders', bernie_search),
    ],
    'Republican': [
        ('Rubio', rubio_search),
        ('Cruz', cruz_search),
        ('Trump', trump_search),
    ],
}
In [8]:
# Build state_dict[state][party] = (latitude, longitude, radius, after, until)
# from the per-state geo/date CSV.
# Fix: the original cell called csv.reader without ever importing csv, which
# raises NameError on a fresh kernel (import csv is now in the imports cell).
state_dict = dict()
with open('data/raw_calc_lat_lon.csv', 'r') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        if row[1] == 'latitude':
            # Header row — skip it.
            continue
        # Columns: state, latitude, longitude, radius, date, party, after, until.
        # The 'date' column is read but intentionally unused here.
        state, latitude, longitude, radius, _date, party, after, until = row[:8]
        # Lat/lon/radius are kept as strings; they are converted to float at use time.
        state_dict.setdefault(state, dict())[party] = (latitude, longitude, radius, after, until)
In [11]:
# Download tweets for every (state, primary, candidate) combination using the
# geo-fenced query windows computed in state_dict.
qs_list = []
tweets = dict()
for state in state_dict.keys():
    tweets[state] = dict()
    for primary, candidates in primaries.items():
        tweets[state][primary] = dict()
        for cand_name, search_terms in candidates:
            lat, lon, rad, start_str, end_str = state_dict[state][primary]
            # Restrict the search to a circle around the state's centroid.
            geo = ' geocode:%.6f,%.6f,%dkm' % (float(lat), float(lon), float(rad))
            query_string = " OR ".join(search_terms) + geo
            start_dt = datetime.datetime.strptime(start_str, '%m/%d/%y')
            end_dt = datetime.datetime.strptime(end_str, '%m/%d/%y')
            slicer = ts.TwitterSlicer(0, 5, start_dt, end_dt, 5)
            slicer.search(query_string)
            tweets[state][primary][cand_name] = slicer.tweets
            print('%s %s %s: Downloaded %d tweets.' % (state, primary, cand_name, slicer.counter))
In [36]:
# Print the total tweet count per state, one "<count>," line per state
# (comma-suffixed for easy pasting into a spreadsheet/CSV).
for state in tweets.keys():
    total = sum(
        len(candidate_tweets)
        for party_dict in tweets[state].values()
        for candidate_tweets in party_dict.values()
    )
    print(str(total) + ',')
In [19]:
# VADER sentiment analyzer; requires the nltk 'vader_lexicon' resource
# (nltk.download('vader_lexicon')) to have been downloaded beforehand.
sid = SentimentIntensityAnalyzer()
In [30]:
# Empty results frame; per-tweet score columns ('t0', 't1', ...) are appended
# dynamically by the scoring loop below.
df = pd.DataFrame(columns=['State', 'Primary', 'Candidate'])
In [31]:
# Score every tweet with VADER: one DataFrame row per (state, party, candidate),
# with each tweet's compound polarity stored in columns 't0', 't1', ...
# Fix: DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
# .at performs the same scalar setting-with-enlargement.
i = 0
for state in tweets:
    for party in tweets[state]:
        for candidate in tweets[state][party]:
            df.at[i, 'State'] = state
            df.at[i, 'Primary'] = party
            df.at[i, 'Candidate'] = candidate
            for k, tweet in enumerate(tweets[state][party][candidate]):
                # str() guards against non-string 'text' payloads — TODO confirm
                # the tweet dict schema against the scraper's output.
                score = sid.polarity_scores(str(tweet['text']))['compound']
                df.at[i, 't' + str(k)] = score
            i += 1
In [39]:
# Preview the first 10 rows of the sentiment-score frame.
df[:10]
Out[39]:
In [38]:
# Persist the per-candidate sentiment scores for downstream analysis.
df.to_csv('data/twitter/twitter_sentiment_scores.csv')