For sentiment analysis, we will use the VADER library.
In [ ]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single VADER analyzer instance; reused to score every tweet further down.
analyzer = SentimentIntensityAnalyzer()
import pandas as pd
# Show full cell contents (tweets are long). None is the supported way to
# disable truncation; the older -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
We can get the tweets to analyse by reading the `text` column from our metoo dataset. We also read the `created_at` column, which holds the dates.
In [ ]:
# Load the full tweet backup. DataFrame.from_csv no longer exists in pandas;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column becomes the index, and the index is parsed as dates).
df = pd.read_csv("metoo_full_backup_3M.csv", index_col=0, parse_dates=True)
In [ ]:
# Pull out the tweet texts and their timestamps as separate Series.
sentences = df['text']
dates = df['created_at']
To skip some text cleaning and filtering steps here, we re-read the tweets from a pre-processed file.
In [ ]:
# Replace `sentences` with the pre-processed tweets, one per line (trailing
# newlines are kept, as before). The with-block makes sure the file handle
# is closed once the lines have been read.
with open('metoo_tweets.txt', 'r') as tweets_file:
    sentences = tweets_file.readlines()
Next, use VADER to calculate sentiment scores.
In [ ]:
# Score every tweet with VADER, collecting one score dictionary per tweet.
numdocs = len(sentences)
vader_scores = []
for c, sentence in enumerate(sentences):
    vader_scores.append(analyzer.polarity_scores(sentence))
    if c % 1000 == 0:
        # "\r" moves back to the start of the line so the progress counter
        # overwrites itself instead of scrolling.
        print("\r{}/{}".format(c, numdocs), end="")
Iterate over the `sentences` list and the `vader_scores` list in parallel, so that each sentence can be added to the dictionary of its scores under the `text` key.
In [ ]:
# Store each tweet's text inside its own score dictionary.
for score_dict, sentence in zip(vader_scores, sentences):
    score_dict['text'] = sentence
In [ ]:
# Store each tweet's timestamp inside its score dictionary.
# NOTE(review): zip() stops at the shorter input, so this assumes `dates`
# lines up one-to-one with `vader_scores` -- confirm.
for score_dict, date in zip(vader_scores, dates):
    score_dict['created_at'] = date
Now `vader_scores` is a list of dictionaries with scores, sentences and dates. We write it to a pandas dataframe.
In [ ]:
# Build a frame with every VADER score, ordered from most negative to most
# positive compound score, and show the seven most negative tweets.
all_scores = pd.DataFrame(vader_scores)
vader_df = all_scores[['text', 'created_at', 'compound', 'neg', 'neu', 'pos']]
vader_df = vader_df.sort_values('compound', ascending=True)
vader_df.head(7)
In [ ]:
# Slimmer frame used by the steps that follow. 'neg' and 'pos' have to stay
# in the selection: they are copied onto df further down, and leaving them
# out here made that lookup fail with a KeyError.
vader_df = pd.DataFrame(vader_scores)[['text', 'compound', 'created_at', 'neg', 'pos']]
vader_df = vader_df.sort_values('compound', ascending=True)
vader_df.head(7)
In [ ]:
# Put both frames in the same text order with a fresh 0..n-1 index, so rows
# can be matched by position afterwards.
# NOTE(review): this assumes sorting by text yields the same row order in
# both frames -- confirm.
df = df.sort_values(by="text").reset_index(drop=True)
vader_df = vader_df.sort_values(by="text").reset_index(drop=True)
df.head()
In [ ]:
# Inspect the last rows of the text-sorted df.
df.tail()
In [ ]:
# Check the column dtypes of vader_df before filtering on created_at.
vader_df.dtypes
In [ ]:
# Select the rows of df whose matching vader_df timestamp falls strictly
# between 2017-10-28 and 2017-10-31, rank them by sentiment and save them.
# NOTE(review): the "sentiment" column is not created anywhere in this
# file; presumably it comes from the loaded CSV -- confirm.
created = vader_df['created_at']
firstday = (created > '2017-10-28') & (created < '2017-10-31')
firstday_df = df[firstday].sort_values(by="sentiment", ascending=False)
firstday_df.to_csv("firstday.csv")
firstday_df
In [ ]:
# Re-order the subset chronologically (ascending is the default) and show it.
firstday_df = firstday_df.sort_values("created_at")
firstday_df
In [ ]:
# Inspect the column dtypes of the date-window subset.
firstday_df.dtypes
In [ ]:
# Peek at the first rows of vader_df.
vader_df.head()
In [ ]:
# Peek at the last rows of vader_df.
vader_df.tail()
In [ ]:
# Copy the negative and positive VADER scores onto df (matched on index).
for score_col in ('neg', 'pos'):
    df[score_col] = vader_df[score_col]
In [ ]:
# Keep vader_df's text next to df's own text column for comparison.
df['text2'] = vader_df['text']
In [ ]:
# Check that the neg, pos and text2 columns were added to df.
df.head()
In [ ]:
# Write df, including the added neg, pos and text2 columns, to disk.
df.to_csv("sentiment_dataframe.csv")
In [ ]:
# Timestamp plus negative/positive scores only, in chronological order with
# a fresh 0..n-1 index.
sentiments = (
    df[['created_at', 'neg', 'pos']]
    .sort_values(by="created_at")
    .reset_index(drop=True)
)
sentiments.head()
In [ ]:
# Show the last rows of the date-sorted sentiments frame.
sentiments.tail()
In [ ]:
# Convert created_at to real datetimes so the .dt accessor can be used.
sentiments = sentiments.assign(created_at=pd.to_datetime(sentiments['created_at']))
In [ ]:
# Group the tweets by the calendar date of their timestamp.
day_key = sentiments['created_at'].dt.date
groups = sentiments.groupby([day_key])
In [ ]:
# Collect one entry per calendar day: the day label plus the mean positive
# and mean negative VADER score of that day's tweets.
daycol = []
posmeancol = []
negmeancol = []
for name, group in groups:
    posmeancol.append(group.pos.mean())
    negmeancol.append(group.neg.mean())
    # Label the day from the first timestamp's date part. Slicing nine
    # characters off the timestamp's string form only gives YYYY-MM-DD when
    # it prints as exactly "YYYY-MM-DD HH:MM:SS" (no timezone offset, no
    # fractional seconds).
    date = group.created_at.tolist()[0]
    daycol.append(str(date.date()))
In [ ]:
# Turn the three per-day lists into Series and combine them into one frame
# with columns day / posmean / negmean.
daycol = pd.Series(daycol)
posmeancol = pd.Series(posmeancol)
negmeancol = pd.Series(negmeancol)
sentdata = pd.DataFrame({'day': daycol, 'posmean': posmeancol, 'negmean': negmeancol})
In [ ]:
import matplotlib.pyplot as plt
from matplotlib import dates, pyplot
import matplotlib.ticker as ticker
%matplotlib inline
# Line chart of the daily mean positive and negative scores, saved as a PDF.

# Create a new figure
plt.figure(figsize=(10, 6), dpi=100)

# Define x: one position per day in sentdata. The length is taken from the
# data rather than hard-coded, so x and y always have matching sizes.
x = range(len(sentdata))
xn = range(len(x))  # numeric tick positions 0..n-1
plt.xticks(xn, x)  # name the ticks with the day number

# What columns to plot
pos = sentdata.posmean
neg = sentdata.negmean

# Plot them
plt.plot(xn, pos, color="gray", linewidth=2.0, linestyle=":", label="Positive scores")
plt.plot(xn, neg, color="black", linewidth=2.0, linestyle="-", label="Negative scores")
plt.legend(loc='upper left', frameon=False)

# Set axis ranges
plt.xlim(1, 60)
plt.ylim(0, 0.2)

# Label orientation and size
plt.xticks(rotation=0, fontsize=8)
plt.yticks(rotation=0, fontsize=8)

# Tweak axes more
ax = plt.gca()  # get current axes in the plot
tick_labels = ax.xaxis.get_ticklabels()

# Hide all x labels, then show only every 7th one
for label in tick_labels:
    label.set_visible(False)
for label in tick_labels[::7]:
    label.set_visible(True)

# Also show the label at index 1: the x range starts at 1, so this is the
# first label inside the visible range. Guarded for very short series.
if len(tick_labels) > 1:
    tick_labels[1].set_visible(True)

plt.ylabel("Mean weighted normalised composite score")
plt.xlabel("Day")
plt.savefig('sentiments.pdf')
In [ ]:
# Keep only the days whose label sorts after 2017-10-25 (string comparison).
sentdata_beginning = sentdata[sentdata['day'] > '2017-10-25']
In [ ]:
# Area chart of the daily means for the days after 2017-10-25.
sentdata_beginning.plot(kind='area')
In [ ]:
# Split the tweets into three non-overlapping periods using half-open
# ranges [start, next month start). A bare date string compares as midnight
# of that day, so the previous "> '2017-10-31'" / "<= '2017-11-30'" bounds
# put 31 October into both october and november, and pushed 30 November
# into december.
october = sentiments[sentiments.created_at < '2017-11-01']
november = sentiments[(sentiments.created_at >= '2017-11-01') & (sentiments.created_at < '2017-12-01')]
december = sentiments[sentiments.created_at >= '2017-12-01']
In [ ]:
import seaborn as sns
In [ ]:
%matplotlib inline
# Horizontal violin plot of the October subset with an inner box plot;
# bw=.03 is the bandwidth setting handed to seaborn.
sns.violinplot(data=october, inner="box", orient = "h", bw=.03)
#sns.violinplot(data=corr_df, palette="Set3", bw=.2, cut=1, linewidth=1)
In [ ]:
%matplotlib inline
# Horizontal violin plot of the November subset, same settings as October.
sns.violinplot(data=november, inner="box", orient="h", bw=.03)
In [ ]:
%matplotlib inline
# Horizontal violin plot of the December subset, same settings as October.
sns.violinplot(data=december, inner="box", orient = "h", bw=.03)
In [ ]:
# Small test sample of the first 100 rows. Take an explicit copy so that
# adding a column does not write into a slice of `sentiments`.
dta = sentiments.head(100).copy()  # testdata
dta['item'] = dta.index  # row position, used as the x axis when plotting
dta.head()
In [ ]:
# Scatter joint plot of a score against row position for the test sample.
# NOTE(review): dta is built from sentiments, whose columns as selected
# earlier in this file are created_at, neg and pos (plus item) -- there is
# no "sentiment" column, so y="sentiment" looks stale; confirm which score
# column (neg or pos) was intended.
hexbin = sns.jointplot(x="item", y="sentiment", data=dta, kind="scatter")
#bins='log', cmap='inferno'