Simon #metoo step 4

For sentiment analysis, we will use the VADER library.


In [ ]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared VADER analyzer instance; the scoring loop below calls
# analyzer.polarity_scores() on every tweet.
analyzer = SentimentIntensityAnalyzer()

import pandas as pd
# Show full cell contents (tweet text) without truncation.
# None means "no limit"; the older -1 sentinel was deprecated in pandas 1.0
# and is no longer accepted by current pandas releases.
pd.set_option('display.max_colwidth', None)

We can get the tweets to analyse by reading the text column from our metoo dataset. We also read the dates column.


In [ ]:
# Load the full #metoo dataset.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column becomes the index, and date parsing is attempted on the index).
df = pd.read_csv("metoo_full_backup_3M.csv", index_col=0, parse_dates=True)

In [ ]:
# Pull out the two columns used downstream: tweet text and creation date.
sentences = df["text"]
dates = df["created_at"]

To skip some text cleaning and filtering steps here, we re-read the tweets from a pre-processed file.


In [ ]:
# Replace `sentences` with the pre-processed tweets, one per line.
# readlines() keeps the trailing newline on each line, as before.
# The context manager closes the file handle instead of leaking it.
with open('metoo_tweets.txt', 'r') as tweets_file:
    sentences = tweets_file.readlines()

Next, use VADER to calculate sentiment scores.


In [ ]:
# Score every tweet with VADER, collecting one polarity dict per sentence.
numdocs = len(sentences)
vader_scores = []
for c, sentence in enumerate(sentences):
    vader_scores.append(analyzer.polarity_scores(sentence))
    # Rewrite a single progress line every 1000 documents.
    if not c % 1000:
        print("\r" + str(c) + "/" + str(numdocs), end="")

Iterate over the sentences list and the vader_scores list in parallel, so that each sentence can be stored under the key 'text' in the dictionary holding its scores.


In [ ]:
# Store each tweet's text in its score dict under the 'text' key.
# zip() pairs items by position and stops at the shorter input.
for score_dict, sentence in zip(vader_scores, sentences):
    score_dict.update(text=sentence)

In [ ]:
# Store the creation date in each score dict under the 'created_at' key.
# Pairing is purely positional.
# NOTE(review): this presumes `dates` has the same order and length as the
# scored sentences — confirm against the pre-processed tweet file.
for score_dict, date in zip(vader_scores, dates):
    score_dict.update(created_at=date)

Now vader_scores is a list of dictionaries, each holding the scores, the sentence and the date of one tweet. We load it into a pandas dataframe.


In [ ]:
# Build the score table with the text column first, then order the rows from
# the lowest to the highest compound score.
vader_df = (
    pd.DataFrame(vader_scores)
    .loc[:, ['text', 'created_at', 'compound', 'neg', 'neu', 'pos']]
    .sort_values(by='compound')
)
vader_df.head(n=7)

In [ ]:
# Compact view of the score table: text, compound score and date only,
# showing the seven rows with the lowest compound score.
# vader_df itself is deliberately NOT reassigned here: rebuilding it with only
# these three columns would drop 'neg' and 'pos', which later cells read from
# vader_df, and a fresh top-to-bottom run would fail with a KeyError.
vader_df[['text', 'compound', 'created_at']].sort_values('compound', ascending=True).head(7)

In [ ]:
# Put both frames in the same text order and give each a fresh 0..n-1 index,
# so later cells can match rows of df and vader_df by position.
# NOTE(review): this only lines up if both 'text' columns sort identically — confirm.
df = df.sort_values(by="text").reset_index(drop=True)
vader_df = vader_df.sort_values(by="text").reset_index(drop=True)


df.head()

In [ ]:
# Last five rows of df after sorting by text.
df.tail(n=5)

In [ ]:
# Inspect the column dtypes of vader_df (the next cell compares 'created_at'
# against date strings).
vader_df.dtypes

In [ ]:
# Boolean mask for rows dated strictly between 2017-10-28 and 2017-10-31.
# The mask is computed on vader_df but applied to df.
firstday = vader_df['created_at'].gt('2017-10-28') & vader_df['created_at'].lt('2017-10-31')
# NOTE(review): presumes df has a 'sentiment' column from the source CSV — confirm.
firstday_df = df[firstday].sort_values(by="sentiment", ascending=False)
firstday_df.to_csv("firstday.csv")

firstday_df

In [ ]:
# Re-order the selection chronologically (ascending is the default).
firstday_df = firstday_df.sort_values("created_at")

firstday_df

In [ ]:
# Inspect the column dtypes of the date-filtered selection.
firstday_df.dtypes

In [ ]:
# First five rows of vader_df.
vader_df.head(n=5)

In [ ]:
# Last five rows of vader_df.
vader_df.tail(n=5)

In [ ]:
# Copy the negative and positive scores from vader_df into df.
# Column assignment aligns on the index, which both frames share after the
# earlier reset_index(drop=True).
for score_col in ('neg', 'pos'):
    df[score_col] = vader_df[score_col]

In [ ]:
# Keep the scored text next to df's own 'text' column so the row matching
# can be checked by eye.
df['text2'] = vader_df['text']

In [ ]:
# First five rows of df, now including the neg, pos and text2 columns.
df.head(n=5)

In [ ]:
# Write df, including the added score columns, to a CSV file.
df.to_csv("sentiment_dataframe.csv")

In [ ]:
# Keep only the date and the two scores, ordered by date and re-indexed from 0.
sentiments = (
    df[['created_at', 'neg', 'pos']]
    .sort_values(by="created_at")
    .reset_index(drop=True)
)
sentiments.head()

In [ ]:
# Last five rows of the date-ordered score table.
sentiments.tail(n=5)

In [ ]:
# Convert 'created_at' to datetime so the .dt accessor can be used for grouping.
sentiments['created_at'] = pd.to_datetime(sentiments.created_at)

In [ ]:
# Group the rows by calendar day (the date part of 'created_at').
groups = sentiments.groupby(by=[sentiments.created_at.dt.date])

In [ ]:
# Per-day aggregates: each list gets one entry per calendar day in `groups`.
daycol = []
posmeancol = []
negmeancol = []

for name, group in groups:
    posmeancol.append(group['pos'].mean())
    negmeancol.append(group['neg'].mean())
    # Label the day as 'YYYY-MM-DD', taken from the first timestamp in the group.
    # .date().isoformat() replaces slicing the last nine characters off
    # str(timestamp), which gives a wrong label when the timestamp carries a
    # UTC offset or fractional seconds.
    date = group['created_at'].iloc[0]
    daycol.append(date.date().isoformat())

In [ ]:
# Turn the three per-day lists into Series and place them side by side as the
# columns 'day', 'posmean' and 'negmean'.
daycol, posmeancol, negmeancol = map(pd.Series, (daycol, posmeancol, negmeancol))
sentdata = pd.concat([daycol, posmeancol, negmeancol], axis=1)
sentdata.columns = ['day', 'posmean', 'negmean']

In [ ]:
import matplotlib.pyplot as plt
from matplotlib import dates, pyplot
import matplotlib.ticker as ticker
%matplotlib inline

# Create a new figure
plt.figure(figsize=(10,6), dpi=100)

# Define x
#x = sentdata.day.tolist() # the day col in list-of-strings format
x = range(68)
xn = range(len(x)) # make it numerical 1 to n
plt.xticks(xn, x) # name the ticks

# What columns to plot
pos = sentdata.posmean
neg = sentdata.negmean

# Plot them
plt.plot(xn, pos, color="gray", linewidth=2.0, linestyle=":", label="Positive scores")
plt.plot(xn, neg, color="black", linewidth=2.0, linestyle="-", label = "Negative scores")

plt.legend(loc='upper left', frameon=False)

# Set axis ranges
plt.xlim(1,60)
plt.ylim(0,0.2)

# Label orientation and size
plt.xticks(rotation=0, fontsize = 8)
plt.yticks(rotation=0, fontsize = 8)

# Tweak axes more
ax = plt.gca() # get current axes in the plot

# Loop over the x labels and hide all
for label in ax.xaxis.get_ticklabels():
    label.set_visible(False)

# Loop over every nth x label and set it to visible
for label in ax.xaxis.get_ticklabels()[::7]:
    label.set_visible(True)
    
# Also, set the very first label to visible
ax.xaxis.get_ticklabels()[1].set_visible(True)

plt.ylabel("Mean weighted normalised composite score")
plt.xlabel("Day")

plt.savefig('sentiments.pdf')

In [ ]:
# Keep the days whose 'YYYY-MM-DD' label sorts after 2017-10-25 (string comparison).
sentdata_beginning = sentdata.loc[sentdata['day'] > '2017-10-25']

In [ ]:
# Area plot of the daily means for the selected period.
sentdata_beginning.plot.area()

In [ ]:
# Split the scores into three non-overlapping periods using half-open date
# ranges. 'created_at' is a datetime column at this point, so a bare date
# string means midnight at the start of that day. The previous bounds
# (> '2017-10-31', <= '2017-11-30', > '2017-11-30') therefore put Oct 31 in
# both october and november, and nearly all of Nov 30 in december.
october = sentiments[sentiments.created_at < '2017-11-01']
november = sentiments[(sentiments.created_at >= '2017-11-01') & (sentiments.created_at < '2017-12-01')]
december = sentiments[sentiments.created_at >= '2017-12-01']

In [ ]:
import seaborn as sns

In [ ]:
%matplotlib inline
# Horizontal violin plot of the October subset with an inner box plot;
# bw=.03 is passed as the kernel bandwidth setting.
sns.violinplot(data=october, inner="box", orient = "h", bw=.03)
# Alternative styling kept for reference (refers to a corr_df not defined in this notebook):
#sns.violinplot(data=corr_df, palette="Set3", bw=.2, cut=1, linewidth=1)

In [ ]:
%matplotlib inline
# Horizontal violin plot of the November subset, same settings as for October.
sns.violinplot(data=november, inner="box", orient="h", bw=.03)

In [ ]:
%matplotlib inline
# Horizontal violin plot of the December subset, same settings as for October.
sns.violinplot(data=december, inner="box", orient = "h", bw=.03)

In [ ]:
# Small test sample: the first 100 rows of the date-ordered scores.
# head() returns a slice of `sentiments`; the explicit copy makes the added
# column a write to an independent frame and avoids SettingWithCopyWarning.
dta = sentiments.head(100).copy() # testdata
# Expose the row position as a regular column so it can be used as a plot axis.
dta['item'] = dta.index
dta.head()

In [ ]:
# Joint scatter plot of one sentiment score against row position.
# dta has no 'sentiment' column (only created_at, neg, pos and item), so the
# original y="sentiment" could not be resolved; the negative score is plotted.
# NOTE(review): switch y to "pos" if the positive score was intended.
hexbin = sns.jointplot(x="item", y="neg", data=dta, kind="scatter")

#bins='log', cmap='inferno'