In [1]:
%matplotlib inline
Code for finalizing the model data
Author: Jimmy Charité
Email: jimmy.charite@gmail.com
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import pysentiment
from textstat.textstat import textstat
from wordcloud import WordCloud
import nltk
import statsmodels.formula.api as smf
import statsmodels.api as sm
In [3]:
# Move from the notebook directory up to the project root so the relative
# './clean_data' and './plots' paths below resolve.
# Fix: os.chdir always returns None, so the old `retval=` assignment
# captured nothing useful and was misleading — call it for its side effect.
os.chdir("..")
In [4]:
def pd_tab(df, col, sort_by='count', asc=False):
    """One-way frequency table for a DataFrame column.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    col : str
        Column to tabulate; NaN values are counted as their own category.
    sort_by : str
        Column of the result to sort by ('count', 'percent', or `col`).
    asc : bool
        Sort ascending when True; descending (the default) otherwise.

    Returns
    -------
    pd.DataFrame with columns [col, 'count', 'percent'].
    """
    freq = df[col].value_counts(dropna=False)
    tab = freq.reset_index(name='count')
    tab.columns = [col, 'count']
    total = tab['count'].sum()
    tab['percent'] = tab['count'] / total
    # Return a sorted copy instead of mutating in place.
    return tab.sort_values(by=sort_by, ascending=asc)
In [5]:
# Load the parsed review data produced by the earlier parsing step.
raw_data=pd.read_pickle('./clean_data/raw_data_post_parse.pkl')
raw_data.head()
Out[5]:
In [6]:
# List all available columns before selecting the final feature set.
raw_data.columns
Out[6]:
In [7]:
# Final modeling columns: the outcome ('helpful'), the engineered text
# features, the lemmatized text, and every 'vec*' column (presumably the
# document-vector components produced upstream — confirm against the parser).
fin_cols=['helpful','num_sents', 'num_words', 'readability',
          'neg_senti', 'pos_senti', 'neu_senti', 'comp_senti',
          'text_lemma']
# Idiom fix: str.startswith is clearer than slicing the first three chars.
vec_cols=[s for s in raw_data.columns if s.startswith('vec')]
fin_cols.extend(vec_cols)
fin_cols
Out[7]:
In [8]:
# Keep only the final modeling columns (explicit copy avoids chained-
# assignment warnings in the transforms below).
raw_data=raw_data[fin_cols].copy()
In [9]:
# Sanity-check the trimmed frame.
raw_data.head()
Out[9]:
In [10]:
# Distribution of the raw sentence counts.
g=sns.distplot(raw_data.num_sents)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number of Sentences\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis. The x-axis of a distplot carries the
# variable's values; the y-axis is a density (distplot's default KDE-
# normalized histogram).
g.set_xlabel('Number of Sentences',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[10]:
In [11]:
# Same distribution after a log transform (checking whether it tames the skew).
g=sns.distplot(np.log(raw_data.num_sents))
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Log Number of Sentences\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis — it shows the logged variable values;
# the y-axis is a density.
g.set_xlabel('Log Number of Sentences',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[11]:
Will use the log transform because of the extreme skewness
In [12]:
# Replace the sentence count with its log to reduce the extreme right skew.
# NOTE(review): in-place overwrite — re-running this cell applies the log a
# second time, so it is not idempotent under out-of-order execution.
raw_data['num_sents']=np.log(raw_data.num_sents)
In [13]:
# Distribution of the raw word counts.
g=sns.distplot(raw_data.num_words)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number of Words\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis. The x-axis carries the variable's
# values; the y-axis is a density.
g.set_xlabel('Number of Words',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[13]:
In [14]:
# Word-count distribution after a log transform.
g=sns.distplot(np.log(raw_data.num_words))
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Log Number of Words\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis — it shows the logged variable values;
# the y-axis is a density.
g.set_xlabel('Log Number of Words',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[14]:
In [15]:
# Replace the word count with its log, mirroring the num_sents transform.
# NOTE(review): in-place overwrite — re-running this cell double-logs.
raw_data['num_words']=np.log(raw_data.num_words)
In [16]:
# Summary statistics for the readability score.
raw_data.readability.describe()
Out[16]:
In [17]:
# Distribution of the readability scores.
g=sns.distplot(raw_data.readability)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
# Fix: corrected the 'Readbility' typo in the rendered title.
g.axes.set_title('Readability\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis — it shows the readability values;
# the y-axis is a density.
g.set_xlabel('Readability',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[17]:
In [18]:
# Count missing readability scores before transforming the column.
raw_data.readability.isnull().sum()
Out[18]:
In retrospect, this transform seems less appropriate. However, I will shift the distribution to make it strictly positive (adding a multiple of the absolute minimum) and then take the log
In [19]:
# Shift readability to be strictly positive, then take the log.
# NOTE(review): the shift is 100 * |min|, not just |min| as the prose above
# suggests — presumably to keep every value well away from zero before the
# log; confirm this factor is intentional.
# Also non-idempotent: re-running this cell shifts and logs again.
raw_data['readability']=np.log(raw_data.readability+100*np.abs(np.min(raw_data.readability)))
In [20]:
# Re-check the readability summary after the shift-and-log transform.
raw_data.readability.describe()
Out[20]:
In [21]:
# Distribution of the transformed readability scores (zoomed x-range).
g=sns.distplot(raw_data.readability)
g.axes.set_ylim(0,)
g.axes.set_xlim(6.5,7.25)
# Fix: corrected the 'Readbility' typo in the rendered title.
g.axes.set_title('Readability\n',fontsize=20)
# Fix: 'Count' mislabeled the x-axis — it shows the (shifted, logged)
# readability values; the y-axis is a density.
g.set_xlabel('Readability',fontsize=15)
g.set_ylabel('Density',fontsize=15)
Out[21]:
This comically thin distribution will be adjusted with scaling
In [22]:
# Summary statistics for the negative-sentiment score.
raw_data.neg_senti.describe()
Out[22]:
In [23]:
# Distribution of the negative-sentiment scores.
g = sns.distplot(raw_data.neg_senti)
g.set_ylim(bottom=0)
g.set_title('Negative Sentiment\n', fontsize=20)
g.set_xlabel('Score', fontsize=15)
Out[23]:
In [24]:
# Summary statistics for the positive-sentiment score.
raw_data.pos_senti.describe()
Out[24]:
In [25]:
# Distribution of the positive-sentiment scores.
g = sns.distplot(raw_data.pos_senti)
g.set_ylim(bottom=0)
g.set_title('Positive Sentiment\n', fontsize=20)
g.set_xlabel('Score', fontsize=15)
Out[25]:
In [26]:
# Summary statistics for the neutral-sentiment score.
raw_data.neu_senti.describe()
Out[26]:
In [27]:
# Distribution of the neutral-sentiment scores.
g = sns.distplot(raw_data.neu_senti)
g.set_ylim(bottom=0)
g.set_title('Neutral Sentiment\n', fontsize=20)
g.set_xlabel('Score', fontsize=15)
Out[27]:
In [28]:
# Summary statistics for the composite sentiment score.
raw_data.comp_senti.describe()
Out[28]:
In [29]:
# Distribution of the composite sentiment scores.
g = sns.distplot(raw_data.comp_senti)
g.set_ylim(bottom=0)
g.set_title('Composite Sentiment Score\n', fontsize=20)
g.set_xlabel('Score', fontsize=15)
Out[29]:
In [30]:
# Scatter of positive vs. negative sentiment with a fitted regression line.
g = sns.regplot(x="pos_senti", y="neg_senti", data=raw_data, fit_reg=True)
g.set_title('Positive vs Negative Sentiment\n', fontsize=20)
g.set_xlabel('Positive Sentiment', fontsize=15)
g.set_ylabel('Negative Sentiment', fontsize=15)
Out[30]:
In [31]:
# Quantify the pos/neg relationship shown above with a simple OLS regression.
model_spec = 'pos_senti ~ neg_senti'
fitted = smf.ols(formula=model_spec, data=raw_data).fit()
print(fitted.summary())
I expected a larger R^2
In [32]:
# Styling for the heatmaps below: paper context, monospace font.
sns.set(context="paper", font="monospace")
In [33]:
# Correlation structure of the four sentiment scores, saved as a heatmap.
senti_cols = ['neg_senti', 'pos_senti', 'neu_senti', 'comp_senti']
corrmat = raw_data[senti_cols].corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True)
ax.set_title('Sentiment Correlation Matrix Heatmap\n', fontsize=20)
plt.savefig('./plots/Sentiment_Correlation_Matrix_Heatmap.png', bbox_inches='tight')
In [34]:
# Display the numeric correlation values behind the heatmap above.
corrmat
Out[34]:
As expected, the sentiments are highly correlated
In [35]:
# Correlation heatmap over every column except the raw lemmatized text.
numeric_cols = [c for c in raw_data.columns if c != 'text_lemma']
corrmat = raw_data[numeric_cols].corr()
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corrmat, vmax=1, square=True)
ax.set_title('Correlation Matrix Heatmap\n', fontsize=20)
plt.savefig('./plots/Correlation_Matrix_Heatmap.png', bbox_inches='tight')
In [36]:
# Display the numeric correlation values behind the heatmap above.
corrmat
Out[36]:
In [37]:
# NOTE(review): this repeats the subsetting already done in In [8]; the frame
# already contains exactly fin_cols, so this is a no-op copy here.
raw_data=raw_data[fin_cols].copy()
In [38]:
# Final look at the data before persisting.
raw_data.head()
Out[38]:
In [39]:
# Persist the finalized modeling data for the downstream modeling notebook.
raw_data.to_pickle('./clean_data/clean_data.pkl')