In [1]:
%matplotlib inline

Finalizing Model Data

Code for finalizing the model data
Author: Jimmy Charité
Email: jimmy.charite@gmail.com

Directory & Packages


In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import pysentiment
from textstat.textstat import textstat 
from wordcloud import WordCloud
import nltk
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [3]:
os.chdir("..")  # move up to the project root directory

Helper Functions


In [4]:
def pd_tab(df, col, sort_by='count', asc=False):
    """Tabulate a column's values with counts and percentages."""
    tab = df[col].value_counts(dropna=False).reset_index(name='count')
    tab.columns = [col, 'count']
    tab['percent'] = tab['count'] / tab['count'].sum()
    tab.sort_values(by=sort_by, inplace=True, ascending=asc)
    return tab
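
pd_tab isn't called in this notebook, but for reference, here is a minimal usage sketch (the toy data below is hypothetical):

demo = pd.DataFrame({'flavor': ['taffy', 'taffy', 'peanut']})
pd_tab(demo, 'flavor')
# =>   flavor  count   percent
# 0     taffy      2  0.666667
# 1    peanut      1  0.333333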

Load Data


In [5]:
raw_data=pd.read_pickle('./clean_data/raw_data_post_parse.pkl')
raw_data.head()


Out[5]:
Id ProductId UserId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Time Summary Text ... vec299 num_sents num_words readability sentiment_dict neg_senti pos_senti neu_senti comp_senti text_lemma
0 2 B00813GRG4 A1D87F6ZCVE5NK dll pa 0 0 1 1346976000 Not as Advertised Product arrived labeled as Jumbo Salted Peanut... ... 0.020952 2 37 8.0 {'neg': 0.079, 'neu': 0.853, 'pos': 0.068, 'co... 0.079 0.068 0.853 -0.1027 product arrive label peanut actually small siz...
1 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham "M. Wassir" 0 0 5 1350777600 Great taffy Great taffy at a great price. There was a wid... ... 0.113610 4 35 1.3 {'neg': 0.0, 'neu': 0.552, 'pos': 0.448, 'comp... 0.000 0.448 0.552 0.9468 great taffy great price wide assortment yummy ...
2 6 B006K2ZZ7K ADT0SRK1MGOEU Twoapennything 0 0 4 1342051200 Nice Taffy I got a wild hair for taffy and ordered this f... ... 0.046176 5 92 8.6 {'neg': 0.029, 'neu': 0.809, 'pos': 0.163, 'co... 0.029 0.163 0.809 0.8830 get wild hair taffy order pound bag taffy enjo...
3 7 B006K2ZZ7K A1SP2KVKFXXRU1 David C. Sullivan 0 0 5 1340150400 Great! Just as good as the expensive brands! This saltwater taffy had great flavors and was... ... 0.137415 5 63 7.7 {'neg': 0.034, 'neu': 0.693, 'pos': 0.273, 'co... 0.034 0.273 0.693 0.9346 saltwater taffy great flavor soft chewy candy ...
4 8 B006K2ZZ7K A3JRGQVEQN31IQ Pamela G. Williams 0 0 5 1336003200 Wonderful, tasty taffy This taffy is so good. It is very soft and ch... ... 0.123007 5 34 3.8 {'neg': 0.0, 'neu': 0.52, 'pos': 0.48, 'compou... 0.000 0.480 0.520 0.9487 taffy good soft chewy flavor amazing definitel...

5 rows × 326 columns

Finalizing Features


In [6]:
raw_data.columns


Out[6]:
Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       ...
       'vec299', 'num_sents', 'num_words', 'readability', 'sentiment_dict',
       'neg_senti', 'pos_senti', 'neu_senti', 'comp_senti', 'text_lemma'],
      dtype='object', length=326)

In [7]:
fin_cols=['helpful','num_sents', 'num_words', 'readability',
          'neg_senti', 'pos_senti', 'neu_senti', 'comp_senti',
          'text_lemma']
vec_cols=[s for s in raw_data.columns if s[:3]=='vec']
fin_cols.extend(vec_cols)
fin_cols


Out[7]:
['helpful',
 'num_sents',
 'num_words',
 'readability',
 'neg_senti',
 'pos_senti',
 'neu_senti',
 'comp_senti',
 'text_lemma',
 'vec0',
 'vec1',
 'vec2',
 'vec3',
 'vec4',
 'vec5',
 'vec6',
 'vec7',
 'vec8',
 'vec9',
 'vec10',
 'vec11',
 'vec12',
 'vec13',
 'vec14',
 'vec15',
 'vec16',
 'vec17',
 'vec18',
 'vec19',
 'vec20',
 'vec21',
 'vec22',
 'vec23',
 'vec24',
 'vec25',
 'vec26',
 'vec27',
 'vec28',
 'vec29',
 'vec30',
 'vec31',
 'vec32',
 'vec33',
 'vec34',
 'vec35',
 'vec36',
 'vec37',
 'vec38',
 'vec39',
 'vec40',
 'vec41',
 'vec42',
 'vec43',
 'vec44',
 'vec45',
 'vec46',
 'vec47',
 'vec48',
 'vec49',
 'vec50',
 'vec51',
 'vec52',
 'vec53',
 'vec54',
 'vec55',
 'vec56',
 'vec57',
 'vec58',
 'vec59',
 'vec60',
 'vec61',
 'vec62',
 'vec63',
 'vec64',
 'vec65',
 'vec66',
 'vec67',
 'vec68',
 'vec69',
 'vec70',
 'vec71',
 'vec72',
 'vec73',
 'vec74',
 'vec75',
 'vec76',
 'vec77',
 'vec78',
 'vec79',
 'vec80',
 'vec81',
 'vec82',
 'vec83',
 'vec84',
 'vec85',
 'vec86',
 'vec87',
 'vec88',
 'vec89',
 'vec90',
 'vec91',
 'vec92',
 'vec93',
 'vec94',
 'vec95',
 'vec96',
 'vec97',
 'vec98',
 'vec99',
 'vec100',
 'vec101',
 'vec102',
 'vec103',
 'vec104',
 'vec105',
 'vec106',
 'vec107',
 'vec108',
 'vec109',
 'vec110',
 'vec111',
 'vec112',
 'vec113',
 'vec114',
 'vec115',
 'vec116',
 'vec117',
 'vec118',
 'vec119',
 'vec120',
 'vec121',
 'vec122',
 'vec123',
 'vec124',
 'vec125',
 'vec126',
 'vec127',
 'vec128',
 'vec129',
 'vec130',
 'vec131',
 'vec132',
 'vec133',
 'vec134',
 'vec135',
 'vec136',
 'vec137',
 'vec138',
 'vec139',
 'vec140',
 'vec141',
 'vec142',
 'vec143',
 'vec144',
 'vec145',
 'vec146',
 'vec147',
 'vec148',
 'vec149',
 'vec150',
 'vec151',
 'vec152',
 'vec153',
 'vec154',
 'vec155',
 'vec156',
 'vec157',
 'vec158',
 'vec159',
 'vec160',
 'vec161',
 'vec162',
 'vec163',
 'vec164',
 'vec165',
 'vec166',
 'vec167',
 'vec168',
 'vec169',
 'vec170',
 'vec171',
 'vec172',
 'vec173',
 'vec174',
 'vec175',
 'vec176',
 'vec177',
 'vec178',
 'vec179',
 'vec180',
 'vec181',
 'vec182',
 'vec183',
 'vec184',
 'vec185',
 'vec186',
 'vec187',
 'vec188',
 'vec189',
 'vec190',
 'vec191',
 'vec192',
 'vec193',
 'vec194',
 'vec195',
 'vec196',
 'vec197',
 'vec198',
 'vec199',
 'vec200',
 'vec201',
 'vec202',
 'vec203',
 'vec204',
 'vec205',
 'vec206',
 'vec207',
 'vec208',
 'vec209',
 'vec210',
 'vec211',
 'vec212',
 'vec213',
 'vec214',
 'vec215',
 'vec216',
 'vec217',
 'vec218',
 'vec219',
 'vec220',
 'vec221',
 'vec222',
 'vec223',
 'vec224',
 'vec225',
 'vec226',
 'vec227',
 'vec228',
 'vec229',
 'vec230',
 'vec231',
 'vec232',
 'vec233',
 'vec234',
 'vec235',
 'vec236',
 'vec237',
 'vec238',
 'vec239',
 'vec240',
 'vec241',
 'vec242',
 'vec243',
 'vec244',
 'vec245',
 'vec246',
 'vec247',
 'vec248',
 'vec249',
 'vec250',
 'vec251',
 'vec252',
 'vec253',
 'vec254',
 'vec255',
 'vec256',
 'vec257',
 'vec258',
 'vec259',
 'vec260',
 'vec261',
 'vec262',
 'vec263',
 'vec264',
 'vec265',
 'vec266',
 'vec267',
 'vec268',
 'vec269',
 'vec270',
 'vec271',
 'vec272',
 'vec273',
 'vec274',
 'vec275',
 'vec276',
 'vec277',
 'vec278',
 'vec279',
 'vec280',
 'vec281',
 'vec282',
 'vec283',
 'vec284',
 'vec285',
 'vec286',
 'vec287',
 'vec288',
 'vec289',
 'vec290',
 'vec291',
 'vec292',
 'vec293',
 'vec294',
 'vec295',
 'vec296',
 'vec297',
 'vec298',
 'vec299']

In [8]:
raw_data=raw_data[fin_cols].copy()

In [9]:
raw_data.head()


Out[9]:
helpful num_sents num_words readability neg_senti pos_senti neu_senti comp_senti text_lemma vec0 ... vec290 vec291 vec292 vec293 vec294 vec295 vec296 vec297 vec298 vec299
0 0.0 2 37 8.0 0.079 0.068 0.853 -0.1027 product arrive label peanut actually small siz... -0.019901 ... -0.178709 0.120293 0.048853 -0.028560 0.024294 -0.051074 -0.082868 -0.058978 0.058156 0.020952
1 0.0 4 35 1.3 0.000 0.448 0.552 0.9468 great taffy great price wide assortment yummy ... -0.076091 ... -0.125921 0.026862 -0.011833 -0.023788 0.028657 -0.001059 -0.003236 -0.048324 -0.050874 0.113610
2 0.0 5 92 8.6 0.029 0.163 0.809 0.8830 get wild hair taffy order pound bag taffy enjo... -0.048797 ... -0.154745 0.004021 0.004185 0.006071 -0.032341 0.030001 0.004792 -0.122627 -0.015319 0.046176
3 0.0 5 63 7.7 0.034 0.273 0.693 0.9346 saltwater taffy great flavor soft chewy candy ... -0.009421 ... -0.185385 0.038134 0.014824 -0.012089 0.007642 -0.013590 0.038388 -0.117533 0.042929 0.137415
4 0.0 5 34 3.8 0.000 0.480 0.520 0.9487 taffy good soft chewy flavor amazing definitel... -0.073490 ... -0.155703 0.041312 -0.121036 -0.063175 0.075995 -0.005276 0.051416 -0.136569 0.021066 0.123007

5 rows × 309 columns

Number of Sentences


In [10]:
g=sns.distplot(raw_data.num_sents)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number of Sentences\n',fontsize=20)
g.set_xlabel('Number of Sentences',fontsize=15)



In [11]:
g=sns.distplot(np.log(raw_data.num_sents))
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Log Number of Sentences\n',fontsize=20)
g.set_xlabel('Log Number of Sentences',fontsize=15)



I use the log transform because of the extreme right skew


In [12]:
raw_data['num_sents']=np.log(raw_data.num_sents)
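
Note that np.log is undefined at zero; this is safe only because every review has at least one sentence. If zero counts were possible, np.log1p (i.e., log(1 + x)) would be the safer choice:

# alternative, not used here:
# raw_data['num_sents'] = np.log1p(raw_data.num_sents)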

Number of Words


In [13]:
g=sns.distplot(raw_data.num_words)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number of Words\n',fontsize=20)
g.set_xlabel('Number of Words',fontsize=15)



In [14]:
g=sns.distplot(np.log(raw_data.num_words))
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Log Number of Words\n',fontsize=20)
g.set_xlabel('Log Number of Words',fontsize=15)



In [15]:
raw_data['num_words']=np.log(raw_data.num_words)

Readability


In [16]:
raw_data.readability.describe()


Out[16]:
count    198659.000000
mean          6.398967
std           3.436570
min          -8.400000
25%           4.300000
50%           6.000000
75%           8.000000
max         345.600000
Name: readability, dtype: float64

In [17]:
g=sns.distplot(raw_data.readability)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Readability\n',fontsize=20)
g.set_xlabel('Readability',fontsize=15)



In [18]:
raw_data.readability.isnull().sum()


Out[18]:
0

In retrospect, a log transform seems less appropriate here, since readability scores can be negative. As a workaround, I shift the variable to make it positive and then take the log


In [19]:
raw_data['readability']=np.log(raw_data.readability+100*np.abs(np.min(raw_data.readability)))
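
Shifting by 100*|min| adds roughly 840 to a variable whose range is only [-8.4, 345.6], so log(x + 840) is nearly constant over that range; that is why the logged distribution below is so narrow. A gentler shift (not used here) would preserve more of the spread:

# alternative sketch: shift just enough so the minimum becomes 1
# raw_data['readability'] = np.log(raw_data.readability - raw_data.readability.min() + 1)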

In [20]:
raw_data.readability.describe()


Out[20]:
count    198659.000000
mean          6.740983
std           0.004010
min           6.723352
25%           6.738508
50%           6.740519
75%           6.742881
max           7.078004
Name: readability, dtype: float64

In [21]:
g=sns.distplot(raw_data.readability)
g.axes.set_ylim(0,)
g.axes.set_xlim(6.5,7.25)
g.axes.set_title('Readability\n',fontsize=20)
g.set_xlabel('Readability',fontsize=15)



This extremely narrow distribution will be handled later by feature scaling
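
As a sketch of that later scaling step, assuming scikit-learn's StandardScaler (not imported above):

from sklearn.preprocessing import StandardScaler
# standardize to zero mean and unit variance before modeling
# raw_data[['readability']] = StandardScaler().fit_transform(raw_data[['readability']])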

Sentiment


In [22]:
raw_data.neg_senti.describe()


Out[22]:
count    198659.000000
mean          0.043195
std           0.052190
min           0.000000
25%           0.000000
50%           0.030000
75%           0.068000
max           0.602000
Name: neg_senti, dtype: float64

In [23]:
g=sns.distplot(raw_data.neg_senti)
g.axes.set_ylim(0,)
g.axes.set_title('Negative Sentiment\n',fontsize=20)
g.set_xlabel('Score',fontsize=15)



In [24]:
raw_data.pos_senti.describe()


Out[24]:
count    198659.000000
mean          0.191822
std           0.106541
min           0.000000
25%           0.116000
50%           0.178000
75%           0.257000
max           0.964000
Name: pos_senti, dtype: float64

In [25]:
g=sns.distplot(raw_data.pos_senti)
g.axes.set_ylim(0,)
g.axes.set_title('Positive Sentiment\n',fontsize=20)
g.set_xlabel('Score',fontsize=15)



In [26]:
raw_data.neu_senti.describe()


Out[26]:
count    198659.000000
mean          0.764983
std           0.100293
min           0.036000
25%           0.704000
50%           0.775000
75%           0.835000
max           1.000000
Name: neu_senti, dtype: float64

In [27]:
g=sns.distplot(raw_data.neu_senti)
g.axes.set_ylim(0,)
g.axes.set_title('Neutral Sentiment\n',fontsize=20)
g.set_xlabel('Score',fontsize=15)



In [28]:
raw_data.comp_senti.describe()


Out[28]:
count    198659.000000
mean          0.643604
std           0.473956
min          -0.998300
25%           0.573000
50%           0.855500
75%           0.942700
max           0.999800
Name: comp_senti, dtype: float64

In [29]:
g=sns.distplot(raw_data.comp_senti)
g.axes.set_ylim(0,)
g.axes.set_title('Composite Sentiment Score\n',fontsize=20)
g.set_xlabel('Score',fontsize=15)



In [30]:
g=sns.regplot(x="pos_senti", y="neg_senti", data=raw_data,
             fit_reg=True)
#g.axes.set_ylim(0,)
#g.axes.set_xlim(0,)
g.axes.set_title('Positive vs Negative Sentiment\n',fontsize=20)
g.set_xlabel('Positive Sentiment',fontsize=15)
g.set_ylabel('Negative Sentiment',fontsize=15)



In [31]:
f='pos_senti ~ neg_senti'
results = smf.ols(formula=f, data=raw_data).fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:              pos_senti   R-squared:                       0.130
Model:                            OLS   Adj. R-squared:                  0.130
Method:                 Least Squares   F-statistic:                 2.980e+04
Date:                Sun, 21 May 2017   Prob (F-statistic):               0.00
Time:                        22:48:26   Log-Likelihood:             1.7684e+05
No. Observations:              198659   AIC:                        -3.537e+05
Df Residuals:                  198657   BIC:                        -3.537e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.2237      0.000    773.013      0.000         0.223     0.224
neg_senti     -0.7372      0.004   -172.617      0.000        -0.746    -0.729
==============================================================================
Omnibus:                     9458.602   Durbin-Watson:                   1.843
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            11034.711
Skew:                           0.539   Prob(JB):                         0.00
Kurtosis:                       3.415   Cond. No.                         19.2
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

I expected a larger R^2
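
For a univariate OLS, R^2 is simply the squared Pearson correlation between the two variables, so the 0.130 follows directly from the correlation of about -0.36 computed below:

r = raw_data['pos_senti'].corr(raw_data['neg_senti'])  # ~ -0.361
print(r**2)  # ~ 0.130, matching the regression output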


In [32]:
sns.set(context="paper", font="monospace")

In [33]:
corrmat = raw_data[['neg_senti', 'pos_senti', 'neu_senti', 'comp_senti']].corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True)
ax.set_title('Sentiment Correlation Matrix Heatmap\n',fontsize=20)
plt.savefig('./plots/Sentiment_Correlation_Matrix_Heatmap.png', bbox_inches='tight')



In [34]:
corrmat


Out[34]:
neg_senti pos_senti neu_senti comp_senti
neg_senti 1.000000 -0.361147 -0.136729 -0.660513
pos_senti -0.361147 1.000000 -0.874365 0.608528
neu_senti -0.136729 -0.874365 1.000000 -0.302719
comp_senti -0.660513 0.608528 -0.302719 1.000000

As expected, the sentiment scores are highly correlated


In [35]:
corrmat = raw_data[[s for s in raw_data.columns if s!='text_lemma']].corr()
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corrmat, vmax=1, square=True)
ax.set_title('Correlation Matrix Heatmap\n',fontsize=20)
plt.savefig('./plots/Correlation_Matrix_Heatmap.png', bbox_inches='tight')
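
As a quick addition (not part of the original analysis), the strongest pairwise correlations can be pulled out of corrmat directly:

# keep the upper triangle, flatten, and rank by absolute correlation
mask = np.triu(np.ones(corrmat.shape, dtype=bool), k=1)
pairs = corrmat.where(mask).stack()
print(pairs.reindex(pairs.abs().sort_values(ascending=False).index).head())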



In [36]:
corrmat


Out[36]:
helpful num_sents num_words readability neg_senti pos_senti neu_senti comp_senti vec0 vec1 ... vec290 vec291 vec292 vec293 vec294 vec295 vec296 vec297 vec298 vec299
helpful 1.000000 0.101176 0.117021 0.045834 0.012478 -0.047061 0.043529 0.014837 -0.021024 -0.000308 ... -0.006312 -0.016650 0.010745 0.003621 -0.041179 0.020941 -0.038949 0.010570 0.011678 -0.031660
num_sents 0.101176 1.000000 0.852068 -0.030509 0.076832 -0.219416 0.193097 0.127060 0.080847 -0.023221 ... 0.096276 0.030849 0.041608 -0.014270 -0.136374 0.105767 -0.133743 0.112584 -0.015821 -0.159772
num_words 0.117021 0.852068 1.000000 0.313085 0.073248 -0.339856 0.322906 0.121910 0.093157 -0.023474 ... 0.025480 0.066575 0.140240 -0.035748 -0.166247 0.148201 -0.230496 0.089968 0.084351 -0.194113
readability 0.045834 -0.030509 0.313085 1.000000 0.022891 -0.157609 0.155496 0.006546 -0.040833 -0.082752 ... -0.086714 0.102816 0.108107 0.099261 -0.200150 0.147153 -0.066937 -0.091301 0.087600 -0.150674
neg_senti 0.012478 0.076832 0.073248 0.022891 1.000000 -0.361147 -0.136729 -0.660513 -0.066167 -0.015326 ... 0.017673 0.040843 -0.028029 -0.016988 0.041079 0.122023 -0.130083 0.097845 0.059839 -0.086497
pos_senti -0.047061 -0.219416 -0.339856 -0.157609 -0.361147 1.000000 -0.874365 0.608528 -0.124317 0.166925 ... 0.004793 -0.030210 -0.193280 0.041793 0.137701 -0.087122 0.385883 -0.184340 -0.112153 0.182514
neu_senti 0.043529 0.193097 0.322906 0.155496 -0.136729 -0.874365 1.000000 -0.302719 0.166511 -0.169324 ... -0.014260 0.010835 0.219893 -0.035567 -0.167636 0.029034 -0.342245 0.144911 0.088001 -0.148827
comp_senti 0.014837 0.127060 0.121910 0.006546 -0.660513 0.608528 -0.302719 1.000000 -0.007327 0.085703 ... 0.007676 -0.032437 -0.051333 0.003716 -0.022217 -0.031866 0.203050 -0.150316 -0.054746 0.067414
vec0 -0.021024 0.080847 0.093157 -0.040833 -0.066167 -0.124317 0.166511 -0.007327 1.000000 -0.239346 ... 0.140677 0.206046 0.237824 -0.041626 0.147721 -0.204304 0.059901 0.384844 -0.171336 0.039900
vec1 -0.000308 -0.023221 -0.023474 -0.082752 -0.015326 0.166925 -0.169324 0.085703 -0.239346 1.000000 ... 0.124256 -0.010269 -0.115197 -0.014984 0.225464 -0.143607 -0.194399 -0.033445 -0.064594 0.169178
vec2 -0.001800 -0.040844 -0.083776 0.038187 -0.102236 0.043041 0.007467 0.036784 -0.139377 0.012549 ... 0.204658 -0.006103 0.152946 -0.107047 -0.373972 0.075628 0.089887 -0.242217 -0.376876 -0.022572
vec3 0.037009 0.048317 0.219438 0.266689 -0.059989 -0.071295 0.106951 0.080793 0.007059 0.004844 ... 0.196621 0.056743 0.330639 -0.045391 -0.200419 0.072164 0.025184 -0.033891 -0.166246 -0.193170
vec4 -0.027483 -0.030523 -0.014919 -0.039737 -0.176090 0.104084 -0.018921 0.151693 0.075945 -0.060827 ... -0.129243 -0.094865 0.062424 0.042619 -0.072820 -0.022953 0.099799 -0.062912 -0.025225 -0.081417
vec5 0.005861 0.034797 0.026264 0.086985 -0.002146 0.068732 -0.071889 0.057091 -0.150680 0.095188 ... 0.266313 0.118189 -0.411483 -0.000468 -0.224229 0.203962 0.098254 -0.347256 0.074543 0.116049
vec6 -0.028466 -0.080706 -0.159235 -0.119074 0.069101 0.242410 -0.293459 0.026779 -0.082140 0.142972 ... -0.070788 -0.039885 -0.236534 0.168851 0.481742 -0.109835 0.120877 0.142119 0.089868 0.138145
vec7 0.015983 0.040698 0.111544 0.237758 -0.053963 -0.093716 0.127608 0.034884 0.002990 -0.284373 ... -0.153826 0.044967 0.012524 -0.246825 -0.468049 0.260467 0.012502 -0.192033 0.006660 -0.211779
vec8 -0.008543 -0.053366 -0.042866 -0.068535 -0.083356 -0.030607 0.075896 0.020454 0.103116 -0.174756 ... 0.049581 -0.001291 0.157500 -0.090473 -0.074079 -0.063739 0.076762 0.031495 -0.081361 0.037611
vec9 -0.024680 -0.030126 0.001294 -0.138502 0.083311 -0.060317 0.020743 -0.060879 0.146462 0.135247 ... -0.274154 0.028144 0.033730 0.144731 0.365791 -0.225978 -0.130372 0.299036 0.247475 0.011821
vec10 0.022359 0.075521 0.146375 0.249865 0.110868 -0.142832 0.094007 -0.079335 -0.027544 -0.164458 ... -0.293097 -0.081100 0.095453 0.209641 -0.097731 0.131152 -0.068611 0.096385 0.051349 -0.442117
vec11 0.013360 0.024754 -0.148576 -0.218639 -0.085405 0.044330 -0.002639 0.002961 -0.187227 0.042166 ... 0.109710 -0.093437 -0.028785 0.158295 -0.113576 -0.077821 0.047306 -0.019741 -0.031634 -0.055061
vec12 0.016946 0.129597 0.132072 -0.030925 0.092374 -0.224558 0.190474 -0.146186 0.266408 -0.108939 ... -0.332689 0.137497 0.092700 0.195032 0.246625 -0.194367 -0.263180 0.341567 0.253517 -0.054801
vec13 0.002316 -0.033118 -0.074886 0.002575 0.008436 0.151440 -0.165278 0.048670 -0.190556 0.209237 ... 0.056679 -0.002352 -0.290314 -0.080823 -0.039640 0.122661 -0.118793 -0.217003 0.075118 -0.032563
vec14 -0.004829 -0.034882 -0.047273 0.092256 -0.097265 0.039312 0.008859 0.052992 0.024597 -0.164679 ... 0.172790 0.203899 -0.065643 -0.335584 -0.304207 0.165112 0.064876 -0.336967 -0.174648 0.144551
vec15 0.026936 0.082896 0.078156 -0.055285 -0.015697 -0.111698 0.126834 -0.027553 0.107417 -0.115801 ... -0.037488 -0.243637 0.105147 0.189803 0.115211 -0.063695 0.047062 0.384396 0.092247 -0.220648
vec16 0.042710 0.117887 0.106741 0.109839 0.090225 -0.197286 0.162602 -0.106401 -0.127513 -0.086831 ... 0.120651 -0.131306 0.005134 0.048534 -0.286665 0.202404 -0.056175 0.002245 -0.105485 -0.253934
vec17 0.009666 -0.048920 -0.077814 -0.095418 -0.122056 0.083072 -0.024717 0.084116 -0.195741 0.226244 ... -0.155056 0.049024 0.048734 -0.302486 -0.227978 0.088641 -0.073952 -0.330209 -0.089352 0.253440
vec18 -0.031889 -0.076767 -0.112490 0.017747 0.033408 0.016888 -0.035323 -0.060855 0.105047 0.021997 ... 0.220337 0.097333 0.234584 0.031465 -0.027726 -0.056936 0.147275 0.035333 -0.404321 0.050212
vec19 -0.017722 -0.114135 -0.169977 -0.116136 0.060587 0.164823 -0.206634 0.011535 0.019015 0.012475 ... 0.135354 -0.014151 -0.222981 0.171999 0.214826 -0.109719 0.194199 0.022289 0.041999 0.104576
vec20 0.010571 0.053664 0.052573 -0.097521 0.119144 -0.119693 0.065173 -0.108112 0.167841 0.004613 ... 0.076374 0.151426 -0.198339 -0.057686 0.265916 -0.233256 -0.126688 0.161773 0.310187 0.222974
vec21 0.028727 0.132612 0.183525 0.123579 0.049268 -0.196257 0.182820 -0.065637 -0.111487 0.018035 ... -0.045784 -0.071549 -0.039100 0.049204 -0.312219 0.101839 -0.135744 0.014577 0.106556 -0.138166
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
vec270 0.046687 0.113716 0.212135 0.035422 0.134830 -0.421580 0.377687 -0.208423 0.125224 -0.032285 ... 0.083018 0.088619 0.199429 -0.087864 -0.020807 -0.107104 -0.355612 0.226833 0.076884 -0.058821
vec271 -0.010712 -0.036333 -0.114920 -0.039947 0.015466 -0.049967 0.045022 -0.079912 -0.003563 -0.116323 ... -0.009944 0.070457 0.039621 0.086599 -0.038119 0.038870 0.131242 0.067536 -0.026943 0.032962
vec272 0.017403 0.150534 0.196077 0.200743 0.094152 -0.309863 0.280163 -0.142861 -0.015141 -0.190043 ... 0.048669 0.218655 -0.041228 0.008569 -0.463058 0.201739 -0.167363 -0.244383 0.025017 -0.013483
vec273 0.031521 0.010851 0.072408 0.060361 -0.038523 -0.032932 0.055037 0.010160 0.022519 -0.006879 ... 0.180951 -0.043665 -0.002274 0.070459 0.130973 -0.296575 0.000056 0.157145 0.118444 0.182334
vec274 0.021748 0.058078 0.176714 0.093189 0.047291 -0.261210 0.252870 -0.101457 0.188002 -0.116188 ... 0.018539 -0.092921 0.111762 -0.052368 0.064583 -0.072599 -0.181438 0.322904 0.148508 -0.091687
vec275 -0.052947 -0.078521 -0.149816 -0.199782 -0.007388 0.125448 -0.129395 0.030249 0.003701 0.072829 ... 0.332487 0.072493 -0.195277 -0.164607 0.191395 0.008341 0.117617 -0.077701 0.053703 0.130733
vec276 -0.024138 -0.046860 -0.151765 -0.070463 -0.039156 0.154118 -0.143344 0.065948 -0.122047 -0.021997 ... 0.026207 -0.123804 -0.137541 -0.164441 -0.131555 0.253992 0.077084 -0.137691 -0.042582 0.012189
vec277 0.016283 -0.045008 0.004159 0.120040 -0.092154 -0.046709 0.097530 0.027718 -0.066422 -0.290467 ... -0.347357 -0.087059 0.058642 0.016361 -0.384046 0.145848 0.069743 -0.096444 0.109076 -0.272880
vec278 -0.039918 -0.114212 -0.129923 -0.072647 -0.132023 0.225731 -0.171071 0.140027 0.102135 0.091509 ... 0.103889 0.061577 0.092860 -0.004333 0.174749 -0.201516 0.228591 0.002645 -0.193668 0.098340
vec279 -0.036219 -0.097571 -0.139562 -0.136086 -0.030958 0.220027 -0.217598 0.107606 -0.128492 0.242425 ... 0.219319 0.029836 -0.300291 -0.242878 0.108033 0.125545 0.167167 -0.365112 0.084133 0.216840
vec280 -0.004194 0.103293 0.057103 -0.067578 0.001907 -0.040280 0.041804 0.003452 0.136609 -0.062074 ... 0.134555 0.070313 -0.018814 -0.170358 -0.128993 0.244894 0.084874 -0.155362 -0.227278 -0.153922
vec281 0.043319 0.004465 0.170179 0.211801 0.005537 -0.259527 0.272804 -0.073884 -0.073035 -0.193041 ... -0.240477 -0.066219 0.276648 0.168121 -0.034327 0.022993 -0.090216 0.112205 0.190575 -0.191582
vec282 0.012622 0.075245 0.116539 0.223578 0.100512 -0.094632 0.048206 -0.061312 -0.205968 0.132335 ... 0.064774 0.139208 -0.189202 0.075291 -0.220616 0.109271 -0.136714 -0.355535 0.060926 0.067327
vec283 -0.004048 -0.041612 0.029245 0.027726 -0.013222 -0.060225 0.070833 -0.026591 0.075360 -0.018149 ... -0.294120 -0.026700 0.388810 0.030704 0.166547 -0.251729 -0.062269 0.253014 0.029334 -0.028913
vec284 0.023236 -0.035544 0.038400 0.163045 -0.093831 0.062545 -0.017634 0.090981 -0.092794 -0.181956 ... -0.157039 -0.136451 -0.024965 0.169442 -0.039534 0.019582 0.119338 -0.008964 0.112580 -0.180220
vec285 0.004928 -0.095984 -0.175659 -0.046647 -0.018414 0.201379 -0.204361 0.065982 -0.375118 0.026617 ... 0.102106 -0.081220 -0.327520 0.035204 -0.217459 0.147503 0.262739 -0.405118 0.054221 0.060966
vec286 -0.059088 -0.217704 -0.220235 -0.135602 -0.149810 0.312414 -0.253891 0.179578 0.058300 0.047025 ... 0.118051 0.057925 -0.210279 -0.155900 0.074927 -0.086695 0.253000 -0.213195 0.057477 0.405567
vec287 -0.042835 -0.116410 -0.168371 -0.051483 -0.023383 0.135190 -0.131440 0.004425 0.242188 -0.069623 ... 0.135522 0.256998 0.002601 -0.059057 0.233511 -0.235458 0.211204 0.014788 0.000029 0.291241
vec288 0.008528 0.128002 0.073882 -0.052030 0.079790 -0.025959 -0.013930 -0.020813 0.019296 0.141945 ... 0.062339 0.116458 -0.341633 -0.180933 -0.077763 0.202168 -0.177380 -0.052322 0.026598 -0.101161
vec289 -0.012864 -0.011573 -0.063629 -0.080055 -0.032587 0.008152 0.008299 -0.002056 -0.056779 -0.027825 ... 0.177160 -0.015737 -0.032328 -0.040737 0.004178 0.088520 0.034649 -0.116996 0.058200 -0.139885
vec290 -0.006312 0.096276 0.025480 -0.086714 0.017673 0.004793 -0.014260 0.007676 0.140677 0.124256 ... 1.000000 0.086359 -0.124330 -0.143958 -0.038215 0.077831 0.060642 0.036517 -0.247817 -0.021947
vec291 -0.016650 0.030849 0.066575 0.102816 0.040843 -0.030210 0.010835 -0.032437 0.206046 -0.010269 ... 0.086359 1.000000 -0.025664 -0.018051 0.026688 -0.130943 -0.089177 -0.032861 0.049265 0.191543
vec292 0.010745 0.041608 0.140240 0.108107 -0.028029 -0.193280 0.219893 -0.051333 0.237824 -0.115197 ... -0.124330 -0.025664 1.000000 0.134685 -0.022947 -0.113328 0.063752 0.376372 -0.271742 -0.074341
vec293 0.003621 -0.014270 -0.035748 0.099261 -0.016988 0.041793 -0.035567 0.003716 -0.041626 -0.014984 ... -0.143958 -0.018051 0.134685 1.000000 0.182641 -0.309797 0.162131 0.306469 0.050644 -0.098392
vec294 -0.041179 -0.136374 -0.166247 -0.200150 0.041079 0.137701 -0.167636 -0.022217 0.147721 0.225464 ... -0.038215 0.026688 -0.022947 0.182641 1.000000 -0.459458 -0.014614 0.382778 0.181371 0.243914
vec295 0.020941 0.105767 0.148201 0.147153 0.122023 -0.087122 0.029034 -0.031866 -0.204304 -0.143607 ... 0.077831 -0.130943 -0.113328 -0.309797 -0.459458 1.000000 0.047613 -0.329496 -0.015858 -0.333320
vec296 -0.038949 -0.133743 -0.230496 -0.066937 -0.130083 0.385883 -0.342245 0.203050 0.059901 -0.194399 ... 0.060642 -0.089177 0.063752 0.162131 -0.014614 0.047613 1.000000 -0.011686 -0.221068 0.012556
vec297 0.010570 0.112584 0.089968 -0.091301 0.097845 -0.184340 0.144911 -0.150316 0.384844 -0.033445 ... 0.036517 -0.032861 0.376372 0.306469 0.382778 -0.329496 -0.011686 1.000000 -0.074942 -0.158611
vec298 0.011678 -0.015821 0.084351 0.087600 0.059839 -0.112153 0.088001 -0.054746 -0.171336 -0.064594 ... -0.247817 0.049265 -0.271742 0.050644 0.181371 -0.015858 -0.221068 -0.074942 1.000000 -0.062006
vec299 -0.031660 -0.159772 -0.194113 -0.150674 -0.086497 0.182514 -0.148827 0.067414 0.039900 0.169178 ... -0.021947 0.191543 -0.074341 -0.098392 0.243914 -0.333320 0.012556 -0.158611 -0.062006 1.000000

308 rows × 308 columns


In [37]:
raw_data=raw_data[fin_cols].copy()

In [38]:
raw_data.head()


Out[38]:
helpful num_sents num_words readability neg_senti pos_senti neu_senti comp_senti text_lemma vec0 ... vec290 vec291 vec292 vec293 vec294 vec295 vec296 vec297 vec298 vec299
0 0.0 0.693147 3.610918 6.742881 0.079 0.068 0.853 -0.1027 product arrive label peanut actually small siz... -0.019901 ... -0.178709 0.120293 0.048853 -0.028560 0.024294 -0.051074 -0.082868 -0.058978 0.058156 0.020952
1 0.0 1.386294 3.555348 6.734948 0.000 0.448 0.552 0.9468 great taffy great price wide assortment yummy ... -0.076091 ... -0.125921 0.026862 -0.011833 -0.023788 0.028657 -0.001059 -0.003236 -0.048324 -0.050874 0.113610
2 0.0 1.609438 4.521789 6.743588 0.029 0.163 0.809 0.8830 get wild hair taffy order pound bag taffy enjo... -0.048797 ... -0.154745 0.004021 0.004185 0.006071 -0.032341 0.030001 0.004792 -0.122627 -0.015319 0.046176
3 0.0 1.609438 4.143135 6.742527 0.034 0.273 0.693 0.9346 saltwater taffy great flavor soft chewy candy ... -0.009421 ... -0.185385 0.038134 0.014824 -0.012089 0.007642 -0.013590 0.038388 -0.117533 0.042929 0.137415
4 0.0 1.609438 3.526361 6.737915 0.000 0.480 0.520 0.9487 taffy good soft chewy flavor amazing definitel... -0.073490 ... -0.155703 0.041312 -0.121036 -0.063175 0.075995 -0.005276 0.051416 -0.136569 0.021066 0.123007

5 rows × 309 columns


In [39]:
raw_data.to_pickle('./clean_data/clean_data.pkl')
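
An optional sanity check that the pickle round-trips cleanly:

check = pd.read_pickle('./clean_data/clean_data.pkl')
assert check.shape == raw_data.shape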