In [1]:
%matplotlib inline
Code for the initial data cleaning and exploration done before modeling
Author: Jimmy Charité
Email: jimmy.charite@gmail.com
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import pysentiment
from textstat.textstat import textstat
from wordcloud import WordCloud
import nltk
from bs4 import BeautifulSoup
The default directory is the code subdirectory. Changing to the main repo directory above.
In [3]:
os.chdir("..")  # move from the code subdirectory up to the repo root
In [4]:
def pd_tab(df,col,sort_by='count',asc=False):
    """Tabulate a column: each distinct value with its count and share of all rows."""
    tab=df[col].value_counts(dropna=False).reset_index(name='count')
    tab.columns=[col,'count']
    tab['percent']=tab['count']/tab['count'].sum()
    tab.sort_values(by=sort_by,inplace=True,ascending=asc)
    return tab
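pd_tab returns one row per distinct value of the column, along with its count and its share of all rows. An illustrative call (using the raw_data frame loaded in the next cell):

pd_tab(raw_data, 'Score')  # -> one row per star rating, with columns Score, count, percent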
In [5]:
raw_data=pd.read_csv("./raw_data/Reviews.csv")
raw_data.head()
Out[5]:
In [6]:
raw_data.columns
Out[6]:
In [7]:
len(raw_data)
Out[7]:
Data Key
In [8]:
raw_data.Id.is_unique
Out[8]:
In [9]:
len(raw_data.ProductId.unique())
Out[9]:
In [10]:
len(raw_data.ProductId.unique())/len(raw_data)
Out[10]:
In [11]:
pd_tab(raw_data,'ProductId').head(10)
Out[11]:
In [12]:
pd_tab(raw_data,'ProductId').tail(10)
Out[12]:
In [13]:
len(raw_data.UserId.unique())
Out[13]:
In [14]:
len(raw_data.UserId.unique())/len(raw_data)
Out[14]:
In [15]:
pd_tab(raw_data,'UserId').head(10)
Out[15]:
In [16]:
pd_tab(raw_data,'UserId').tail(10)
Out[16]:
Ignoring these ID fields and moving on to the helpfulness counts.
In [17]:
raw_data.HelpfulnessNumerator.isnull().sum()
Out[17]:
In [18]:
np.sum(raw_data.HelpfulnessNumerator==0)
Out[18]:
In [19]:
np.sum(raw_data.HelpfulnessNumerator==0)/len(raw_data)
Out[19]:
At least 53% of reviews received zero helpful votes.
In [20]:
raw_data.HelpfulnessNumerator.describe()
Out[20]:
In [21]:
g=sns.distplot(raw_data.HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[21]:
Very skewed
In [22]:
g=sns.distplot(raw_data[raw_data.HelpfulnessNumerator>0].HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n(Non-Zero Counts)',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[22]:
In [23]:
g=sns.distplot(raw_data[raw_data.HelpfulnessNumerator<100].HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n(Counts Less than 100)',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[23]:
In [24]:
raw_data.HelpfulnessDenominator.isnull().sum()
Out[24]:
In [25]:
np.sum(raw_data.HelpfulnessDenominator==0)
Out[25]:
In [26]:
np.sum(raw_data.HelpfulnessDenominator==0)/len(raw_data)
Out[26]:
In [27]:
raw_data.HelpfulnessDenominator.describe()
Out[27]:
In [28]:
raw_data[raw_data.HelpfulnessDenominator>100].HelpfulnessDenominator.describe()
Out[28]:
In [29]:
g=sns.distplot(raw_data.HelpfulnessDenominator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful or Unhelpful\n',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[29]:
In [30]:
len(raw_data[raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator])
Out[30]:
In [31]:
raw_data[raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator]
Out[31]:
In [32]:
raw_data=raw_data.loc[~(raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator)]  # drop rows where helpful votes exceed total votes
In [33]:
raw_data['Unhelpful']=raw_data.HelpfulnessDenominator-raw_data.HelpfulnessNumerator
In [34]:
g=sns.regplot(x="HelpfulnessNumerator", y="Unhelpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
              fit_reg=False)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful vs Unhelpful\n',fontsize=20)
g.set_xlabel('No. Found Helpful',fontsize=15)
g.set_ylabel('No. Found Unhelpful',fontsize=15)
Out[34]:
In [35]:
raw_data['ppt_helpful']=raw_data.HelpfulnessNumerator/raw_data.HelpfulnessDenominator
raw_data.loc[raw_data.HelpfulnessDenominator==0,'ppt_helpful']=0  # treat 0/0 as 0% helpful
In [36]:
g=sns.distplot(raw_data.ppt_helpful)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Percent Helpful\n',fontsize=20)
g.set_xlabel('Percent',fontsize=15)
Out[36]:
In [37]:
probs=list(np.linspace(start=0,stop=1,num=20))
In [38]:
raw_data.ppt_helpful.describe(percentiles=probs)
Out[38]:
In [39]:
probs=list(np.linspace(start=0,stop=1,num=20))
for p in probs:
    ppt=np.sum(raw_data.ppt_helpful<p)/len(raw_data)
    print('Less than {}% Helpful: {}%'.format(round(p*100,2), round(ppt*100,2)))
In [40]:
for p in probs:
    ppt=np.sum(raw_data.ppt_helpful>=p)/len(raw_data)
    print('At Least {}% Helpful: {}%'.format(round(p*100,2), round(ppt*100,2)))
In [41]:
np.sum((raw_data.ppt_helpful>=0.5) & (raw_data.ppt_helpful<=.8))/len(raw_data)
Out[41]:
In [42]:
np.sum((raw_data.ppt_helpful>=0.7) & (raw_data.ppt_helpful<=.8))/len(raw_data)
Out[42]:
In [43]:
np.sum((raw_data.ppt_helpful>=0.8) & (raw_data.ppt_helpful<=.9))/len(raw_data)
Out[43]:
In [44]:
np.sum((raw_data.ppt_helpful>=0.7) & (raw_data.ppt_helpful<=.9))/len(raw_data)
Out[44]:
In [45]:
np.sum((raw_data.ppt_helpful>=0.9) & (raw_data.ppt_helpful<=1))/len(raw_data)
Out[45]:
Helpful reviews will probably be defined as those with at least 90% helpful votes.
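In code, that rule is a simple cutoff on ppt_helpful; a minimal sketch of the labeling applied later in the notebook (to the 2012 subset):

raw_data['helpful'] = (raw_data.ppt_helpful >= 0.9).astype(float)  # 1.0 if at least 90% of votes were helpful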
In [46]:
g=sns.regplot(x="HelpfulnessDenominator", y="ppt_helpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
              fit_reg=False)
g.axes.set_ylim(0,1)
g.axes.set_xlim(0,)
g.axes.set_title('Percent Helpful vs Total Found Helpful or Unhelpful\n',fontsize=20)
g.set_xlabel('No. Found Helpful or Unhelpful',fontsize=15)
g.set_ylabel('Percent Found Helpful',fontsize=15)
Out[46]:
Interesting pattern. Difficult to understand what this means.
In [47]:
raw_data.Score.isnull().sum()
Out[47]:
In [48]:
raw_data.Score.describe()
Out[48]:
In [49]:
pd_tab(raw_data,'Score',sort_by='Score')
Out[49]:
In [50]:
g=sns.lmplot(x="HelpfulnessDenominator", y="ppt_helpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
             hue='Score', fit_reg=False)
For this analysis I will assume that the helpfulness prediction will be made without the product score.
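A minimal sketch of what that assumption means for the later modeling step, assuming a hypothetical candidate_features list (the actual feature selection happens in the modeling code, not in this notebook):

feature_cols = [c for c in candidate_features if c != 'Score']  # hypothetical: drop the star rating from the predictors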
In [51]:
raw_data.Time.head()
Out[51]:
In [52]:
raw_data['date_time']=pd.to_datetime(raw_data['Time'],unit='s')
raw_data['date']=raw_data['date_time'].dt.normalize()  # day-level date with the time-of-day dropped
In [53]:
raw_data.date_time.describe()
Out[53]:
In [54]:
raw_data.date.describe()
Out[54]:
Median Percent Helpful
In [55]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('median')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[55]:
Average Helpfulness
In [56]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('mean')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[56]:
Count Reviews
In [57]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('count')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[57]:
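The three daily series above repeat the same copy/transform/set_index pattern; an equivalent sketch using a single groupby on the same columns would be:

daily = raw_data.groupby('date')['ppt_helpful'].agg(['median', 'mean', 'count'])
daily['median'].plot()  # median percent helpful per day
daily['count'].plot()   # number of reviews per day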
In [58]:
ts.head()
Out[58]:
In [59]:
ts.tail()
Out[59]:
In [60]:
len(raw_data[raw_data.date>=pd.to_datetime('2010-01-01')])
Out[60]:
In [61]:
len(raw_data[raw_data.date>=pd.to_datetime('2012-01-01')])
Out[61]:
In [62]:
raw_data['year']=raw_data.date.dt.year
In [63]:
pd_tab(raw_data,'year',sort_by='year')
Out[63]:
In [64]:
tab=raw_data.groupby(['year']).ppt_helpful.mean().reset_index().sort_values(by='year')
In [65]:
tab
Out[65]:
The helpfulness scores are definitely non-stationary across years. I will just use the data from 2012.
In [66]:
raw_data_2=raw_data[(raw_data.year==2012)].copy()
In [67]:
raw_data_2['helpful']=(raw_data_2.ppt_helpful>=0.9).astype(float)
In [68]:
pd_tab(raw_data_2,'helpful')
Out[68]:
In [69]:
del raw_data
In [70]:
raw_data_2['Text'] = raw_data_2['Text'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())  # strip HTML markup from the review text
In [71]:
nlp=spacy.load('en')
In [72]:
raw_data_2['doc_id']=np.arange(len(raw_data_2),dtype=float)  # sequential 0-based id (float, to match the merged vector frame below)
raw_data_2['doc_id'].head()
Out[72]:
In [73]:
# parse every review with spaCy, keeping a parallel list of positional ids
parse_doc_list=[]
parse_doc_list_id=[]
i=0
for doc in nlp.pipe(raw_data_2.Text.astype(str),batch_size=10000,n_threads=4):
    parse_doc_list.append(doc)
    parse_doc_list_id.append(i)
    i=i+1
In [74]:
raw_data_2['parsed_text'] = parse_doc_list
In [108]:
type(parse_doc_list[0])
Out[108]:
In [76]:
doc_vecs = np.row_stack([doc.vector for doc in parse_doc_list])  # one spaCy document vector per review
doc_vecs = np.column_stack((doc_vecs,parse_doc_list_id))  # append the doc id as the last column
In [77]:
doc_vecs.shape
Out[77]:
In [78]:
len(raw_data_2)
Out[78]:
In [79]:
doc_vecs=pd.DataFrame(doc_vecs)
In [80]:
cols=['vec'+str(s) for s in doc_vecs.columns]
cols[-1]='doc_id'  # the last column is the id, not a vector component
In [81]:
doc_vecs.columns=cols
In [82]:
doc_vecs.to_pickle('./clean_data/doc_vecs.pkl')
In [83]:
raw_data_2=pd.merge(raw_data_2,doc_vecs,how='left',on=['doc_id'])
In [84]:
def sent_count(X):
    return len([x for x in X.sents])

def word_count(X):
    return len(X)
In [85]:
raw_data_2['num_sents'] = raw_data_2['parsed_text'].apply(sent_count)
In [86]:
raw_data_2['num_words'] = raw_data_2['parsed_text'].apply(word_count)
In [87]:
raw_data_2['readability'] = raw_data_2['Text'].apply(textstat.automated_readability_index)
In [90]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires the nltk 'vader_lexicon' resource to be downloaded
In [91]:
sent_analyzer = SentimentIntensityAnalyzer()
In [92]:
raw_data_2['sentiment_dict'] = raw_data_2['Text'].apply(sent_analyzer.polarity_scores)
In [93]:
raw_data_2['neg_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['neg'])
In [94]:
raw_data_2['pos_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['pos'])
In [95]:
raw_data_2['neu_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['neu'])
In [96]:
raw_data_2['comp_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['compound'])
In [98]:
def return_lemma_text(text):
    '''
    Return space-separated lemmas, excluding punctuation, whitespace, URLs,
    numbers, emails, stop words, and proper nouns
    '''
    return ' '.join([t.lemma_.lower() for t in text if (t.is_punct==False) &
                     (t.is_space==False) &
                     (t.like_url==False) &
                     (t.like_num==False) &
                     (t.like_email==False) &
                     (t.is_stop==False) &
                     (t.pos_!='PROPN')])
In [99]:
raw_data_2['text_lemma'] = raw_data_2['parsed_text'].apply(return_lemma_text)
In [100]:
raw_data_2['Text'].head()
Out[100]:
In [101]:
raw_data_2['text_lemma'].head()
Out[101]:
In [109]:
del raw_data_2['parsed_text']  # drop the spaCy Doc column before pickling
In [110]:
raw_data_2.to_pickle('./clean_data/raw_data_post_parse.pkl')
All Reviews
In [104]:
text=' '.join(raw_data_2.text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/all_reviews_word_cloud.png', bbox_inches='tight')
plt.show()
Helpful Reviews
In [105]:
text=' '.join(raw_data_2[raw_data_2.helpful==1].text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/helpful_reviews_word_cloud.png', bbox_inches='tight')
plt.show()
Unhelpful Reviews
In [106]:
text=' '.join(raw_data_2[raw_data_2.helpful==0].text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/unhelpful_reviews_word_cloud.png', bbox_inches='tight')
plt.show()