In [1]:
%matplotlib inline
Code for the initial data cleaning and exploration done before modeling
Author: Jimmy Charité
Email: jimmy.charite@gmail.com
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import pysentiment
from textstat.textstat import textstat
from wordcloud import WordCloud
import nltk
from bs4 import BeautifulSoup
The default directory is the code subdirectory. Changing to the main repo directory above.
In [3]:
os.chdir("..")  # move from the code subdirectory up to the repo root
In [4]:
def pd_tab(df,col,sort_by='count',asc=False):
    """Tabulate a column: each distinct value with its count and share of all rows."""
    tab=df[col].value_counts(dropna=False).reset_index(name='count')
    tab.columns=[col,'count']
    tab['percent']=tab['count']/tab['count'].sum()
    tab.sort_values(by=sort_by,inplace=True,ascending=asc)
    return tab
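pd_tab returns one row per distinct value of the column, along with its count and its share of all rows. An illustrative call (using the raw_data frame loaded in the next cell):

pd_tab(raw_data, 'Score')  # -> one row per star rating, with columns Score, count, percent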
In [5]:
raw_data=pd.read_csv("./raw_data/Reviews.csv")
raw_data.head()
Out[5]:
In [6]:
raw_data.columns
Out[6]:
In [7]:
len(raw_data)
Out[7]:
Data Key
In [8]:
raw_data.Id.is_unique
Out[8]:
In [9]:
len(raw_data.ProductId.unique())
Out[9]:
In [10]:
len(raw_data.ProductId.unique())/len(raw_data)
Out[10]:
In [11]:
pd_tab(raw_data,'ProductId').head(10)
Out[11]:
In [12]:
pd_tab(raw_data,'ProductId').tail(10)
Out[12]:
In [13]:
len(raw_data.UserId.unique())
Out[13]:
In [14]:
len(raw_data.UserId.unique())/len(raw_data)
Out[14]:
In [15]:
pd_tab(raw_data,'UserId').head(10)
Out[15]:
In [16]:
pd_tab(raw_data,'UserId').tail(10)
Out[16]:
Ignoring these ID fields and moving on to the helpfulness counts.
In [17]:
raw_data.HelpfulnessNumerator.isnull().sum()
Out[17]:
In [18]:
np.sum(raw_data.HelpfulnessNumerator==0)
Out[18]:
In [19]:
np.sum(raw_data.HelpfulnessNumerator==0)/len(raw_data)
Out[19]:
At least 53% of reviews received zero helpful votes.
In [20]:
raw_data.HelpfulnessNumerator.describe()
Out[20]:
In [21]:
g=sns.distplot(raw_data.HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[21]:
Very skewed
In [22]:
g=sns.distplot(raw_data[raw_data.HelpfulnessNumerator>0].HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n(Non-Zero Counts)',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[22]:
In [23]:
g=sns.distplot(raw_data[raw_data.HelpfulnessNumerator<100].HelpfulnessNumerator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful\n(Counts Less than 100)',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[23]:
In [24]:
raw_data.HelpfulnessDenominator.isnull().sum()
Out[24]:
In [25]:
np.sum(raw_data.HelpfulnessDenominator==0)
Out[25]:
In [26]:
np.sum(raw_data.HelpfulnessDenominator==0)/len(raw_data)
Out[26]:
In [27]:
raw_data.HelpfulnessDenominator.describe()
Out[27]:
In [28]:
raw_data[raw_data.HelpfulnessDenominator>100].HelpfulnessDenominator.describe()
Out[28]:
In [29]:
g=sns.distplot(raw_data.HelpfulnessDenominator)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful or Unhelpful\n',fontsize=20)
g.set_xlabel('Counts',fontsize=15)
Out[29]:
In [30]:
len(raw_data[raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator])
Out[30]:
In [31]:
raw_data[raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator]
Out[31]:
In [32]:
raw_data=raw_data.loc[~(raw_data.HelpfulnessDenominator<raw_data.HelpfulnessNumerator)]  # drop rows where helpful votes exceed total votes
In [33]:
raw_data['Unhelpful']=raw_data.HelpfulnessDenominator-raw_data.HelpfulnessNumerator
In [34]:
g=sns.regplot(x="HelpfulnessNumerator", y="Unhelpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
              fit_reg=False)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Number Found Helpful vs Unhelpful\n',fontsize=20)
g.set_xlabel('No. Found Helpful',fontsize=15)
g.set_ylabel('No. Found Unhelpful',fontsize=15)
Out[34]:
In [35]:
raw_data['ppt_helpful']=raw_data.HelpfulnessNumerator/raw_data.HelpfulnessDenominator
raw_data.loc[raw_data.HelpfulnessDenominator==0,'ppt_helpful']=0  # treat 0/0 as 0% helpful
In [36]:
g=sns.distplot(raw_data.ppt_helpful)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Percent Helpful\n',fontsize=20)
g.set_xlabel('Percent',fontsize=15)
Out[36]:
In [37]:
probs=list(np.linspace(start=0,stop=1,num=20))
In [38]:
raw_data.ppt_helpful.describe(percentiles=probs)
Out[38]:
In [39]:
probs=list(np.linspace(start=0,stop=1,num=20))
for p in probs:
    ppt=np.sum(raw_data.ppt_helpful<p)/len(raw_data)
    print('Less than {}% Helpful: {}%'.format(round(p*100,2), round(ppt*100,2)))
In [40]:
for p in probs:
    ppt=np.sum(raw_data.ppt_helpful>=p)/len(raw_data)
    print('At Least {}% Helpful: {}%'.format(round(p*100,2), round(ppt*100,2)))
In [41]:
np.sum((raw_data.ppt_helpful>=0.5) & (raw_data.ppt_helpful<=.8))/len(raw_data)
Out[41]:
In [42]:
np.sum((raw_data.ppt_helpful>=0.7) & (raw_data.ppt_helpful<=.8))/len(raw_data)
Out[42]:
In [43]:
np.sum((raw_data.ppt_helpful>=0.8) & (raw_data.ppt_helpful<=.9))/len(raw_data)
Out[43]:
In [44]:
np.sum((raw_data.ppt_helpful>=0.7) & (raw_data.ppt_helpful<=.9))/len(raw_data)
Out[44]:
In [45]:
np.sum((raw_data.ppt_helpful>=0.9) & (raw_data.ppt_helpful<=1))/len(raw_data)
Out[45]:
Helpful reviews will probably be defined as those with at least 90% helpful votes.
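In code, that rule is a simple cutoff on ppt_helpful; a minimal sketch of the labeling applied later in the notebook (to the 2012 subset):

raw_data['helpful'] = (raw_data.ppt_helpful >= 0.9).astype(float)  # 1.0 if at least 90% of votes were helpful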
In [46]:
g=sns.regplot(x="HelpfulnessDenominator", y="ppt_helpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
              fit_reg=False)
g.axes.set_ylim(0,1)
g.axes.set_xlim(0,)
g.axes.set_title('Percent Helpful vs Total Found Helpful or Unhelpful\n',fontsize=20)
g.set_xlabel('No. Found Helpful or Unhelpful',fontsize=15)
g.set_ylabel('Percent Found Helpful',fontsize=15)
Out[46]:
Interesting pattern. Difficult to understand what this means.
In [47]:
raw_data.Score.isnull().sum()
Out[47]:
In [48]:
raw_data.Score.describe()
Out[48]:
In [49]:
pd_tab(raw_data,'Score',sort_by='Score')
Out[49]:
In [50]:
g=sns.lmplot(x="HelpfulnessDenominator", y="ppt_helpful", data=raw_data[raw_data.HelpfulnessDenominator<100],
             hue='Score', fit_reg=False)
For this analysis I will assume that the helpfulness prediction will be made without the product score.
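A minimal sketch of what that assumption means for the later modeling step, assuming a hypothetical candidate_features list (the actual feature selection happens in the modeling code, not in this notebook):

feature_cols = [c for c in candidate_features if c != 'Score']  # hypothetical: drop the star rating from the predictors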
In [51]:
raw_data.Time.head()
Out[51]:
In [52]:
raw_data['date_time']=pd.to_datetime(raw_data['Time'],unit='s')
raw_data['date']=raw_data['date_time'].dt.normalize()  # day-level date with the time-of-day dropped
In [53]:
raw_data.date_time.describe()
Out[53]:
In [54]:
raw_data.date.describe()
Out[54]:
Median Percent Helpful
In [55]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('median')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[55]:
Average Helpfulness
In [56]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('mean')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[56]:
Count Reviews
In [57]:
ts=raw_data[['date','ppt_helpful']].copy()
ts['ppt_helpful']=ts.groupby(['date']).ppt_helpful.transform('count')
ts.set_index(['date'],inplace=True)
ts.sort_index(inplace=True)
ts.plot()
Out[57]:
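The three daily series above repeat the same copy/transform/set_index pattern; an equivalent sketch using a single groupby on the same columns would be:

daily = raw_data.groupby('date')['ppt_helpful'].agg(['median', 'mean', 'count'])
daily['median'].plot()  # median percent helpful per day
daily['count'].plot()   # number of reviews per day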
In [58]:
ts.head()
Out[58]:
In [59]:
ts.tail()
Out[59]:
In [60]:
len(raw_data[raw_data.date>=pd.to_datetime('2010-01-01')])
Out[60]:
In [61]:
len(raw_data[raw_data.date>=pd.to_datetime('2012-01-01')])
Out[61]:
In [62]:
raw_data['year']=raw_data.date.dt.year
In [63]:
pd_tab(raw_data,'year',sort_by='year')
Out[63]:
In [64]:
tab=raw_data.groupby(['year']).ppt_helpful.mean().reset_index().sort_values(by='year')
In [65]:
tab
Out[65]:
The helpfulness scores are definitely non-stationary across years. I will just use the data from 2012.
In [66]:
raw_data_2=raw_data[(raw_data.year==2012)].copy()
In [67]:
raw_data_2['helpful']=(raw_data_2.ppt_helpful>=0.9).astype(float)
In [68]:
pd_tab(raw_data_2,'helpful')
Out[68]:
In [69]:
del raw_data
In [70]:
raw_data_2['Text'] = raw_data_2['Text'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())  # strip HTML markup from the review text
In [71]:
nlp=spacy.load('en')
In [72]:
raw_data_2['doc_id']=np.arange(len(raw_data_2),dtype=float)  # sequential 0-based id (float, to match the merged vector frame below)
raw_data_2['doc_id'].head()
Out[72]:
In [73]:
# parse every review with spaCy, keeping a parallel list of positional ids
parse_doc_list=[]
parse_doc_list_id=[]
i=0
for doc in nlp.pipe(raw_data_2.Text.astype(str),batch_size=10000,n_threads=4):
    parse_doc_list.append(doc)
    parse_doc_list_id.append(i)
    i=i+1
In [74]:
raw_data_2['parsed_text'] = parse_doc_list
In [108]:
type(parse_doc_list[0])
Out[108]:
In [76]:
doc_vecs = np.row_stack([doc.vector for doc in parse_doc_list])  # one spaCy document vector per review
doc_vecs = np.column_stack((doc_vecs,parse_doc_list_id))  # append the doc id as the last column
In [77]:
doc_vecs.shape
Out[77]:
In [78]:
len(raw_data_2)
Out[78]:
In [79]:
doc_vecs=pd.DataFrame(doc_vecs)
In [80]:
cols=['vec'+str(s) for s in doc_vecs.columns]
cols[-1]='doc_id'  # the last column is the id, not a vector component
In [81]:
doc_vecs.columns=cols
In [82]:
doc_vecs.to_pickle('./clean_data/doc_vecs.pkl')
In [83]:
raw_data_2=pd.merge(raw_data_2,doc_vecs,how='left',on=['doc_id'])
In [84]:
def sent_count(X):
    return len([x for x in X.sents])

def word_count(X):
    return len(X)
In [85]:
raw_data_2['num_sents'] = raw_data_2['parsed_text'].apply(sent_count)
In [86]:
raw_data_2['num_words'] = raw_data_2['parsed_text'].apply(word_count)
In [87]:
raw_data_2['readability'] = raw_data_2['Text'].apply(textstat.automated_readability_index)
In [90]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires the nltk 'vader_lexicon' resource to be downloaded
In [91]:
sent_analyzer = SentimentIntensityAnalyzer()
In [92]:
raw_data_2['sentiment_dict'] = raw_data_2['Text'].apply(sent_analyzer.polarity_scores)
In [93]:
raw_data_2['neg_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['neg'])
In [94]:
raw_data_2['pos_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['pos'])
In [95]:
raw_data_2['neu_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['neu'])
In [96]:
raw_data_2['comp_senti'] = raw_data_2['sentiment_dict'].apply(lambda x: x['compound'])
In [98]:
def return_lemma_text(text):
    '''
    Return space-separated lemmas, excluding punctuation, whitespace, URLs,
    numbers, emails, stop words, and proper nouns
    '''
    return ' '.join([t.lemma_.lower() for t in text if (t.is_punct==False) &
                     (t.is_space==False) &
                     (t.like_url==False) &
                     (t.like_num==False) &
                     (t.like_email==False) &
                     (t.is_stop==False) &
                     (t.pos_!='PROPN')])
In [99]:
raw_data_2['text_lemma'] = raw_data_2['parsed_text'].apply(return_lemma_text)
In [100]:
raw_data_2['Text'].head()
Out[100]:
In [101]:
raw_data_2['text_lemma'].head()
Out[101]:
In [109]:
del raw_data_2['parsed_text']  # drop the spaCy Doc column before pickling
In [110]:
raw_data_2.to_pickle('./clean_data/raw_data_post_parse.pkl')
All Reviews
In [104]:
text=' '.join(raw_data_2.text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/all_reviews_word_cloud.png', bbox_inches='tight')
plt.show()
Helpful Reviews
In [105]:
text=' '.join(raw_data_2[raw_data_2.helpful==1].text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/helpful_reviews_word_cloud.png', bbox_inches='tight')
plt.show()
Unhelpful Reviews
In [106]:
text=' '.join(raw_data_2[raw_data_2.helpful==0].text_lemma)
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('./plots/unhelpful_reviews_word_cloud.png', bbox_inches='tight')
plt.show()