In [1]:
import numpy as np
import pandas as pd
import gzip
import json
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', None)
In [2]:
# Some functions to handle files
def parse(path):
    # Yield one review dict per line; json.loads is safer than eval here
    with open(path) as data_file:
        for d in data_file:
            yield json.loads(d)

def getDF(path):
    # Build a DataFrame from the parsed review dicts
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
Let's load the data into a Pandas DataFrame. The dataset can be downloaded from http://jmcauley.ucsd.edu/data/amazon/; I have uncompressed it into my data directory.
In [3]:
%%time
#Load the dataset
df = getDF('data/Electronics_5_sample_400k.json')
Let's take a quick look at the data:
In [4]:
df.head()
Out[4]:
In [5]:
df.shape
Out[5]:
Let's convert the unixReviewTime to date type.
In [6]:
# Transform the Unix timestamp column to a datetime
df['date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
Now let's see if there are any null values in the data.
In [7]:
# Identify missing values
null_data = df[df.isnull().any(axis=1)]
print('There are {} rows with missing values'.format(len(null_data.index)))
In [8]:
[col for col in null_data.columns if df[col].isnull().any()]
Out[8]:
Since the only column with null values is reviewerName, and we are interested in the review text, we will keep these rows.
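If you would rather not carry nulls around, you could fill the missing names instead; a minimal sketch (the 'Unknown' placeholder is an arbitrary choice, not part of the original analysis):

# 'Unknown' is an arbitrary placeholder for missing reviewer names
df['reviewerName'] = df['reviewerName'].fillna('Unknown')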
Let's look at how long the reviews are.
In [9]:
# Review length distribution
df['reviewLength'] = df['reviewText'].apply(len)
df['reviewWordCount'] = df['reviewText'].map(lambda x: len(x.split()))
# Reviews over 1000 characters are quite rare, so we exclude them from the plot
ax = df[df['reviewLength'] < 1000]['reviewLength'].plot(kind='hist')
ax.set_xlabel("Review Length")
ax.set_ylabel("Frequency")
plt.title('Review Length Distribution')
plt.show()
In [10]:
ax = df[df['reviewWordCount'] < 600]['reviewWordCount'].plot(kind='hist')
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.title('Word Count Distribution')
plt.show()
Let's summarize the overall scores.
In [11]:
df['overall'].describe()
Out[11]:
In [12]:
ax = df['overall'].value_counts().plot.bar()
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.title('Rating Distribution')
plt.show()
Let's see if there is a relation between the number of reviews and the score.
In [13]:
g1 = df.groupby(["asin", "overall"]).size().reset_index(name='count')
g1 = g1.sort_values(by=['count'], ascending=[False])
# Top 10 product/rating pairs with the most reviews
g1.head(10)
Out[13]:
In [14]:
g1.tail(10)
Out[14]:
In [15]:
ax = g1[g1['count'] < 30].boxplot(column='count', by='overall')
ax.set_xlabel("Overall rating")
ax.set_ylabel("Counts")
plt.title('')
plt.show()
There seems to be a relation between high scores and the number of reviews.
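To put a number on that impression, here is a quick sketch (it assumes scipy is installed; g1 is the per-product count frame built above):

# Hedged check: rank correlation between rating and number of reviews
from scipy.stats import spearmanr
rho, p = spearmanr(g1['overall'], g1['count'])
print('Spearman rho = {:.3f} (p = {:.3g})'.format(rho, p))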
Let's use the VADER lexicon to do some sentiment analysis.
In [16]:
#import nltk
#nltk.download('vader_lexicon')
In [18]:
%%time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
SIA = SentimentIntensityAnalyzer()
# Compound score: positive >= 0.5, negative <= -0.5, neutral in between
def vader_scores(text):
    # Compute the polarity scores once per review instead of three times
    scores = SIA.polarity_scores(text)
    return pd.Series({
        'vader_compound': scores['compound'],
        'vader_pos': scores['pos'],
        'vader_neg': scores['neg']
    })
df = df.merge(df.reviewText.apply(vader_scores), left_index=True, right_index=True)
In [19]:
ax = df.boxplot(column=['vader_compound'], by='overall')
ax.set_xlabel("Overall rating")
ax.set_ylabel("Compound Score")
plt.title('')
plt.show()
In [20]:
ax = df.boxplot(column=['vader_pos', 'vader_neg'], by='overall')
plt.show()
Let's label our dataset
In [21]:
def get_sentiment(row):
    if row['vader_compound'] >= 0.5 or row['overall'] > 3:
        return 'Positive'
    elif row['vader_compound'] <= -0.5 or row['overall'] < 3:
        return 'Negative'
    else:
        return 'Neutral'
df['sentiment'] = df.apply(get_sentiment, axis=1)
In [28]:
df[['overall', 'vader_compound', 'reviewText', 'sentiment']].head(30)
Out[28]:
Let's visualize the most common words. If you use Anaconda, install the wordcloud package with conda install -c conda-forge wordcloud=1.2.1.
In [22]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
# Transform to single string
positive_reviews_str = df[df['sentiment'] == 'Positive'].reviewText.str.cat()
# Create wordclouds
wordcloud_positive = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    scale=3,
    random_state=1  # chosen at random by flipping a coin; it was heads
).generate(positive_reviews_str)
fig = plt.figure(figsize=(30,10))
ax1 = fig.add_subplot(211)
ax1.imshow(wordcloud_positive, interpolation='bilinear')
ax1.axis("off")
ax1.set_title('Reviews with Positive Scores', fontsize=20)
plt.show()
In [23]:
negative_reviews_str = df[df['sentiment'] == 'Negative'].reviewText.str.cat()
wordcloud_negative = WordCloud(
    background_color='black',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    scale=3,
    random_state=1  # chosen at random by flipping a coin; it was heads
).generate(negative_reviews_str)
fig = plt.figure(figsize=(30,10))
ax1 = fig.add_subplot(211)
ax1.imshow(wordcloud_negative, interpolation='bilinear')
ax1.axis("off")
ax1.set_title('Reviews with Negative Scores', fontsize=20)
plt.show()
Overall Sentiment
In [51]:
df.groupby(['sentiment']).size()
Out[51]:
In [24]:
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction, ensemble, metrics
from sklearn.metrics import confusion_matrix
Let's focus only on the positive and negative reviews.
In [25]:
df = df[df['sentiment'] != 'Neutral']
df = df.reset_index(drop=True)
For our model, the sentiment column needs to be transformed into a binary column.
In [26]:
df['label'] = df['sentiment'].map(lambda x: 1 if x == "Positive" else 0)
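A quick sanity check of the class balance before splitting (value_counts is standard pandas):

# Share of positive vs. negative labels
df['label'].value_counts(normalize=True)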
In [27]:
train, test = train_test_split(df, test_size=0.2)
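Since positives heavily outnumber negatives here, a stratified split would keep the class ratio identical in both sets; a minimal sketch, assuming the label column built above (random_state is an arbitrary choice for reproducibility):

# Stratified alternative to the plain split above
train, test = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=2016)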
In [36]:
%%time
vectorizer = feature_extraction.text.CountVectorizer(analyzer='word', stop_words='english', max_features=1000)
vectorizer.fit(train.reviewText)
x_train = vectorizer.transform(train.reviewText)
x_test = vectorizer.transform(test.reviewText)
y_train = train.label
y_test = test.label
prediction = dict()
In [32]:
def print_confusion(y, y_hat):
    # Rows are the true labels, columns the predicted labels
    confusion = pd.crosstab(y, y_hat, rownames=['True'], colnames=['Predicted'], margins=True)
    print(confusion)
In [33]:
def print_score(model, x, y):
    print('Model Score: {:2.4}%'.format(model.score(x, y) * 100))
In [37]:
%%time
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(x_train, y_train)
prediction['Multinomial'] = model.predict(x_test)
print_confusion(y_test, prediction['Multinomial'])
print_score(model, x_train, y_train)
In [38]:
%%time
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB().fit(x_train, y_train)
prediction['Bernoulli'] = model.predict(x_test)
print_confusion(y_test, prediction['Bernoulli'])
print_score(model, x_train, y_train)
In [39]:
%%time
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg_result = logreg.fit(x_train, y_train)
prediction['Logistic'] = logreg.predict(x_test)
print_confusion(y_test, prediction['Logistic'])
print_score(logreg, x_train, y_train)
In [40]:
%%time
from sklearn import svm
svc_model = svm.LinearSVC(penalty='l1', dual=False, C=1.0, random_state=2016)
svc_model.fit(x_train, y_train)
prediction['SVM'] = svc_model.predict(x_test)
print_confusion(y_test, prediction['SVM'])
print_score(svc_model, x_train, y_train)
In [41]:
from sklearn.ensemble import RandomForestClassifier
In [42]:
%%time
rfc_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=2016)
rfc_model.fit(x_train, y_train)
prediction['Random Forest'] = rfc_model.predict(x_test)
In [43]:
%%time
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rfc_model, x_train, y_train, scoring='roc_auc')
print("CV AUC {}, Average AUC {}".format(scores, scores.mean()))
print_confusion(y_test, prediction['Random Forest'])
print_score(rfc_model, x_train, y_train)
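Note that print_score reports accuracy on the training set; to compare the models on the held-out data, a short sketch that reuses the stored test-set predictions (accuracy_score is standard scikit-learn):

from sklearn.metrics import accuracy_score
# Test-set accuracy for every model trained above
for name, y_hat in prediction.items():
    print('{}: test accuracy {:.4f}'.format(name, accuracy_score(y_test, y_hat)))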
In [44]:
from sklearn.metrics import roc_curve, auc
def plot_roc_auc(y, prediction, title_text):
    cmp = 0
    # 'o' is a marker, not a color, so use 'b' (blue) instead
    colors = ['g', 'b', 'y', 'k', 'm']
    for model, predicted in prediction.items():
        false_positive_rate, true_positive_rate, thresholds = roc_curve(y, predicted)
        roc_auc = auc(false_positive_rate, true_positive_rate)
        plt.plot(false_positive_rate, true_positive_rate, colors[cmp], label='%s: AUC %0.2f' % (model, roc_auc))
        cmp += 1
    plt.title(title_text)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
In [45]:
plot_roc_auc(y_test, prediction, 'Classifier comparison with ROC')
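Since these curves are built from hard 0/1 predictions, each one is a single elbow; class probabilities give the full curve. A sketch for the logistic regression (predict_proba is standard scikit-learn):

# ROC from predicted probabilities rather than hard labels
probs = logreg.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, probs)
print('Logistic regression AUC from probabilities: {:.3f}'.format(auc(fpr, tpr)))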