In [7]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB
import json
import seaborn as sns
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
Import & Analyze Data
In [10]:
#Import data from the JSON file (one review per line) and build a list
data = []
with open('/home/borjaregueral/Digital_Music_5.json') as f:
    for line in f:
        data.append(json.loads(line))
#Create a dataframe with the columns that are interesting for this exercise
#Columns left out: 'helpful', 'reviewTime', 'reviewerID','reviewerName'
names = ["overall", "reviewText"]
amazonraw = pd.DataFrame(data, columns=names)
amazonraw['overall'] = amazonraw['overall'].astype(int)
amazonraw.head()
Out[10]:
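As an aside, the same two columns could be loaded in one step with pandas' line-delimited JSON reader; a minimal sketch, assuming the same file path as above and that every line is a valid JSON record:
#Alternative sketch: pandas can parse line-delimited JSON directly
amazonraw_alt = pd.read_json('/home/borjaregueral/Digital_Music_5.json', lines=True)
amazonraw_alt = amazonraw_alt[["overall", "reviewText"]]
amazonraw_alt['overall'] = amazonraw_alt['overall'].astype(int)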
In [11]:
#Analyse the dataset: types, length of the dataframe and NaN
amazonraw.info()
amazonraw.dtypes
Out[11]:
Build Sentiment Scores and Categories
In [12]:
amazonraw.overall.describe()
Out[12]:
In [13]:
#Change the overall rating into a categorical variable
#Ratings equal to or lower than 3 are treated as negative because the mean rating is 4.25.
#The hypothesis is that, although a 3 could be read as positive in absolute terms, it is negative relative to this skewed scale.
amazonraw.loc[amazonraw['overall'] <= 3, 'Sentiment'] = 0
amazonraw.loc[amazonraw['overall'] >= 4, 'Sentiment'] = 1
amazonraw.loc[amazonraw['Sentiment'] == 0, 'Category'] ='Negative'
amazonraw.loc[amazonraw['Sentiment'] == 1, 'Category'] = 'Positive'
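The same mapping can be written as a single vectorized assignment; a short equivalent sketch (note that np.where produces integer labels, whereas the loc assignments above create floats):
#Equivalent sketch: 1 for ratings of 4-5, 0 for ratings of 1-3
amazonraw['Sentiment'] = np.where(amazonraw['overall'] >= 4, 1, 0)
amazonraw['Category'] = np.where(amazonraw['Sentiment'] == 1, 'Positive', 'Negative')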
In [14]:
#Count each category and its relative frequency
a = amazonraw['Category'].value_counts(normalize=True)
b = amazonraw['Category'].value_counts(sort=False)
print('Number of occurrences:\n', b)
print('\n')
print('Frequency of each value:\n', a)
In [15]:
#Downsample majority class (due to computational restrictions we downsample the majority instead of upsampling the minority)
# Separate majority and minority classes
amazon_majority = amazonraw[amazonraw.Sentiment == 1]
amazon_minority = amazonraw[amazonraw.Sentiment == 0]
# Downsample majority class
amazon_majority_downsampled = resample(amazon_majority, replace=False, n_samples=12590, random_state=123)
# Combine minority class with downsampled majority class
amazon = pd.concat([amazon_majority_downsampled, amazon_minority])
# Display new class counts
amazon.Category.value_counts()
Out[15]:
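The n_samples=12590 above matches the size of the minority class; a hedged sketch of the same downsampling step with that number derived from the data rather than hard-coded (amazon_balanced is an illustrative name):
#Sketch: derive the target size from the minority class instead of hard-coding it
n_minority = len(amazon_minority)
amazon_majority_downsampled = resample(amazon_majority, replace=False,
                                       n_samples=n_minority, random_state=123)
amazon_balanced = pd.concat([amazon_majority_downsampled, amazon_minority])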
In [16]:
#Graphical representation of the positive and negative reviews
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
sns.set(style="white")
ax = sns.countplot(x="overall", data=amazonraw)
plt.title('Amazon Ratings')
plt.subplot(1, 2, 2)
sns.set(style="white")
ax = sns.countplot(x="Category", data=amazon)
plt.title('Categories in the downsampled dataset')
Out[16]:
In [17]:
#Create new dataframe that has the Categories, Overall scores, Sentiment and ReviewText
names = ['Category',"overall",'Sentiment', "reviewText"]
amazon1 = pd.DataFrame(amazon, columns=names)
amazon1.head()
Out[17]:
In [18]:
#Rows are reshuffled (frac=1 keeps the whole downsampled dataset; a smaller fraction could be used to reduce the computing effort)
amazon2 = amazon1.sample(frac=1, random_state=7)
In [19]:
#Form the predictor (review text) and predicted (sentiment) variables
X = amazon2['reviewText']
y = amazon2['Sentiment']
#Split the data set into train and test 70/30
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=135)
#KFold for cross validation analysis
kf = KFold(5)
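Because the classes were balanced by downsampling, a plain random split is acceptable here; a small sketch of a stratified variant (same 70/30 proportions, illustrative variable names) that would keep the class ratio identical in both partitions:
#Sketch: stratified 70/30 split keeps the positive/negative ratio equal in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=135, stratify=y)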
In [20]:
#The analysis starts with a bag of words in which common English stop words are removed
vect = CountVectorizer(analyzer = 'word', stop_words='english').fit(X_train)
X_trainvec = vect.transform(X_train)
X_testvec = vect.transform(X_test)
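To see what the vectorizer produces, a single review can be transformed and its sparse row inspected; a short illustrative sketch using vect and X_train from the cell above:
#Sketch: the bag-of-words representation of one training review
sample_vec = vect.transform(X_train.iloc[:1])   #1 x vocabulary_size sparse row
print(sample_vec.shape, 'non-zero terms:', sample_vec.nnz)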
In [13]:
#Count the English stop words and look at examples of the words that are removed
print("Number of stop words:", len(ENGLISH_STOP_WORDS), "\n")
print("Examples: ", list(ENGLISH_STOP_WORDS)[::10])
In [14]:
#Take a look at the features identified by bag of words
features_names = vect.get_feature_names()
print(len(features_names))
print("\n")
# print first 20 features
print(features_names[:20])
print("\n")
# print last 20 features
print(features_names[-20:])
In [15]:
#Size of the X_trainvector sparse matrix
print(X_trainvec.shape)
X_trainvec
Out[15]:
In [16]:
#Check the size of the y_train vector to avoid problems when running the logistic regression model
y_train.shape
Out[16]:
Bernoulli
In [17]:
# Initialize and fit the model.
l3 = BernoulliNB()
l3.fit(X_trainvec, y_train)
# Predict on training set
predtrain_y = l3.predict(X_trainvec)
In [18]:
#Predict on the test set with the model already fitted on the training data
predtest_y = l3.predict(X_testvec)
In [19]:
#Evaluation of the model (testing)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtest_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
print((
'Bernoulli accuracy: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(cross_val_score(l3,X_testvec,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
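The two error rates printed above can also be read directly from the confusion matrix; a short equivalent sketch using the confusion variable computed in this cell:
#Sketch: type I (false positive) and type II (false negative) rates from the confusion matrix
tn, fp, fn, tp = confusion.ravel()
print('Type I :', fp / confusion.sum())
print('Type II:', fn / confusion.sum())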
Logistic Model
In [20]:
# Initialize and fit the model.
lr = LogisticRegression()
lr.fit(X_trainvec, y_train)
Out[20]:
In [21]:
#Once the model has been trained, use it to predict on the test set
predtest_y = lr.predict(X_testvec)
In [22]:
#Evaluate model (test set)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtest_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
print((
'Logistic Regression accuracy: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(cross_val_score(lr,X_testvec,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
TFIDF
In [23]:
vect2 = TfidfVectorizer(min_df=20, analyzer='word', stop_words='english',
                        ngram_range=(1, 3)).fit(X_train)
X_train_vectorized = vect2.transform(X_train)
X_test_vectorized = vect2.transform(X_test)
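As a sanity check on the new features, the terms with the largest average TF-IDF weight on the training set can be listed; a short sketch using vect2 and X_train_vectorized from the cell above (and the same get_feature_names API used elsewhere in the notebook):
#Sketch: terms with the highest mean TF-IDF weight across the training reviews
mean_tfidf = np.asarray(X_train_vectorized.mean(axis=0)).ravel()
terms = np.array(vect2.get_feature_names())
top = mean_tfidf.argsort()[::-1][:15]
print(list(zip(terms[top], np.round(mean_tfidf[top], 3))))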
In [24]:
features_names = vect2.get_feature_names()
print(len(features_names))
Logistic Model
In [33]:
# Initialize and fit the model.
lr2 = LogisticRegression(class_weight='balanced')
#Create range of values to fit parameters
k1 = ['l1', 'l2']
k2 = np.arange(50) + 1
k3 = ['balanced', None]
parameters = {'penalty': k1,
              'C': k2,
              'class_weight': k3}
#Fit parameters
lrr = GridSearchCV(lr2, param_grid=parameters, cv=kf)
#Fit on Training set
lrr.fit(X_train_vectorized, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", lrr.best_params_)
In [34]:
#Use the tuned model to predict on the test set
predtest2_y = lrr.predict(X_test_vectorized)
In [35]:
#Evaluate model (test set)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest2_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtest2_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest2_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
print((
'Logistic Regression accuracy: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(cross_val_score(lr2,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
Bernoulli Model
In [36]:
# Initialize and fit the model.
l3 = BernoulliNB()
#Create range of values to fit parameters
k1 = np.arange(50) + 1
parameters = {'alpha': k1}
#Fit parameters
l33 = GridSearchCV(l3, param_grid=parameters, cv=kf)
#Fit on Training set
l33.fit(X_train_vectorized, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", l33.best_params_)
In [37]:
#Use the tuned model to predict on the test set
predtest3_y = l33.predict(X_test_vectorized)
In [38]:
#Evaluation of the model (testing)
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest3_y, target_names=target_names))
confusion = confusion_matrix(y_test, predtest3_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest3_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
print((
'Bernoulli accuracy: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(cross_val_score(l33,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
KNN model
In [39]:
# Initialize and fit the model
KNN = KNeighborsClassifier(n_jobs=-1)
#Create range of values to fit parameters
k1 = [1,3,5,7,9,11,13,15,17,19,21]
k3 = ['uniform', 'distance']
parameters = {'n_neighbors': k1,
              'weights': k3}
#Fit parameters
clf = GridSearchCV(KNN, param_grid=parameters, cv=kf)
#Fit the tuned model
clf.fit(X_train_vectorized, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", clf.best_params_)
In [40]:
#Use the tuned KNN model to predict on the test set
predtest3_y = clf.predict(X_test_vectorized)
In [41]:
#Evaluate model on the test set
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtest3_y, target_names=target_names))
#Create confusion matrix
confusion = confusion_matrix(y_test, predtest3_y)
print(confusion)
# Accuracy tables.
table_test = pd.crosstab(y_test, predtest3_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
#Print Results
print((
'KNN accuracy: {}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}\n\n'
).format(cross_val_score(clf,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
Random Forest
In [25]:
#For the Random Forest hyperparameter tuning, due to computational restrictions,
#grid search is applied to one parameter at a time on the training set,
#updating the value as we move along the tuning process
#Number of trees
param_test1 = {'n_estimators':range(300,400,20)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=kf)
gsearch1.fit(X_train_vectorized, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Out[25]:
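The one-parameter-at-a-time strategy described above can be wrapped in a small helper that carries each best value forward before tuning the next parameter; a hedged sketch (sequential_tune is a hypothetical helper, and the parameter order and ranges shown are only illustrative):
#Sketch: tune one hyperparameter at a time, fixing the best value before moving to the next one
def sequential_tune(estimator_cls, base_params, grids, X, y, cv):
    best = dict(base_params)
    for name, values in grids:   #grids is an ordered list of (parameter name, candidate values)
        gs = GridSearchCV(estimator=estimator_cls(**best), param_grid={name: values},
                          scoring='roc_auc', n_jobs=-1, cv=cv)
        gs.fit(X, y)
        best[name] = gs.best_params_[name]
    return best

#Illustrative usage with the ranges explored in the surrounding cells
#best_rf = sequential_tune(RandomForestClassifier, {},
#                          [('n_estimators', range(300, 400, 20)), ('max_depth', range(61, 80, 2))],
#                          X_train_vectorized, y_train, kf)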
In [41]:
#Max depth and min_samples_split
#Max depth values from 2-60 were tried first and all scored under 0.8641, so the 60-80 range is explored here.
#For min_samples_split, values from 50-500 were tried and the 80-120 range gave the best accuracy.
param_test2 = {'max_depth':range(61,80,2), 'min_samples_split': range(80,121,20)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 360),
param_grid = param_test2, scoring='roc_auc',n_jobs=-1,iid=False, cv=kf)
gsearch2.fit(X_train_vectorized, y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
Out[41]:
In [46]:
#Re-run the search with min_samples_leaf, keeping the tuned min_samples_split
param_test3 = {'min_samples_leaf':range(2,33,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 360, max_depth = 65 , min_samples_split = 80 ),
param_grid = param_test3, scoring='roc_auc',n_jobs=-1,iid=False, cv=kf)
gsearch3.fit(X_train_vectorized, y_train)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
Out[46]:
In [45]:
#Based on the results shown for min_samples_leaf, it is left at its default value
#Tune the split criterion with the parameters fixed so far
param_test4 = {'criterion':['gini', 'entropy']}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 360, max_depth = 65 , min_samples_split = 80),
param_grid = param_test4, scoring='roc_auc',n_jobs=-1,iid=False, cv=kf)
gsearch4.fit(X_train_vectorized, y_train)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
Out[45]:
In [47]:
#Use the tuned Random Forest (fitted on the training set) to predict on the test set
predtestrf_y = gsearch4.predict(X_test_vectorized)
In [48]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestrf_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
print((
'Random Forest accuracy:{}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(cross_val_score(gsearch4,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
Decision Tree
In [45]:
# Train model
OTM = DecisionTreeClassifier()
#Create range of values to fit parameters
k2 = ['auto', 'sqrt', 'log2']
parameters = {'max_features': k2}
#Fit parameters
OTM1 = GridSearchCV(OTM, param_grid=parameters, cv=kf)
#Fit the tuned model
OTM1.fit(X_train_vectorized, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", OTM1.best_params_)
In [46]:
#Use the tuned Decision Tree to predict on the test set
predtestrf_y = OTM1.predict(X_test_vectorized)
In [47]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestrf_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
print((
'Decision Tree accuracy:{}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(cross_val_score(OTM1,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
SVC
In [48]:
# Train model
svc = SVC()
#Create range of values to fit parameters
ks1 = np.arange(20)+1
ks4 = ['linear','rbf']
parameters = {'C': ks1,
              'kernel': ks4}
#Fit parameters
svc1 = GridSearchCV(svc, param_grid=parameters, cv=kf)
#Fit the tuned model
svc1.fit(X_train_vectorized, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:", svc1.best_params_)
In [49]:
#Use the tuned SVC to predict on the test set
predtestsvc_y = svc1.predict(X_test_vectorized)
In [50]:
#Test Scores
target_names = ['0.0', '1.0']
print(classification_report(y_test, predtestsvc_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestsvc_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestsvc_y, margins=True)
print((
'SVC accuracy: {}\n'
).format(cross_val_score(svc1,X_test_vectorized,y_test,cv=kf).mean()))
Gradient Boosting
In [42]:
#For the Gradient Boosting hyperparameter tuning, due to computational restrictions,
#grid search is applied to one parameter at a time on the training set,
#updating the value as we move along the tuning process
#Number of trees
param_test1 = {'n_estimators':range(20,90,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10),
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch1.fit(X_train_vectorized, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Out[42]:
In [44]:
#Max depth and min sample split
param_test2 = {'max_depth':range(5,20,2), 'min_samples_split':range(200,1001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, max_features='sqrt', subsample=0.8, random_state=10),
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch2.fit(X_train_vectorized, y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
Out[44]:
In [46]:
#Re-tune min_samples_split together with min_samples_leaf
param_test3 = {'min_samples_split':range(200,1001,200),'min_samples_leaf':range(30,71,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=19,min_samples_split=600,max_features='sqrt', subsample=0.8, random_state=10),
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch3.fit(X_train_vectorized, y_train)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
Out[46]:
In [47]:
#Tune max_features, given the results obtained
#for the combination of 'min_samples_split', 'min_samples_leaf' and 'max_depth'.
#The min_samples_split value of 600 is kept as it gives the best accuracy for every value of 'max_depth'
param_test4 = {'max_features':range(60,74,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=19,min_samples_split=600,min_samples_leaf=40,max_features='sqrt', subsample=0.8, random_state=10),
param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch4.fit(X_train_vectorized, y_train)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
Out[47]:
In [48]:
#Tuning the subsample
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9,0.95]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
n_estimators=80,max_depth=19,min_samples_split=600,
min_samples_leaf=40,max_features=62,
subsample=0.8, random_state=10),
param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch5.fit(X_train_vectorized, y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
Out[48]:
In [49]:
#Halve the learning rate (0.1 to 0.05) and double the number of trees to see whether
#accuracy improves
param_test5 = {'subsample':[0.8,0.85,0.9,0.95]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=160,
max_depth=19,min_samples_split=600,
min_samples_leaf=40,max_features=62,
subsample=0.9, random_state=10),
param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=kf)
gsearch5.fit(X_train_vectorized, y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
Out[49]:
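The halving/doubling step above follows the usual shrinkage trade-off in gradient boosting: smaller learning steps need more trees. A hedged sketch that makes the trade-off explicit by scoring a few (learning_rate, n_estimators) pairs with a roughly constant product (the pairs are illustrative and this scan is slow on the full training matrix):
#Sketch: compare learning-rate / tree-count pairs that keep learning_rate * n_estimators roughly constant
for lr_, n_ in [(0.1, 80), (0.05, 160), (0.025, 320)]:
    gb = GradientBoostingClassifier(learning_rate=lr_, n_estimators=n_, max_depth=19,
                                    min_samples_split=600, min_samples_leaf=40,
                                    max_features=62, subsample=0.9, random_state=10)
    score = cross_val_score(gb, X_train_vectorized, y_train, cv=kf, scoring='roc_auc').mean()
    print(lr_, n_, round(score, 4))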
In [50]:
#Use the tuned Gradient Boosting model to predict on the test set
predtestrf_y = gsearch5.predict(X_test_vectorized)
In [51]:
#Test Scores
target_names = ['0', '1']
print(classification_report(y_test, predtestrf_y, target_names=target_names))
cnf = confusion_matrix(y_test, predtestrf_y)
print(cnf)
table_test = pd.crosstab(y_test, predtestrf_y, margins=True)
test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']
print((
'Gradient Boosting accuracy:{}\n'
'Percent Type I errors: {}\n'
'Percent Type II errors: {}'
).format(cross_val_score(gsearch5,X_test_vectorized,y_test,cv=kf).mean(),test_tI_errors, test_tII_errors))
For this analysis of Amazon reviews, the Digital Music dataset has been used. To create the two categories, ratings of 1-3 have been labelled negative and ratings of 4-5 positive. This was done because the raw ratings are skewed towards the positive side, with a mean of about 4.25. Once categorized, the data was downsampled to reduce the bias towards positive reviews in the models that were run.
To build the features, both bag of words and TF-IDF were used, with English stop words removed in both cases to reduce the number of features. The bag of words produced 59374 features, on which Naïve Bayes and Logistic Regression models were trained; their accuracies were 0.6943 and 0.7495 respectively.
The TF-IDF was applied with n-grams (1,3), keeping only terms that appear in at least 20 reviews (min_df=20), which reduced the number of features to 11926. The initial models (Naïve Bayes and Logistic Regression) were re-run on the TF-IDF features to check for a significant improvement in accuracy. The resulting accuracies, Naïve Bayes: 0.7201 and Logistic Regression: 0.7814, were taken as the starting points for the models built on the TF-IDF features.
In all cases, the models were tuned on the training set using grid search. The accuracy results obtained were (excluding the Logistic Regression and Naïve Bayes classifiers already mentioned):
Decision Tree: 0.6329
KNN: 0.6486
Random Forest: 0.8556
SVC: 0.7724
Gradient Boosting: 0.8597
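For a quick visual comparison, the reported accuracies can be drawn as a bar chart; a small sketch using only the figures listed above (values copied from this summary, including the two baseline models):
#Sketch: bar chart of the reported accuracies on the TF-IDF features
results = {'Naive Bayes': 0.7201, 'Logistic Regression': 0.7814, 'Decision Tree': 0.6329,
           'KNN': 0.6486, 'SVC': 0.7724, 'Random Forest': 0.8556, 'Gradient Boosting': 0.8597}
plt.figure(figsize=(10, 4))
sns.barplot(x=list(results.keys()), y=list(results.values()), color='steelblue')
plt.ylabel('Accuracy')
plt.xticks(rotation=30)
plt.title('Model accuracy on the TF-IDF features')
plt.show()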