In [1]:
# This is the book example of sentiment analysis
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Locations
path_movie_dataset = os.path.join("book_code", "Section 6", "movies.txt")

In [3]:
# Load the dataset
dataset = pd.read_csv(path_movie_dataset, sep = '\t', header = None, names = ['Sentiment', 'Review'])

In [4]:
# Print out a sample of the dataset
dataset.head()


Out[4]:
Sentiment Review
0 1 The Da Vinci Code book is just awesome.
1 1 this was the first clive cussler i've ever rea...
2 1 i liked the Da Vinci Code a lot.
3 1 i liked the Da Vinci Code a lot.
4 1 I liked the Da Vinci Code but it ultimatly did...

In [5]:
dataset.shape


Out[5]:
(6918, 2)

In [6]:
# Convert the text into features without removing the stop words
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(dataset['Review'])
X_counts.shape


Out[6]:
(6918, 2132)

In [7]:
# We now split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_counts, dataset.Sentiment, test_size=0.2, random_state=111)

In [8]:
# MLP with size of 5 and 2, L2 regularization term is 1e-5
clf = MLPClassifier(alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)

In [9]:
# Train the neural network
clf.fit(X = X_train, y = y_train)


Out[9]:
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [10]:
# Checking accuracy on the training set
y_pred = clf.predict(X_train)
print("Accuracy on the training set: {:.2f}".format(accuracy_score(y_train, y_pred)))


Accuracy on the training set: 1.00

In [11]:
# A "near perfect" accuracy on the training set may be a sign of overfitting, let's double check
scores = cross_val_score(clf, X_train, y_train, cv = 5)
print("Cross validation score {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))


Cross validation score 0.99 (+/- 0.01)

In [12]:
# Now let's repeat the process but removing the stop words
no_stop_words_count_vect = CountVectorizer(stop_words = 'english')
X_nstopw_count = no_stop_words_count_vect.fit_transform(dataset['Review'])
X_nstopw_count.shape


Out[12]:
(6918, 1921)

In [13]:
# Train and testing subdatasets
X_nstopw_train, X_nstopw_test, y_nstopw_train, y_nstopw_test = train_test_split(X_nstopw_count, dataset.Sentiment, test_size = 0.2, random_state = 1234)

In [14]:
# ANN model
clf_nstopw = MLPClassifier(max_iter = 1000, alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)

In [15]:
# Train the ANN model
clf_nstopw.fit(X = X_nstopw_train, y = y_nstopw_train)


Out[15]:
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [16]:
# Check accuracy on the training subdataset
y_nstopw_pred_train = clf_nstopw.predict(X_nstopw_train)
print("Accuracy on the training dataset, {:.2f}".format(accuracy_score(y_nstopw_train, y_nstopw_pred_train)))


Accuracy on the training dataset, 1.00

In [17]:
# Again, it looks too good to be true, so we check for overfitting
scores = cross_val_score(clf_nstopw, X_nstopw_train, y_nstopw_train, cv = 5)
print("Cross validation score, {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))


Cross validation score, 0.99 (+/- 0.00)

In [18]:
# Check accuracy on the testing sub-dataset
y_nstopw_pred_test = clf_nstopw.predict(X_nstopw_test)
print("Accuracy on the test sub-dataset, {:.2f}".format(accuracy_score(y_nstopw_test, y_nstopw_pred_test)))


Accuracy on the test sub-dataset, 0.99

In [ ]: