In [1]:
# This is the book example of sentiment analysis
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
In [2]:
# Locations
# Relative path to the movie-review dataset; assumes the notebook is run from
# the repository root so that "book_code/Section 6/movies.txt" resolves — TODO confirm
path_movie_dataset = os.path.join("book_code", "Section 6", "movies.txt")
In [3]:
# Load the dataset: a tab-separated file with no header row, where each line
# holds a sentiment label followed by the review text.
column_names = ["Sentiment", "Review"]
dataset = pd.read_csv(path_movie_dataset, sep="\t", header=None, names=column_names)
In [4]:
# Print out a sample of the dataset (first 5 rows) to sanity-check the
# column names and that the tab parsing worked
dataset.head()
Out[4]:
In [5]:
# (rows, columns) of the loaded dataset
dataset.shape
Out[5]:
In [6]:
# Convert the text into features without removing the stop words.
# CountVectorizer builds a vocabulary over all reviews and produces a sparse
# document-term count matrix (one row per review, one column per token).
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(dataset['Review'])
# (n_reviews, vocabulary_size)
X_counts.shape
Out[6]:
In [7]:
# We now split the data into training and testing (80/20 split).
# NOTE(review): random_state=111 here differs from the second split below
# (random_state=1234), so the two pipelines are evaluated on different
# train/test partitions and their scores are not directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X_counts, dataset.Sentiment, test_size=0.2, random_state=111)
In [8]:
# MLP with two hidden layers of sizes 5 and 2; L2 regularization term is 1e-5.
# max_iter is raised from the sklearn default of 200 to 1000 so the solver can
# converge on this sparse text data (avoids ConvergenceWarning) and to match
# the configuration of the stop-word-removed model below.
clf = MLPClassifier(max_iter = 1000, alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)
In [9]:
# Train the neural network on the training split of the bag-of-words features
clf.fit(X = X_train, y = y_train)
Out[9]:
In [10]:
# Checking accuracy on the training set: predict on the same data the model
# was fitted on and compare against the true labels.
y_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred)
print("Accuracy on the training set: {:.2f}".format(train_accuracy))
In [11]:
# A "near perfect" accuracy on the training set may be a sign of overfitting, let's double check
# 5-fold cross-validation on the training split; the +/- interval printed is
# two standard deviations around the mean fold accuracy.
scores = cross_val_score(clf, X_train, y_train, cv = 5)
print("Cross validation score {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))
In [12]:
# Now let's repeat the process but removing the stop words.
# stop_words='english' drops sklearn's built-in English stop-word list from
# the vocabulary, so this matrix has fewer columns than X_counts.
no_stop_words_count_vect = CountVectorizer(stop_words = 'english')
X_nstopw_count = no_stop_words_count_vect.fit_transform(dataset['Review'])
# (n_reviews, reduced_vocabulary_size)
X_nstopw_count.shape
Out[12]:
In [13]:
# Train and testing subdatasets (80/20 split).
# NOTE(review): random_state=1234 differs from the first split's 111, so this
# partition is not the same as the one used for the with-stop-words model.
X_nstopw_train, X_nstopw_test, y_nstopw_train, y_nstopw_test = train_test_split(X_nstopw_count, dataset.Sentiment, test_size = 0.2, random_state = 1234)
In [14]:
# ANN model: same architecture as the first classifier (hidden layers of 5
# and 2, alpha=1e-5), but with max_iter raised to 1000 so the solver converges
clf_nstopw = MLPClassifier(max_iter = 1000, alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)
In [15]:
# Train the ANN model on the stop-word-removed training features
clf_nstopw.fit(X = X_nstopw_train, y = y_nstopw_train)
Out[15]:
In [16]:
# Check accuracy on the training subdataset
y_nstopw_pred_train = clf_nstopw.predict(X_nstopw_train)
print("Accuracy on the training dataset, {:.2f}".format(accuracy_score(y_nstopw_train, y_nstopw_pred_train)))
In [17]:
# Again, it looks too good to be true, so we check for overfitting
# NOTE(review): this rebinds `scores`, overwriting the cross-validation
# results of the first model computed earlier in the notebook.
scores = cross_val_score(clf_nstopw, X_nstopw_train, y_nstopw_train, cv = 5)
print("Cross validation score, {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))
In [18]:
# Check accuracy on the testing sub-dataset — data the model has never seen,
# giving the honest generalization estimate for the stop-word-removed model
y_nstopw_pred_test = clf_nstopw.predict(X_nstopw_test)
print("Accuracy on the test sub-dataset, {:.2f}".format(accuracy_score(y_nstopw_test, y_nstopw_pred_test)))
In [ ]: