notebook.community

Edit and run



In [1]:

    
# This is the book example of sentiment analysis
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score



In [2]:

    
# Locations
path_movie_dataset = os.path.join("book_code", "Section 6", "movies.txt")



In [3]:

    
# Load the dataset
dataset = pd.read_csv(path_movie_dataset, sep = '\t', header = None, names = ['Sentiment', 'Review'])



In [4]:

    
# Print out a sample of the dataset
dataset.head()









    Out[4]:







  
    
      
      Sentiment
      Review
    
  
  
    
      0
      1
      The Da Vinci Code book is just awesome.
    
    
      1
      1
      this was the first clive cussler i've ever rea...
    
    
      2
      1
      i liked the Da Vinci Code a lot.
    
    
      3
      1
      i liked the Da Vinci Code a lot.
    
    
      4
      1
      I liked the Da Vinci Code but it ultimatly did...



In [5]:

    
dataset.shape









    Out[5]:





(6918, 2)



In [6]:

    
# Convert the text into features without removing the stop words
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(dataset['Review'])
X_counts.shape









    Out[6]:





(6918, 2132)



In [7]:

    
# We now split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_counts, dataset.Sentiment, test_size=0.2, random_state=111)



In [8]:

    
# MLP with size of 5 and 2, L2 regularization term is 1e-5
clf = MLPClassifier(alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)



In [9]:

    
# Train the neural network
clf.fit(X = X_train, y = y_train)









    Out[9]:





MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)



In [10]:

    
# Checking accuracy on the training set
y_pred = clf.predict(X_train)
print("Accuracy on the training set: {:.2f}".format(accuracy_score(y_train, y_pred)))









    



Accuracy on the training set: 1.00



In [11]:

    
# A "near perfect" accuracy on the training set may be a sign of overfitting, let's double check
scores = cross_val_score(clf, X_train, y_train, cv = 5)
print("Cross validation score {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))









    



Cross validation score 0.99 (+/- 0.01)



In [12]:

    
# Now let's repeat the process but removing the stop words
no_stop_words_count_vect = CountVectorizer(stop_words = 'english')
X_nstopw_count = no_stop_words_count_vect.fit_transform(dataset['Review'])
X_nstopw_count.shape









    Out[12]:





(6918, 1921)



In [13]:

    
# Train and testing subdatasets
X_nstopw_train, X_nstopw_test, y_nstopw_train, y_nstopw_test = train_test_split(X_nstopw_count, dataset.Sentiment, test_size = 0.2, random_state = 1234)



In [14]:

    
# ANN model
clf_nstopw = MLPClassifier(max_iter = 1000, alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)



In [15]:

    
# Train the ANN model
clf_nstopw.fit(X = X_nstopw_train, y = y_nstopw_train)









    Out[15]:





MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)



In [16]:

    
# Check accuracy on the training subdataset
y_nstopw_pred_train = clf_nstopw.predict(X_nstopw_train)
print("Accuracy on the training dataset, {:.2f}".format(accuracy_score(y_nstopw_train, y_nstopw_pred_train)))









    



Accuracy on the training dataset, 1.00



In [17]:

    
# Again, it looks too good to be true, so we check for overfitting
scores = cross_val_score(clf_nstopw, X_nstopw_train, y_nstopw_train, cv = 5)
print("Cross validation score, {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))









    



Cross validation score, 0.99 (+/- 0.00)



In [18]:

    
# Check accuracy on the testing sub-dataset
y_nstopw_pred_test = clf_nstopw.predict(X_nstopw_test)
print("Accuracy on the test sub-dataset, {:.2f}".format(accuracy_score(y_nstopw_test, y_nstopw_pred_test)))









    



Accuracy on the test sub-dataset, 0.99



In [ ]:

	Sentiment	Review
0	1	The Da Vinci Code book is just awesome.
1	1	this was the first clive cussler i've ever rea...
2	1	i liked the Da Vinci Code a lot.
3	1	i liked the Da Vinci Code a lot.
4	1	I liked the Da Vinci Code but it ultimatly did...