In [1]:
from __future__ import print_function # Python 2/3 compatibility
import numpy as np
import pandas as pd
from IPython.display import Image
In [2]:
train_df = pd.read_csv("data/train.tsv", sep="\t")
In [3]:
train_df.sample(10)
Out[3]:
Caution: if you evaluate on the validation data too many times, you will start overfitting to it. To avoid that, it is advisable to split three ways (Train/Validation/Test) and report the final score on the held-out test data, as sketched below.
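A minimal sketch of such a three-way split (the 60/20/20 ratio, random_state, and suffixed variable names here are illustrative choices, not what the rest of this notebook uses):
In [ ]:
from sklearn.model_selection import train_test_split

# First carve off 20% as a held-out test set, then split the remaining 80%
# into 75% train / 25% validation, i.e. roughly 60/20/20 overall.
X_tmp, X_test_held, y_tmp, y_test_held = train_test_split(
    train_df["review"], train_df["sentiment"], test_size=0.2, random_state=42)
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_tmp, y_tmp, test_size=0.25, random_state=42)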
In [4]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_df["review"], train_df["sentiment"], test_size=0.2)
In [5]:
print("Training Data: {}, Validation: {}".format(len(X_train), len(X_valid)))
Computers don't understand text, so we need to convert the reviews to numbers before we can do any math on them and build a system that classifies a review as positive or negative.
Ways to vectorize text include bag-of-words counts (CountVectorizer), TF-IDF weights (TfidfVectorizer), and feature hashing (HashingVectorizer).
Scikit-learn has nice APIs for this in its preprocessing and feature-extraction modules. In fact, these can be used even if you build your own models or use another library for the model-building step.
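For a quick feel of what these vectorizers produce, here is a toy illustration (the two sentences are made up for this sketch, not taken from the dataset):
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

toy_corpus = ["the movie was great", "the movie was terrible"]

count_vect = CountVectorizer()
print(count_vect.fit_transform(toy_corpus).toarray())  # integer word counts per document
print(count_vect.vocabulary_)                          # word -> column index

tfidf_vect = TfidfVectorizer()
print(tfidf_vect.fit_transform(toy_corpus).toarray())  # TF-IDF weights per document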
In [6]:
from sklearn.feature_extraction.text import CountVectorizer
In [7]:
# The API is very similar to the model-building process.
# Step 1: instantiate the vectorizer (more generally called a Transformer)
vect = CountVectorizer(max_features=5000, binary=True, stop_words="english")
In [8]:
# Fit the vectorizer on the training data only
vect.fit(X_train)
# Transform your training and validation data
X_train_vect = vect.transform(X_train)
X_valid_vect = vect.transform(X_valid)
In [22]:
from sklearn.linear_model import LogisticRegression
In [17]:
model_1 = LogisticRegression()
model_1.fit(X_train_vect, y_train)
Out[17]:
In [20]:
# Training and validation accuracy
print("Training Accuracy: {:.3f}".format(model_1.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_1.score(X_valid_vect, y_valid)))
In [23]:
from sklearn.naive_bayes import MultinomialNB
In [24]:
model_2 = MultinomialNB()
model_2.fit(X_train_vect, y_train)
Out[24]:
In [25]:
# Training and validation accuracy
print("Training Accuracy: {:.3f}".format(model_2.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_2.score(X_valid_vect, y_valid)))
In [26]:
from sklearn.ensemble import RandomForestClassifier
In [30]:
model_3 = RandomForestClassifier(min_samples_leaf=3, n_estimators=25, n_jobs=-1)
model_3.fit(X_train_vect, y_train)
Out[30]:
In [31]:
# Training and validation accuracy
print("Training Accuracy: {:.3f}".format(model_3.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_3.score(X_valid_vect, y_valid)))
In [33]:
from sklearn.ensemble import GradientBoostingClassifier
In [34]:
# GradientBoostingClassifier does not have an n_jobs option, so it is omitted here
model_4 = GradientBoostingClassifier(min_samples_leaf=3, n_estimators=25)
model_4.fit(X_train_vect, y_train)
Out[34]:
In [35]:
# Training and validation accuracy
print("Training Accuracy: {:.3f}".format(model_4.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_4.score(X_valid_vect, y_valid)))
In [36]:
from sklearn.neural_network import MLPClassifier
In [38]:
model_5 = MLPClassifier(hidden_layer_sizes=(32,), max_iter=100)
model_5.fit(X_train_vect, y_train)
Out[38]:
In [39]:
# Training and validation accuracy
print("Training Accuracy: {:.3f}".format(model_5.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_5.score(X_valid_vect, y_valid)))
Neural nets - a textbook case of overfitting. Maybe the model is too powerful :-D
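A couple of knobs that might curb the overfitting (a sketch only; the alpha and early-stopping values below are untuned guesses, not runs from this notebook):
In [ ]:
from sklearn.neural_network import MLPClassifier

# Stronger L2 regularization (alpha) plus early stopping on an internal
# validation split; these particular values are untested guesses.
model_5_reg = MLPClassifier(hidden_layer_sizes=(32,), alpha=1e-2,
                            early_stopping=True, validation_fraction=0.1,
                            max_iter=100)
model_5_reg.fit(X_train_vect, y_train)
print("Training Accuracy: {:.3f}".format(model_5_reg.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(model_5_reg.score(X_valid_vect, y_valid)))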
In [40]:
## Pass
In [42]:
from sklearn.ensemble import VotingClassifier
In [47]:
classifiers = [("Logistic Regression", model_1),
("Naive Bayes", model_2),
("Random Forest", model_3),
("Gradient Boosted", model_4),
("Neural Nets", model_5)]
In [48]:
classifiers
Out[48]:
In [49]:
final_model = VotingClassifier(classifiers, n_jobs=-1)
In [51]:
# Unfortunately, we have to call fit again on the ensembled model before using it
# It would be nice if there were an option to reuse the already-fitted estimators
final_model.fit(X_train_vect, y_train)
Out[51]:
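If refitting were too expensive, one workaround (a sketch, not what this notebook does) is to skip VotingClassifier and take a hard majority vote over the already-fitted models by hand:
In [ ]:
import pandas as pd

# Hard majority vote over the five already-fitted models, avoiding a refit
votes = pd.DataFrame({name: model.predict(X_valid_vect)
                      for name, model in classifiers})
manual_preds = votes.mode(axis=1)[0]  # most frequent predicted label per row
print("Manual-vote Validation Accuracy: {:.3f}".format(
    (manual_preds.values == y_valid.values).mean()))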
In [52]:
# Drum roll - accuracy of the final model
print("Training Accuracy: {:.3f}".format(final_model.score(X_train_vect, y_train)))
print("Validation Accuracy: {:.3f}".format(final_model.score(X_valid_vect, y_valid)))
In [53]:
# Read in the Test Dataset
# Note that it's missing the Sentiment Column. That's what we need to Predict
#
test_df = pd.read_csv("data/test.tsv", sep="\t")
test_df.head()
Out[53]:
In [54]:
# Vectorize the Review Text
X_test = test_df.review
X_test_vect = vect.transform(X_test)
In [55]:
y_test_pred = final_model.predict(X_test_vect)
In [58]:
df = pd.DataFrame({
    "document_id": test_df.document_id,
    "sentiment": y_test_pred
})
In [59]:
df.to_csv("data/ensemble_submission1.csv", index=False)