In [1]:
from __future__ import print_function # Python 2/3 compatibility
import numpy as np
import pandas as pd
from IPython.display import Image
In [2]:
# Load the training reviews (with their sentiment labels) from the tab-separated file.
train_df = pd.read_csv("data/train.tsv", delimiter="\t")
In [3]:
# Peek at 10 random rows. The fixed seed keeps the displayed sample identical
# across Restart & Run All instead of changing on every execution.
train_df.sample(10, random_state=42)
Out[3]:
Caution: If you repeatedly tune your model against the same validation score, you will end up overfitting to the validation data. To avoid that, it is advisable to split the data three ways (Train-Validation-Test) and generate the final score on the held-out Test data only.
In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled reviews for validation.
# - random_state pins the shuffle so the split (and every score computed from it)
#   is reproducible across kernel restarts.
# - stratify keeps the sentiment class proportions the same in both splits.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df["review"],
    train_df["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=train_df["sentiment"],
)
In [5]:
# Report how many reviews ended up in each split.
split_sizes = (X_train.shape[0], X_valid.shape[0])
print("Training Data: {}, Validation: {}".format(*split_sizes))
Computers don't understand text, so we need to convert the text to numbers before we can do any math on it and see whether we can build a system that classifies a review as Positive or Negative.
Ways to vectorize data:
Scikit-Learn has nice APIs in its preprocessing and feature extraction modules. In fact, these can be used even if you build your own models or use another library for the model building process.
In [6]:
from sklearn.feature_extraction.text import CountVectorizer
In [8]:
# Vectorizers follow the same workflow as model building.
# Step 1: instantiate the vectorizer (more generally called a Transformer).
vect = CountVectorizer(
    stop_words="english",
    binary=True,
    max_features=5000,
)
In [10]:
# Learn the vocabulary from the training reviews and vectorize them in one pass.
# fit_transform is equivalent to fit(...) followed by transform(...) on the same
# data, but avoids processing the training text twice.
X_train_vect = vect.fit_transform(X_train)
# Vectorize the validation reviews with the vocabulary learned from the training
# data only (the vectorizer is never fitted on validation data).
X_valid_vect = vect.transform(X_valid)
In [11]:
# First few raw training reviews (the text as it is before vectorization).
X_train.head()
Out[11]:
In [12]:
# Display the vectorized training data; the transform step creates a sparse matrix.
X_train_vect
Out[12]:
In [13]:
# Understand the vectorizer: a bare expression displays the fitted object's repr.
vect
Out[13]:
In [14]:
# The fitted vocabulary does similar things to what we did manually in our bag of words model.
# Left commented out because it would print the entire dictionary:
# vect.vocabulary_
In [15]:
from itertools import islice

# Peek at ten entries of the learned vocabulary, which does similar things to
# what we did manually in our bag of words model.
vocab_sample = islice(vect.vocabulary_.items(), 10)
list(vocab_sample)
Out[15]:
In [16]:
# Show the first five vectorized reviews as a dense, labelled table.
# `vect.vocabulary_` maps each term to its column index, and the order of its keys
# is not the column order, so sort the terms by index to label each column correctly.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
# Densify only the rows being displayed rather than the whole training matrix.
pd.DataFrame(X_train_vect[:5].toarray(), columns=feature_names)
Out[16]:
In [17]:
from sklearn.linear_model import LogisticRegression
In [18]:
# Step 1: instantiate a logistic regression classifier with its default settings.
model = LogisticRegression()
In [19]:
# Step 2: fit the classifier on the vectorized training reviews and their labels.
model.fit(X_train_vect, y_train)
Out[19]:
In [20]:
# Score the model on the same data it was trained on.
train_accuracy = model.score(X_train_vect, y_train)
print("Training Accuracy: {:.3f}".format(train_accuracy))
In [21]:
# Score the model on the held-out validation split.
valid_accuracy = model.score(X_valid_vect, y_valid)
print("Validation Accuracy: {:.3f}".format(valid_accuracy))
In [22]:
# Refit with C=0.1 instead of the default used above. Per the scikit-learn docs,
# C is the inverse of regularization strength, so a smaller value regularizes more.
# NOTE: this rebinds `model`; later cells use this C=0.1 classifier.
model = LogisticRegression(C=0.1)
model.fit(X_train_vect, y_train)
Out[22]:
In [23]:
# Training and validation accuracy of the refitted (C=0.1) model.
train_accuracy = model.score(X_train_vect, y_train)
valid_accuracy = model.score(X_valid_vect, y_valid)
print("Training Accuracy: {:.3f}".format(train_accuracy))
print("Validation Accuracy: {:.3f}".format(valid_accuracy))
In [24]:
# Load the test set (tab-separated).
# It has no sentiment column: those labels are what we need to predict.
test_df = pd.read_csv("data/test.tsv", delimiter="\t")
test_df.head()
Out[24]:
In [25]:
# Vectorize the test reviews with the vectorizer already fitted on the training data.
X_test = test_df["review"]
X_test_vect = vect.transform(X_test)
In [26]:
# Predict a sentiment label for every test review with the most recently fitted model.
y_test_pred = model.predict(X_test_vect)
In [27]:
# Assemble the submission table: one predicted sentiment per test document id.
df = pd.DataFrame({
    "document_id": test_df["document_id"],
    "sentiment": y_test_pred,
})
In [28]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
df.to_csv("data/logistic_reg_submission1.csv", index=False)
In [29]:
# Shell escape: show the first lines of the submission file as a quick sanity check.
!head data/logistic_reg_submission1.csv