Competition Link: https://www.kaggle.com/c/movie-sentiment-analysis
Download the data and unzip it into the data/ directory.
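If you downloaded the competition archive manually, here is a minimal sketch for extracting it with Python's standard library (the archive name movie-sentiment-analysis.zip is an assumption; use whatever filename Kaggle gave you):

import zipfile

# Assumed archive name; the notebook only needs data/train.tsv and data/test.tsv to exist afterwards.
with zipfile.ZipFile("movie-sentiment-analysis.zip") as archive:
    archive.extractall("data")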
In [1]:
from __future__ import print_function # Python 2/3 compatibility
import numpy as np
import pandas as pd
from collections import Counter
from IPython.display import Image
In [2]:
train_df = pd.read_csv("data/train.tsv", sep="\t")
In [3]:
train_df.sample(10)
Out[3]:
In [4]:
# Load the test dataset.
# Note that it's missing the sentiment column; that's what we need to predict.
test_df = pd.read_csv("data/test.tsv", sep="\t")
test_df.head()
Out[4]:
In [5]:
# Equal Number of Positive and Negative Sentiments
train_df.sentiment.value_counts()
Out[5]:
In [6]:
# Let's take a look at some examples
def print_reviews(reviews, max_chars=500):
    """Print the first `max_chars` characters of each review."""
    for review in reviews:
        print(review[:max_chars], end="\n\n")
In [7]:
# Some Positive Reviews
print("Sample **Positive** Reviews: ", "\n")
print_reviews(train_df[train_df["sentiment"] == 1].sample(3).review)
In [8]:
# Some Negative Reviews
print("Sample **Negative** Reviews: ", "\n")
print_reviews(train_df[train_df["sentiment"] == 0].sample(3).review)
Computers don't understand text, so we need to convert the reviews into numbers before we can do any math on them and build a system that classifies a review as positive or negative. There are several ways to vectorize text; we'll start with a simple bag-of-words representation built by hand.
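As a quick preview of the same idea with a library, here is a minimal bag-of-words sketch using scikit-learn's CountVectorizer on two toy sentences (not part of this notebook's pipeline, just for illustration):

from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["a great movie", "a terrible movie"]   # toy documents, not from the dataset
vectorizer = CountVectorizer()
toy_counts = vectorizer.fit_transform(toy_docs)    # sparse matrix of word counts
print(sorted(vectorizer.vocabulary_))              # vocabulary learned from the toy docs
print(toy_counts.toarray())                        # one row of counts per document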
In [9]:
## Doing it by hand
def bag_of_words_vocab(reviews):
    """Return a Counter of word frequencies across all the reviews."""
    # Equivalent to a nested for-loop over reviews and their words,
    # written as a single list comprehension.
    all_words = [word.lower() for review in reviews for word in review.split()]
    return Counter(all_words)
In [10]:
words_vocab = bag_of_words_vocab(train_df.review)
In [11]:
words_vocab.most_common(20)
Out[11]:
In [12]:
pos_words_vocab = bag_of_words_vocab(train_df[train_df.sentiment == 1].review)
neg_words_vocab = bag_of_words_vocab(train_df[train_df.sentiment == 0].review)
In [13]:
pos_words_vocab.most_common(10)
Out[13]:
In [14]:
neg_words_vocab.most_common(10)
Out[14]:
In [15]:
pos_neg_freq = Counter()
for word in words_vocab:
    # Ratio of positive to negative counts; the 1e-3 smoothing avoids division by zero
    # for words that never appear in negative reviews.
    pos_neg_freq[word] = (pos_words_vocab[word] + 1e-3) / (neg_words_vocab[word] + 1e-3)
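The 1e-3 added to both counts is a small smoothing term. A quick sketch with toy counts (not from the dataset) shows why it's there:

# A word that shows up 5 times in positive reviews and never in negative ones:
print((5 + 1e-3) / (0 + 1e-3))   # large but finite; without smoothing this would divide by zero
# A word that shows up equally often on both sides:
print((3 + 1e-3) / (3 + 1e-3))   # ~1.0, i.e. neutral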
In [16]:
print("Neutral words:")
print("Pos-to-neg for 'the' = {:.2f}".format(pos_neg_freq["is"]))
print("Pos-to-neg for 'movie' = {:.2f}".format(pos_neg_freq["is"]))
print("\nPositive and Negative review words:")
print("Pos-to-neg for 'amazing' = {:.2f}".format(pos_neg_freq["great"]))
print("Pos-to-neg for 'terrible' = {:.2f}".format(pos_neg_freq["terrible"]))
When converted to a log scale, neutral words will land near 0, positive words come out above 0, and negative words below 0. That not only makes the numbers much easier to interpret, but it also gives us everything we need for our first classifier.
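A quick numeric sketch of what the log does to the ratios (plain NumPy, nothing dataset-specific):

print(np.log(1.0))   #  0.0   -> neutral word
print(np.log(2.0))   #  ~0.69 -> leans positive
print(np.log(0.5))   # ~-0.69 -> leans negative, same magnitude as the 2x-positive case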
In [17]:
# https://www.desmos.com/calculator
Image("images/log-function.png", width=960)
Out[17]:
In [18]:
for word in pos_neg_freq:
pos_neg_freq[word] = np.log(pos_neg_freq[word])
In [19]:
print("Neutral words:")
print("Pos-to-neg for 'the' = {:.2f}".format(pos_neg_freq["is"]))
print("Pos-to-neg for 'movie' = {:.2f}".format(pos_neg_freq["is"]))
print("\nPositive and Negative review words:")
print("Pos-to-neg for 'amazing' = {:.2f}".format(pos_neg_freq["great"]))
print("Pos-to-neg for 'terrible' = {:.2f}".format(pos_neg_freq["terrible"]))
In [20]:
class CountingClassifier(object):
    def __init__(self, pos_neg_freq):
        self.pos_neg_freq = pos_neg_freq

    def fit(self, X, y=None):
        # No machine learning here. It's just counting.
        pass

    def predict(self, X):
        predictions = []
        for review in X:
            all_words = [word.lower() for word in review.split()]
            # Sum the log pos-to-neg ratios of the known words; unknown words contribute 0.
            result = sum(self.pos_neg_freq.get(word, 0) for word in all_words)
            predictions.append(result)
        return np.array(predictions)
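Before running it on the full training set, a quick sanity check on two made-up reviews (hypothetical inputs, not from the dataset); we'd expect the first score to be positive and the second negative:

toy_model = CountingClassifier(pos_neg_freq)
print(toy_model.predict(["an amazing movie", "a terrible movie"]))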
In [21]:
counting_model = CountingClassifier(pos_neg_freq)
train_predictions = counting_model.predict(train_df.review)
In [22]:
train_predictions[:10]
Out[22]:
In [23]:
# Convert the scores into binary predictions (score > 0 means positive)
train_predictions > 0
Out[23]:
In [24]:
y_pred = (train_predictions > 0).astype(int)
y_pred
Out[24]:
In [25]:
y_true = train_df.sentiment
len(y_true)
Out[25]:
In [26]:
np.sum(y_pred == y_true)
Out[26]:
In [27]:
## Accuracy
train_accuracy = np.sum(y_pred == y_true) / len(y_true)
print("Accuracy on Train Data: {:.2f}".format(train_accuracy))
In [28]:
## Predictions on the test data (it has no labels, so accuracy is scored by Kaggle)
test_predictions = counting_model.predict(test_df.review)
test_predictions
Out[28]:
In [29]:
y_pred = (test_predictions > 0).astype(int)
In [30]:
df = pd.DataFrame({
    "document_id": test_df.document_id,
    "sentiment": y_pred
})
In [31]:
df.head()
Out[31]:
In [32]:
df.to_csv("data/count-submission.csv", index=False)
In [33]:
import matplotlib.pyplot as plt
%matplotlib inline
In [34]:
train_df.review.str.len().hist(log=True)
Out[34]:
In [35]:
test_df.review.str.len().hist(log=True)
Out[35]: