Import train and test data.


In [1]:
import urllib
import pandas as pd
#train_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.train"
#test_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.dev"
#train_data_f = urllib.urlretrieve(train_data_url, "train_data.csv")
#test_data_f = urllib.urlretrieve(test_data_url, "test_data.csv")
ftr = open("train_data.csv", "r")
fts = open("test_data.csv", "r")
rows = [line.split(" ",1) for line in ftr.readlines()]
train_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
train_df['Sentiment'] = pd.to_numeric(train_df['Sentiment'])
rows = [line.split(" ",1) for line in fts.readlines()]
test_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
test_df['Sentiment'] = pd.to_numeric(test_df['Sentiment'])
train_df.shape
test_df.shape


Out[1]:
(3554, 2)

In [2]:
# Dataset info.
print train_df.columns
train_df[0:5]


Index([u'Sentiment', u'Text'], dtype='object')
Out[2]:
Sentiment Text
0 -1 everything's serious , poetic , earnest and --...
1 -1 narratively , trouble every day is a plodding ...
2 1 a truly wonderful tale combined with stunning ...
3 1 jason patric and ray liotta make for one splen...
4 -1 haneke keeps us at arm's length . guided more ...

In [3]:
# Text processing.
# Lematization function and stopwords filter.
import re, time
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize
from nltk.stem.porter import PorterStemmer

def word_extractor(text):
    wordlemmatizer = WordNetLemmatizer()
    commonwords = stopwords.words('english')
    text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
    words = ""
    wordtokens = [ wordlemmatizer.lemmatize(word.lower()) \
        for word in word_tokenize(text.decode('utf-8','ignore')) ]
    for word in wordtokens:
        if word not in commonwords:
            words+=" "+word
    return words

In [4]:
print word_extractor("I love to eat cake")
print word_extractor("I love eating cake")
print word_extractor("I loved eating the cake")
print word_extractor("I do not love eating cake")
print word_extractor("I don't love eating cake")


 love eat cake
 love eating cake
 loved eating cake
 love eating cake
 n't love eating cake

In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Text cleaning.
texts_train = [word_extractor(text) for text in train_df.Text]
texts_test = [word_extractor(text) for text in test_df.Text]

# Text vectorization.
vectorizer = CountVectorizer(ngram_range=(1, 1), binary='False')
vectorizer.fit(np.asarray(texts_train))

# Convert texts to vector representation.
features_train = vectorizer.transform(texts_train)
features_test = vectorizer.transform(texts_test)

# Change labels to 0 or 1 from -1 and 1.
labels_train = np.asarray((train_df.Sentiment.astype(float)+1)/2.0)
labels_test = np.asarray((test_df.Sentiment.astype(float)+1)/2.0)

print features_train.shape

vocab = vectorizer.get_feature_names()
dist = list(np.array(features_train.sum(axis=0)).reshape(-1,))

for tag, count in zip(vocab, dist)[0:3]:
    print count, tag


(3554, 9663)
6 10
4 100
2 101

In [6]:
from sklearn.metrics import classification_report
def score_the_model(model,x,y,xt,yt,text):
    acc_tr = model.score(x,y)
    acc_test = model.score(xt[:-1],yt[:-1])
    print "Training Accuracy %s: %f"%(text,acc_tr)
    print "Test Accuracy %s: %f"%(text,acc_test)
    print "Detailed Analysis Testing Results ..."
    print(classification_report(yt, model.predict(xt), target_names=['+','-']))

In [7]:
from sklearn.linear_model import LogisticRegression
def do_LOGIT(x,y,xt,yt):
    start_t = time.time()
    #Cs = [0.01,0.1,10,100,1000]
    Cs = [0.1]
    for C in Cs:
        print "Usando C= %f"%C
        model = LogisticRegression(penalty='l2',C=C)
        model = model.fit(x, y)
        score_the_model(model,x,y,xt,yt,"LOGISTIC")
do_LOGIT(features_train,labels_train,features_test,labels_test)


Usando C= 0.100000
Training Accuracy LOGISTIC: 0.892234
Test Accuracy LOGISTIC: 0.719111
Detailed Analysis Testing Results ...
             precision    recall  f1-score   support

          +       0.72      0.72      0.72      1803
          -       0.72      0.71      0.71      1751

avg / total       0.72      0.72      0.72      3554


In [8]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
import random

def do_NAIVE_BAYES(x,y,xt,yt):
    model = BernoulliNB()
    model = model.fit(x, y)
    score_the_model(model,x,y,xt,yt,"BernoulliNB")
    return model

model = do_NAIVE_BAYES(features_train, labels_train, features_test, labels_test)
test_pred = model.predict_proba(features_test)
spl = random.sample(xrange(len(test_pred)), 5)
for text, sentiment in zip(test_df.Text[spl], test_pred[spl]):
    print sentiment, text


Training Accuracy BernoulliNB: 0.958638
Test Accuracy BernoulliNB: 0.738531
Detailed Analysis Testing Results ...
             precision    recall  f1-score   support

          +       0.75      0.73      0.74      1803
          -       0.73      0.75      0.74      1751

avg / total       0.74      0.74      0.74      3554

[ 0.72352592  0.27647408] with its dogged hollywood naturalism and the inexorable passage of its characters toward sainthood , windtalkers is nothing but a sticky-sweet soap .

[ 0.07012159  0.92987841] although it's a bit smug and repetitive , this documentary engages your brain in a way few current films do .

[ 0.91373353  0.08626647] it's the kind of movie you can't quite recommend because it is all windup and not much of a pitch , yet you can't bring yourself to dislike it .

[ 0.9946274  0.0053726] the one not-so-small problem with expecting is that the entire exercise has no real point .

[ 0.97900537  0.02099463] apparently writer-director attal thought he need only cast himself and his movie-star wife sitting around in their drawers to justify a film .


In [9]:
# Some examples
test_text = 'A great movie. A masterpiece without doubt.'
custom_test = vectorizer.transform([word_extractor(test_text)])
print model.predict(custom_test), model.predict_proba(custom_test)

test_text = 'an awful movie'
custom_test = vectorizer.transform([word_extractor(test_text)])
print model.predict(custom_test), model.predict_proba(custom_test)


[ 1.] [[ 0.21690469  0.78309531]]
[ 0.] [[ 0.87588935  0.12411065]]