Import the train and test data. Each line of the raw files holds a sentiment label (-1 or 1) followed by the review text.
In [1]:
import urllib.request
import pandas as pd
#train_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.train"
#test_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.dev"
#train_data_f = urllib.request.urlretrieve(train_data_url, "train_data.csv")
#test_data_f = urllib.request.urlretrieve(test_data_url, "test_data.csv")
ftr = open("train_data.csv", "r")
fts = open("test_data.csv", "r")
rows = [line.split(" ",1) for line in ftr.readlines()]
train_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
train_df['Sentiment'] = pd.to_numeric(train_df['Sentiment'])
rows = [line.split(" ",1) for line in fts.readlines()]
test_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
test_df['Sentiment'] = pd.to_numeric(test_df['Sentiment'])
train_df.shape, test_df.shape
Out[1]:
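Before modeling, it is worth checking that the two classes are roughly balanced, since raw accuracy is only meaningful against a balanced baseline. A minimal sketch, reusing the DataFrames loaded above:

# Count how many examples carry each sentiment label (-1 vs. 1).
print(train_df.Sentiment.value_counts())
print(test_df.Sentiment.value_counts())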
In [2]:
# Dataset info: column names and the first five rows.
print(train_df.columns)
train_df.head()
Out[2]:
In [3]:
# Text processing:
# lemmatization function and stopword filter.
import re, time
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize

def word_extractor(text):
    wordlemmatizer = WordNetLemmatizer()
    commonwords = stopwords.words('english')
    # Collapse runs of a repeated letter down to two (e.g. "coooool" -> "cool").
    text = re.sub(r'([a-z])\1+', r'\1\1', text)
    words = ""
    wordtokens = [wordlemmatizer.lemmatize(word.lower())
                  for word in word_tokenize(text)]
    for word in wordtokens:
        if word not in commonwords:
            words += " " + word
    return words
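word_extractor depends on three NLTK resources: the stopword list, the WordNet data behind the lemmatizer, and the Punkt tokenizer models. If they are not installed locally, a one-time download is needed; a minimal sketch:

import nltk
# Each call is a no-op if the resource is already present.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')  # recent NLTK releases may also need 'punkt_tab'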
In [4]:
print word_extractor("I love to eat cake")
print word_extractor("I love eating cake")
print word_extractor("I loved eating the cake")
print word_extractor("I do not love eating cake")
print word_extractor("I don't love eating cake")
In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Text cleaning.
texts_train = [word_extractor(text) for text in train_df.Text]
texts_test = [word_extractor(text) for text in test_df.Text]
# Text vectorization: unigram counts.
# Note: binary must be the boolean False; the string 'False' is truthy
# and would silently turn binarization on.
vectorizer = CountVectorizer(ngram_range=(1, 1), binary=False)
vectorizer.fit(texts_train)
# Convert texts to their vector representation.
features_train = vectorizer.transform(texts_train)
features_test = vectorizer.transform(texts_test)
# Map the labels from {-1, 1} to {0, 1}.
labels_train = np.asarray((train_df.Sentiment.astype(float) + 1) / 2.0)
labels_test = np.asarray((test_df.Sentiment.astype(float) + 1) / 2.0)
print(features_train.shape)
vocab = vectorizer.get_feature_names_out()
dist = list(np.array(features_train.sum(axis=0)).reshape(-1,))
for tag, count in list(zip(vocab, dist))[0:3]:
    print(count, tag)
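Raw counts give frequent words large weights. A common alternative, not used here, is tf-idf weighting, which downweights words that occur in many documents; sklearn's TfidfVectorizer is a drop-in replacement for CountVectorizer. A sketch under that assumption:

from sklearn.feature_extraction.text import TfidfVectorizer
# Same unigram setup as above, but with tf-idf weights instead of raw counts.
tfidf = TfidfVectorizer(ngram_range=(1, 1))
tfidf_features_train = tfidf.fit_transform(texts_train)
tfidf_features_test = tfidf.transform(texts_test)
print(tfidf_features_train.shape)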
In [6]:
from sklearn.metrics import classification_report
def score_the_model(model, x, y, xt, yt, text):
    acc_tr = model.score(x, y)
    acc_test = model.score(xt, yt)
    print("Training Accuracy %s: %f" % (text, acc_tr))
    print("Test Accuracy %s: %f" % (text, acc_test))
    print("Detailed Analysis of the Testing Results ...")
    # Class 0 is negative and class 1 is positive, so '-' is listed first.
    print(classification_report(yt, model.predict(xt), target_names=['-', '+']))
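The per-class report summarizes precision and recall; a confusion matrix additionally shows which class is mistaken for which. A small sketch that could be called alongside score_the_model once a model is fitted (show_confusion is a hypothetical helper, not part of the notebook):

from sklearn.metrics import confusion_matrix
def show_confusion(model, xt, yt):
    # Rows are true classes (0 = negative, 1 = positive), columns are predictions.
    print(confusion_matrix(yt, model.predict(xt)))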
In [7]:
from sklearn.linear_model import LogisticRegression
def do_LOGIT(x, y, xt, yt):
    start_t = time.time()
    #Cs = [0.01, 0.1, 10, 100, 1000]
    Cs = [0.1]
    for C in Cs:
        print("Using C = %f" % C)
        model = LogisticRegression(penalty='l2', C=C)
        model = model.fit(x, y)
        score_the_model(model, x, y, xt, yt, "LOGISTIC")
    print("Elapsed time: %.2f s" % (time.time() - start_t))
do_LOGIT(features_train, labels_train, features_test, labels_test)
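The commented-out Cs list hints at a manual sweep over the regularization strength. A more systematic option is a cross-validated grid search on the training set; a sketch using sklearn's GridSearchCV, with grid values chosen purely for illustration:

from sklearn.model_selection import GridSearchCV
# 5-fold cross-validation over a small grid of C values.
grid = GridSearchCV(LogisticRegression(penalty='l2'),
                    param_grid={'C': [0.01, 0.1, 1, 10, 100]},
                    cv=5)
grid.fit(features_train, labels_train)
print(grid.best_params_, grid.best_score_)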
In [8]:
from sklearn.naive_bayes import BernoulliNB
import random
def do_NAIVE_BAYES(x, y, xt, yt):
    model = BernoulliNB()
    model = model.fit(x, y)
    score_the_model(model, x, y, xt, yt, "BernoulliNB")
    return model
model = do_NAIVE_BAYES(features_train, labels_train, features_test, labels_test)
test_pred = model.predict_proba(features_test)
# Inspect the predicted class probabilities for five random test sentences.
spl = random.sample(range(len(test_pred)), 5)
for text, sentiment in zip(test_df.Text.iloc[spl], test_pred[spl]):
    print(sentiment, text)
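BernoulliNB models only word presence or absence per document. Since the features here are counts, MultinomialNB is the other natural Naive Bayes variant to try; a minimal sketch reusing the scoring helper above:

from sklearn.naive_bayes import MultinomialNB
# Multinomial NB uses the word counts themselves, not just presence.
mnb = MultinomialNB().fit(features_train, labels_train)
score_the_model(mnb, features_train, labels_train,
                features_test, labels_test, "MultinomialNB")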
In [9]:
# Some examples.
test_text = 'A great movie. A masterpiece without doubt.'
custom_test = vectorizer.transform([word_extractor(test_text)])
print(model.predict(custom_test), model.predict_proba(custom_test))
test_text = 'an awful movie'
custom_test = vectorizer.transform([word_extractor(test_text)])
print(model.predict(custom_test), model.predict_proba(custom_test))
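The clean/vectorize/predict steps for ad-hoc sentences can be folded into a single helper; a small sketch (predict_sentiment is a hypothetical convenience function, not part of the notebook above):

def predict_sentiment(model, text):
    # Clean and vectorize exactly as during training, then predict.
    feats = vectorizer.transform([word_extractor(text)])
    return model.predict(feats)[0], model.predict_proba(feats)[0]

print(predict_sentiment(model, 'a dull, overlong film'))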