In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
Out[1]:
In [2]:
train_data=pd.read_csv("../../../data-project1/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
train_data.head()
Out[2]:
In [3]:
train_data.tail()
Out[3]:
Notice that 'sentiment' is binary
In [4]:
train_data.dtypes
Out[4]:
In pandas, dtype 'object' usually means strings. We will later convert the reviews to a numerical representation, perhaps using a typical bag-of-words model or word2vec.
Let's start by getting some basic information about the data:
In [5]:
train_data.info()
Now that we have a general idea of the dataset, we next clean and transform the data to create useful features for machine learning.
We will start by writing a function for analyzing and cleaning the feature 'review', using the first review as an illustration.
In [6]:
train_data.review[0]
Out[6]:
Before we can transform text into a numerical representation, we need to process the raw text. Let's first remove HTML and punctuation.
In [7]:
soup = BeautifulSoup(train_data.review[0], "html.parser").get_text()
letters_only = re.sub("[^a-zA-Z]", " ", soup)
letters_only
Out[7]:
Now we can stem and lemmatize the text. It is generally better to run a POS tagger first, since we only want to stem and lemmatize verbs and nouns.
In [8]:
tokens=nltk.word_tokenize(letters_only.lower())
tagged_words=nltk.pos_tag(tokens)
tagged_words[0:5]
Out[8]:
Stemming the text: there are generally two stemmers available in NLTK, Porter and Lancaster.
In [9]:
porter = nltk.PorterStemmer()
def lemmatize_with_potter(token, tag):
    # stem only verbs and nouns (POS tags starting with 'V' or 'N')
    if tag[0].lower() in ['v', 'n']:
        return porter.stem(token)
    return token
stemmed_text_with_potter = [lemmatize_with_potter(token, tag) for token, tag in tagged_words]
lancaster = nltk.LancasterStemmer()
def lemmatize_with_lancaster(token, tag):
    # stem only verbs and nouns (POS tags starting with 'V' or 'N')
    if tag[0].lower() in ['v', 'n']:
        return lancaster.stem(token)
    return token
stemmed_text_with_lancaster = [lemmatize_with_lancaster(token, tag) for token, tag in tagged_words]
In [10]:
stemmed_text_with_potter[0:10]
Out[10]:
In [11]:
stemmed_text_with_lancaster[0:10]
Out[11]:
Observing that the word 'going' has been stemmed with Porter but not with Lancaster, I'll choose Porter for this task.
Let's lemmatize:
In [12]:
tagged_words_after_stem = nltk.pos_tag(stemmed_text_with_potter)
wnl = nltk.WordNetLemmatizer()
def lemmatize_with_WordNet(token, tag):
    # lemmatize only verbs and nouns (POS tags starting with 'V' or 'N')
    if tag[0].lower() in ['v', 'n']:
        return wnl.lemmatize(token)
    return token
stemmed_and_lemmatized_text = [lemmatize_with_WordNet(token, tag) for token, tag in tagged_words_after_stem]
stemmed_and_lemmatized_text[0:10]
Out[12]:
Text cleaning summary
In [13]:
porter = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()
def stemmatize_with_potter(token, tag):
    if tag[0].lower() in ['v', 'n']:
        return porter.stem(token)
    return token
def lemmatize_with_WordNet(token, tag):
    if tag[0].lower() in ['v', 'n']:
        return wnl.lemmatize(token)
    return token
def corpus_preprocessing(corpus):
    preprocessed_corpus = []
    for sentence in corpus:
        # remove HTML and punctuation
        soup = BeautifulSoup(sentence, "html.parser").get_text()
        letters_only = re.sub("[^a-zA-Z]", " ", soup)
        # stemming
        tokens = nltk.word_tokenize(letters_only.lower())
        tagged_words = nltk.pos_tag(tokens)
        stemmed_text_with_potter = [stemmatize_with_potter(token, tag) for token, tag in tagged_words]
        # lemmatization
        tagged_words_after_stem = nltk.pos_tag(stemmed_text_with_potter)
        stemmed_and_lemmatized_text = [lemmatize_with_WordNet(token, tag) for token, tag in tagged_words_after_stem]
        # join all the tokens back into a single string
        clean_review = " ".join(w for w in stemmed_and_lemmatized_text)
        preprocessed_corpus.append(clean_review)
    return preprocessed_corpus
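As a quick sanity check, the summary function can be run on a couple of short snippets (the toy strings below are made up purely for illustration):
In [ ]:
# toy inputs, just to exercise corpus_preprocessing end to end
sample_corpus = ["<br />This movie was surprisingly good!",
                 "The actors were going through the motions."]
corpus_preprocessing(sample_corpus)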
Let's transform the feature 'review' into a numerical representation to feed into machine learning. The most common representation of text is the bag-of-words model.
In sklearn, we can use the class CountVectorizer to transform the data. We shall also use stop words to reduce the dimension of the feature space. Let's take the first 5 reviews of the train dataset as test_corpus.
In [14]:
vectorizer=CountVectorizer(stop_words='english')
test_corpus=train_data.review[0:5]
test_corpus= corpus_preprocessing(test_corpus)
test_corpus=vectorizer.fit_transform(test_corpus)
print(test_corpus.todense())
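To see which column corresponds to which word, we can inspect the fitted vocabulary (on newer scikit-learn versions get_feature_names_out() replaces get_feature_names()):
In [ ]:
# map column indices back to words and check the vocabulary size
print(vectorizer.get_feature_names()[:20])
print(len(vectorizer.vocabulary_))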
We could extend the bag-of-words representation with tf-idf to reflect how important a word is to a document in a corpus.
tf-idf can be applied with the class TfidfVectorizer in sklearn.
In [16]:
vectorizer= TfidfVectorizer(stop_words='english')
test_corpus=train_data.review[0:5]
test_corpus= corpus_preprocessing(test_corpus)
test_corpus=vectorizer.fit_transform(test_corpus)
print (test_corpus.todense())
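As a side note, the fitted TfidfVectorizer exposes the learned idf weights through its idf_ attribute; pairing them with the vocabulary shows which terms are treated as most distinctive (rare terms get the largest weights). A small inspection sketch:
In [ ]:
# pair each term with its learned idf weight and show a few of the largest
idf_weights = sorted(zip(vectorizer.get_feature_names(), vectorizer.idf_),
                     key=lambda pair: pair[1], reverse=True)
idf_weights[:10]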
Using stop words was one technique to reduce dimensionality. We can further reduce the dimensionality by using latent semantic analysis.
In sklearn, we can apply the class TruncatedSVD to the tf-idf matrix.
In [17]:
tsvd=TruncatedSVD(100)
tsvd.fit(test_corpus)
test_corpus=tsvd.transform(test_corpus)
test_corpus
Out[17]:
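TruncatedSVD records how much of the tf-idf variance the kept components explain, which is a quick way to judge whether 100 components retain enough signal (with only 5 documents here the effective rank is tiny, so this check is more meaningful on the full training matrix):
In [ ]:
# fraction of tf-idf variance captured by the kept SVD components
print(tsvd.explained_variance_ratio_.sum())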
Sklearn provides several kinds of Naive Bayes classifiers: GaussianNB, MultinomialNB and BernoulliNB. We will choose MultinomialNB for this task.
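One caveat worth noting: MultinomialNB expects non-negative feature values, while the TruncatedSVD output generally contains negative entries, so fitting it directly on the LSA features can raise a ValueError. A minimal sketch of one possible workaround, rescaling with MinMaxScaler (an addition here, not part of the original pipeline):
In [ ]:
from sklearn.preprocessing import MinMaxScaler

# shift the SVD features into a non-negative range so MultinomialNB accepts them
scaler = MinMaxScaler()
test_corpus_nonneg = scaler.fit_transform(test_corpus)
test_corpus_nonneg.min()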
In [18]:
model=MultinomialNB()
In [19]:
#features from train set
train_features=train_data.review
#pre-processing train features
train_features=corpus_preprocessing(train_features)
vectorizer= TfidfVectorizer(stop_words='english')
train_features=vectorizer.fit_transform(train_features)
tsvd=TruncatedSVD(100)
tsvd.fit(train_features)
train_features=tsvd.transform(train_features)
#target from train set
train_target=train_data.sentiment
#fitting the model
model.fit(train_features,train_target)
In [ ]:
#reading test data
test_data = pd.read_csv("../../../data-project1/testData.tsv", header=0, delimiter="\t", quoting=3)
#features from test data
test_features=test_data.review
#pre-processing test features
test_features=corpus_preprocessing(test_features)
test_features=vectorizer.transform(test_features)
test_features=tsvd.transform(test_features)
#predicting the sentiment for test set
prediction=model.predict(test_features)
In [ ]:
#writing out submission file
pd.DataFrame( data={"id":test_data["id"], "sentiment":prediction} ).to_csv("../../../data-project1/first_attempt.csv", index=False, quoting=3 )
A variety of metrics exist to evaluate the performance of binary classifiers, e.g. accuracy, precision, recall, F1 measure, and ROC AUC score. We shall use the ROC AUC score for this task, as specified by the competition site.
We first split the training data for cross-validation; let's use 80% for the split train set and 20% for the split test set.
In [ ]:
# Split 80-20 train vs test data (split the raw reviews, since they are preprocessed again below)
split_train_features, split_test_features, split_train_target, split_test_target = train_test_split(
    train_data.review, train_data.sentiment, test_size=0.20, random_state=0)
ROC curve illustrates the classifier's performance for all values of the discrimination threshold.
In [ ]:
#pre-processing split train
vectorizer= TfidfVectorizer(stop_words='english')
split_train_features = corpus_preprocessing(split_train_features)
split_train_features = vectorizer.fit_transform(split_train_features)
tsvd=TruncatedSVD(100)
tsvd.fit(split_train_features)
split_train_features = tsvd.transform(split_train_features)
#pre-processing split test features
split_test_features = corpus_preprocessing(split_test_features)
split_test_features = vectorizer.transform(split_test_features)
split_test_features = tsvd.transform(split_test_features)
#fit and predict using split data
model = MultinomialNB()
model.fit(split_train_features,split_train_target)
split_prediction = model.predict(split_test_features)
score = roc_auc_score(split_test_target, split_prediction)
print(score)
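The ROC AUC is usually computed from predicted class probabilities rather than hard 0/1 labels, which gives a smoother curve; MultinomialNB exposes these through predict_proba, so a probability-based score could be computed as follows (a sketch, using the model fitted above):
In [ ]:
# use the probability of the positive class instead of hard predictions
split_probabilities = model.predict_proba(split_test_features)[:, 1]
probability_score = roc_auc_score(split_test_target, split_probabilities)
print(probability_score)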
ROC curves plot the classifier's recall against its fall-out.
In [ ]:
false_positive_rates, recall, thresholds = roc_curve(split_test_target, split_prediction)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rates, recall, 'r', label='AUC = %0.2f' % score)
plt.legend(loc='lower right')
plt.ylabel('Recall')
plt.xlabel('False positive rate')
plt.show()
The class MultinomialNB has a parameter alpha (default=1.0). We could try another value of alpha to see how the score changes.
In [ ]:
model = MultinomialNB(alpha=0.1)
model.fit(split_train_features, split_train_target)
split_prediction = model.predict(split_test_features)
score = roc_auc_score(split_test_target, split_prediction)
print(score)
Let's generate the score over a range of alpha values.
In [ ]:
alphas = np.logspace(-5, 0, 6)
print(alphas)
In [ ]:
def evaluate_alpha(train_features, train_target, test_features, test_target, model, parameter_values, parameter_name):
    scores = []
    for test_alpha in parameter_values:
        # set the parameter, refit and score the model
        model.set_params(**{parameter_name: test_alpha})
        model.fit(train_features, train_target)
        prediction = model.predict(test_features)
        score = roc_auc_score(test_target, prediction)
        scores.append((test_alpha, score))
    return scores
model = MultinomialNB()
alpha_score = evaluate_alpha(split_train_features, split_train_target, split_test_features, split_test_target, model, alphas, 'alpha')
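Since evaluate_alpha returns its list of (alpha, score) pairs, the effect of alpha can be visualized, for example on a log-scaled x-axis (a sketch using the alpha_score result above):
In [ ]:
# plot ROC AUC against alpha on a log-scaled x-axis
tested_alphas = [pair[0] for pair in alpha_score]
auc_scores = [pair[1] for pair in alpha_score]
plt.semilogx(tested_alphas, auc_scores, marker='o')
plt.xlabel('alpha')
plt.ylabel('ROC AUC')
plt.show()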