In [1]:
%load_ext watermark

%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v


Vahid Mirjalili 28/12/2014 

CPython 2.7.3
IPython 2.3.1

scikit-learn 0.15.2
numpy 1.9.1
numexpr 2.2.2
pandas 0.15.1
matplotlib 1.4.2
plotly 1.4.7

In [2]:
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy
import sklearn

%matplotlib inline



1. Read the training and test datasets


In [3]:
df = pd.read_table('../data/labeledTrainData.tsv')

df.head()


Out[3]:
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w...
1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
2 7759_3 0 The film starts with a manager (Nicholas Bell)...
3 3630_4 0 It must be assumed that those who praised this...
4 9495_8 1 Superbly trashy and wondrously unpretentious 8...

In [4]:
df_test = pd.read_table('../data/testData.tsv')

df_test.head()


Out[4]:
id review
0 12311_10 Naturally in a film who's main themes are of m...
1 8348_2 This movie is a disaster within a disaster fil...
2 5828_4 All in all, this is a movie for kids. We saw i...
3 7186_2 Afraid of the Dark left me with the impression...
4 12128_7 A very accurate depiction of small time mob li...

1.1 Extracting X & y data columns


In [5]:
data_train = df.loc[:, 'review']

y_train = df.loc[:, 'sentiment']

data_train.head()


Out[5]:
0    With all this stuff going down at the moment w...
1    \The Classic War of the Worlds\" by Timothy Hi...
2    The film starts with a manager (Nicholas Bell)...
3    It must be assumed that those who praised this...
4    Superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object

In [6]:
data_test = df_test.loc[:, 'review']

data_test.tail()


Out[6]:
24995    Sony Pictures Classics, I'm looking at you! So...
24996    I always felt that Ms. Merkerson had never got...
24997    I was so disappointed in this movie. I am very...
24998    From the opening sequence, filled with black a...
24999    This is a great horror film for people who don...
Name: review, dtype: object

2. Text Feature Extraction


In [7]:
import nltk
import string
import re
from collections import Counter

from nltk.corpus import stopwords



2.1 Tokenizer Function

The tokenizer:

  • Transforms the text to lower-case
  • Tokenizes the string
  • Removes the stop-words
  • Stems the remaining tokens, keeping only purely alphabetic stems (which also removes punctuation)


In [8]:
## For more info, see http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html

stemmer = nltk.stem.porter.PorterStemmer()

def get_tokens(inp_txt):
    
    ## Lower case: ABC -> abc
    txt_lower = inp_txt.lower()
    
    ## Tokenize:
    tokens = nltk.word_tokenize(txt_lower)
    
    ## Remove stop-words:
    tokens_filtered = [w for w in tokens if w not in stopwords.words('english')]
    
    ## Stem, keeping only purely alphabetic stems (this also drops punctuation tokens):
    stems = [stemmer.stem(t) for t in tokens_filtered]
    stems_nopunct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return stems_nopunct
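
As a quick sanity check (an illustrative call; it assumes the NLTK 'punkt' models and the stop-words corpus have been downloaded, e.g. via nltk.download()):

get_tokens("The movies were surprisingly good!")
## expected output (approximately): ['movi', 'surprisingli', 'good']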

2.2 TF-IDF Feature Extraction
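
Each review is mapped to a vector of term weights by TfidfVectorizer. As a rough guide (this matches scikit-learn's smooth_idf=True convention, up to version-specific details), the weight of term $t$ in document $d$ is

$$tfidf(t, d) = tf(t, d) \cdot \left(\log\frac{1 + n}{1 + df(t)} + 1\right)$$

where $n$ is the number of documents and $df(t)$ is the number of documents containing $t$. With sublinear_tf=True, the raw count $tf(t, d)$ is replaced by $1 + \log tf(t, d)$, and the resulting vectors are length-normalized. The vectorizer itself is configured in the Random Forest pipeline below.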

3. Apply Random Forest

Using sklearn.ensemble.RandomForestClassifier with the tunable parameters:

  • n_estimators: number of trees
  • criterion: 'gini' or 'entropy'

Important Note: This estimator requires a dense matrix as input. If a sparse matrix is passed instead, the following error is raised:
A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
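
The pipeline below handles this with mlxtend's DenseTransformer. For reference, a minimal sketch of such a transformer (an illustrative stand-in, not mlxtend's actual implementation):

from sklearn.base import BaseEstimator, TransformerMixin

class DenseTransformerSketch(BaseEstimator, TransformerMixin):
    """Convert a scipy sparse matrix to a dense numpy array inside a Pipeline."""

    def fit(self, X, y=None):
        ## stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        ## sparse matrices expose toarray(); dense input passes through unchanged
        return X.toarray() if hasattr(X, 'toarray') else X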


In [9]:
## For demonstration, the model is fit on only the first 1000 reviews
from sklearn.ensemble import RandomForestClassifier
from sklearn import pipeline
from mlxtend.sklearn import DenseTransformer

tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
    encoding = 'utf-8',
    decode_error = 'replace',
    strip_accents = 'ascii',
    analyzer = 'word',
    max_features = 100,
    smooth_idf = True,
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    tokenizer = get_tokens
)

clf_pipe = pipeline.Pipeline([
    ('vect', tfidf),
    ('densify', DenseTransformer()),
    ('clf', RandomForestClassifier(n_estimators = 10, criterion='gini'))
])



rf_model = clf_pipe.fit(data_train[0:1000], y_train[0:1000])

pred_rf = rf_model.predict(data_test[0:1000])

pred_rf = np.vstack((df_test.loc[0:999, 'id'], pred_rf)).T

print(pred_rf.shape)


(1000, 2)

In [11]:
print(pred_rf[1:10,:])


[['8348_2' 0]
 ['5828_4' 1]
 ['7186_2' 1]
 ['12128_7' 1]
 ['2913_8' 0]
 ['4396_1' 0]
 ['395_2' 0]
 ['10616_1' 0]
 ['9074_9' 0]]

Applying Random Forest to All Data


In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import pipeline
from mlxtend.sklearn import DenseTransformer
import datetime

tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
    encoding = 'utf-8',
    decode_error = 'replace',
    strip_accents = 'ascii',
    analyzer = 'word',
    max_features = 10000,
    smooth_idf = True,
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    tokenizer = get_tokens
)

clf_pipe = pipeline.Pipeline([
    ('vect', tfidf),
    ('densify', DenseTransformer()),
    ('clf', RandomForestClassifier(n_estimators = 10, criterion='gini'))
])


current_time = datetime.datetime.now().time().isoformat()
print("Training part started      (%s)" %(current_time))
rf_model = clf_pipe.fit(data_train, y_train)
current_time = datetime.datetime.now().time().isoformat()
print("Training part finished     (%s)" %(current_time))

pred_rf = rf_model.predict(data_test)
current_time = datetime.datetime.now().time().isoformat()
print("Testin part finished       (%s)" %(current_time))

pred_rf = np.vstack((df_test.loc[:, 'id'], pred_rf)).T

print(pred_rf.shape)

4. Hyper-parameter Optimization using KFold Cross-Validation

In [57]:
from sklearn import pipeline
from sklearn import metrics
from sklearn import grid_search
from sklearn import cross_validation
from sklearn.naive_bayes import MultinomialNB

import datetime
import gc # python's garbage collector

Optimizing the Additive Smoothing Parameter $\alpha$

The smoothed class-conditional probability of word $x_i$ given class $C$ is

$$P(x_i \mid C) = \frac{count(x_i, C) + \alpha}{count(C) + \alpha |V|}$$

where

  • $count(x_i, C)$: the number of occurrences of word $x_i$ in documents of class $C$
  • $count(C)$: the total number of word occurrences in documents of class $C$
  • $\alpha$: smoothing parameter
  • $|V|$: the size of the vocabulary (the set of distinct words)
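
As an illustrative example (made-up numbers): if word $x_i$ occurs 3 times in the positive class, the positive class contains 100 word occurrences in total, $|V| = 50$, and $\alpha = 1$, then $P(x_i \mid C) = (3 + 1) / (100 + 1 \cdot 50) = 4/150 \approx 0.027$. Without smoothing ($\alpha = 0$), a word never seen in a class would get probability zero and veto that class entirely.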

In [44]:
alpha_params = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]
#alpha_params = [0.01, 0.02]
colors = ["red", "green", "blue", "gray", "#656522", "#12FF14", "#B0B0B0", "#AA0505", "#0145FF", "#670566"]
#pointtypes = ['o', '^']
linestyles = ['-', '--', '-.', ':', '-', '--', '-.', ':', '--', '-.']

fig_roc = plt.figure(1, figsize=(10, 8))
ax_roc = fig_roc.add_subplot(1, 1, 1)

for param,col,ls in zip(alpha_params, colors, linestyles):
    
    clf_pipe = pipeline.Pipeline([
        ('vect', tfidf),
        ('clf', MultinomialNB(alpha=param))
    ])

    cv = cross_validation.StratifiedKFold(y_train, n_folds=5)
    
    auc_res = 0
    xr = np.linspace(0, 1, 100)
    tpr_interp = np.zeros(shape=xr.shape, dtype=float)
    
    for i, (train_inx, test_inx) in enumerate(cv):
        model = clf_pipe.fit(data_train[train_inx], y_train[train_inx])
        pred = model.predict_proba(data_train[test_inx])
        
        fpr, tpr, thresh = metrics.roc_curve(y_train[test_inx], pred[:, 1])
        auc_res += metrics.auc(fpr, tpr)
        
        tpr_interp += scipy.interp(xr, fpr, tpr)

    current_time = datetime.datetime.now().time().isoformat()
    print("Alpha = %.2f   ---->   AUC = %.4f     (%s)" %(param, auc_res/len(cv), current_time))
    
    tpr_interp /= len(cv)
    line_new = plt.plot(xr, tpr_interp)
    plt.setp(line_new, color=col, linewidth=3, linestyle=ls)
    

plt.plot([0, 1], [0, 1], '--', lw=4, color='gray')

plt.setp(ax_roc.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax_roc.get_yticklabels(), rotation='vertical', fontsize=16)
plt.axis([-0.05, 1.05, -0.05, 1.05])
plt.xlabel('False Positive Rate', size=20)
plt.ylabel('True Positive Rate', size=20)
plt.title('Multinomial NB Classification using TF-IDF Features', size=20)
plt.legend(alpha_params, loc='lower right', fontsize=20)
plt.show()


Alpha = 0.01   ---->   AUC = 0.9180     (00:06:27.493367)
Alpha = 0.02   ---->   AUC = 0.9223     (01:51:09.611895)
Alpha = 0.05   ---->   AUC = 0.9270     (03:25:57.325411)
Alpha = 0.10   ---->   AUC = 0.9298     (05:00:46.055790)
Alpha = 0.20   ---->   AUC = 0.9319     (06:35:36.087543)
Alpha = 0.50   ---->   AUC = 0.9337     (08:10:33.168714)
Alpha = 1.00   ---->   AUC = 0.9343     (09:45:23.516907)
Alpha = 2.00   ---->   AUC = 0.9345     (11:21:18.424894)

Apply Final Classification Model to Predict Classes in Test Set

Based on our hyper-parameter optimization, $\alpha = 2.0$ gives the best AUC (area under the ROC curve) among the values tested. So our final classification model uses the following:

  • TF-IDF features
  • Multinomial naive Bayes with smoothing parameter $\alpha = 2.0$

In [56]:
clf_pipe = pipeline.Pipeline([
    ('vect', tfidf),
    ('clf', MultinomialNB(alpha=2.0))
])

final_model = clf_pipe.fit(data_train, y_train)
#pred_multNB = final_model.predict_proba(data_test)

pred_multNB = final_model.predict(data_test)

pred_multNB = np.vstack((df_test.loc[:, 'id'], pred_multNB)).T

print(pred_multNB.shape)

np.savetxt('../results/pred.multinomialNB.alpha_optimized.csv', pred_multNB, fmt='%s,%1d', delimiter=',', header='id,sentiment', comments='')  ## comments='' keeps np.savetxt from prefixing the header with '# '


(25000, 2)
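
As a quick sanity check (illustrative), the submission file can be read back to verify the two-column id,sentiment format:

pd.read_csv('../results/pred.multinomialNB.alpha_optimized.csv').head()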