In [1]:

    
%load_ext watermark

%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v









    



Vahid Mirjalili 25/12/2014 

CPython 2.7.3
IPython 2.3.1

scikit-learn 0.15.2
numpy 1.9.1
numexpr 2.2.2
pandas 0.15.1
matplotlib 1.4.2
plotly 1.4.7



In [2]:

    
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy
import sklearn

%matplotlib inline









    



/usr/local/lib/python2.7/dist-packages/PIL/Image.py:71: RuntimeWarning: The _imaging extension was built for another  version of Pillow or PIL
  warnings.warn(str(v), RuntimeWarning)
/usr/local/lib/python2.7/dist-packages/PIL/Image.py:71: RuntimeWarning: The _imaging extension was built for another  version of Pillow or PIL
  warnings.warn(str(v), RuntimeWarning)

1. Read the training and test dataset



In [3]:

    
df = pd.read_table('../data/labeledTrainData.tsv')

df.head()









    Out[3]:






  
    
      
      id
      sentiment
      review
    
  
  
    
      0
       5814_8
       1
       With all this stuff going down at the moment w...
    
    
      1
       2381_9
       1
       \The Classic War of the Worlds\" by Timothy Hi...
    
    
      2
       7759_3
       0
       The film starts with a manager (Nicholas Bell)...
    
    
      3
       3630_4
       0
       It must be assumed that those who praised this...
    
    
      4
       9495_8
       1
       Superbly trashy and wondrously unpretentious 8...



In [4]:

    
df_test = pd.read_table('../data/testData.tsv')

df_test.head()









    Out[4]:






  
    
      
      id
      review
    
  
  
    
      0
       12311_10
       Naturally in a film who's main themes are of m...
    
    
      1
         8348_2
       This movie is a disaster within a disaster fil...
    
    
      2
         5828_4
       All in all, this is a movie for kids. We saw i...
    
    
      3
         7186_2
       Afraid of the Dark left me with the impression...
    
    
      4
        12128_7
       A very accurate depiction of small time mob li...

1.1 Extracting X & y data columns



In [5]:

    
data_train = df.loc[:, 'review']

y_train = df.loc[:, 'sentiment']

data_train.head()









    Out[5]:





0    With all this stuff going down at the moment w...
1    \The Classic War of the Worlds\" by Timothy Hi...
2    The film starts with a manager (Nicholas Bell)...
3    It must be assumed that those who praised this...
4    Superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object



In [6]:

    
data_test = df_test.loc[:, 'review']

data_test.tail()









    Out[6]:





24995    Sony Pictures Classics, I'm looking at you! So...
24996    I always felt that Ms. Merkerson had never got...
24997    I was so disappointed in this movie. I am very...
24998    From the opening sequence, filled with black a...
24999    This is a great horror film for people who don...
Name: review, dtype: object

2. Text Feature Extraction



In [7]:

    
import nltk
import string
import re
from collections import Counter

from nltk.corpus import stopwords









    



/usr/local/lib/python2.7/dist-packages/PIL/Image.py:71: RuntimeWarning: The _imaging extension was built for another  version of Pillow or PIL
  warnings.warn(str(v), RuntimeWarning)
/usr/local/lib/python2.7/dist-packages/PIL/Image.py:71: RuntimeWarning: The _imaging extension was built for another  version of Pillow or PIL
  warnings.warn(str(v), RuntimeWarning)

2.1 Tokenizer Function

Transform to lower-case
Remove the punctuations
Remove the stopwrods
Tokenize the remaining string



In [8]:

    
## For more info, see http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html

stemmer = nltk.stem.porter.PorterStemmer()

def get_tokens(inp_txt):
    
    ## Lower case: ABC -> abc
    txt_lower = inp_txt.lower()
  
    ## Remove punctuations (!, ', ", ., :, ;, )
    #txt_lower_nopunct = txt_lower.translate(string.maketrans("",""), string.punctuation)
    #print(txt_lower_nopunct)
    
    
    ## Tokenize:
    tokens = nltk.word_tokenize(txt_lower) #_nopunct)
    #tokens = nltk.wordpunct_tokenize(txt_lower)
    
    ## remove stop-words:
    tokens_filtered = [w for w in tokens if not w in stopwords.words('english')]
    
    ## stemming:
    stems = [stemmer.stem(t) for t in tokens_filtered]
    stems_nopunct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return (stems_nopunct)

Unit test for tokenizer:



In [9]:

    
get_tokens("What's in a name? That which we call a rose by any other name would smell as sweet.")

## Note: you need to download punkt package in nltk:
# import nltk
# nltk.download(punkt)









    Out[9]:





[u'name', u'call', u'rose', u'name', u'would', u'smell', u'sweet']

2.2 TF-IDF Feature Extraction



In [10]:

    
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(
    encoding = 'utf-8',
    decode_error = 'replace',
    strip_accents = 'ascii',
    analyzer = 'word',
    smooth_idf = True,
    tokenizer = get_tokens
)

tfidf









    Out[10]:





TfidfVectorizer(analyzer='word', binary=False, charset=None,
        charset_error=None, decode_error='replace',
        dtype=<type 'numpy.int64'>, encoding='utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='ascii', sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function get_tokens at 0x9703ed8>, use_idf=True,
        vocabulary=None)

Unit test for TF-IDF:



In [11]:

    
## Shakespear quote
example_txt_1 = "What's in a name? That which we call a rose by any other name would smell as sweet."
example_txt_2 = "To be, or not to be: that is the question."

tfidf = tfidf.fit([example_txt_1 + example_txt_2])

example1 = tfidf.transform([example_txt_1])
example2 = tfidf.transform([example_txt_2])

print('Features: %s' %tfidf.get_feature_names())
print('Example1: %s' %example1.toarray())
print('Example2: %s' %example2.toarray())









    



Features: [u'call', u'name', u'question', u'rose', u'smell', u'would']
Example1: [[ 0.35355339  0.70710678  0.          0.35355339  0.35355339  0.35355339]]
Example2: [[ 0.  0.  1.  0.  0.  0.]]

2.3 Evaluate TF-IDF on the reviews



In [12]:

    
tfidf_train = tfidf.fit(data_train.ravel())

print('Feature-set size: %s' %len(tfidf_train.get_feature_names()))









    



Feature-set size: 47448



In [ ]:

    
import pickle

pkl_out = open('../data/pickle/tfidf_object.pkl', 'w')
pickle.dump(tfidf, pkl_out)
pkl_out.close()

3. Naive Bayes Classification of Reviews



In [13]:

    
### Vectorizing the training set:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train)

print("Number of samples N= %d,  Number of features d= %d" % X_train.shape)


### Transforming the test dataset:
X_test = vectorizer.transform(data_test)

print("Number of Test Documents: %d,  Number of features: %d" %X_test.shape)









    



Number of samples N= 25000,  Number of features d= 74536
Number of Test Documents: 25000,  Number of features: 74536



In [14]:

    
from sklearn import metrics


### Train a classifier object and test it on the test set:
def apply_classifier(clf):
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    #score = metrics.f1_score(y_train, pred)

    return(pred)



In [15]:

    
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

from timeit import timeit



In [16]:

    
%timeit pred_multNB = apply_classifier(MultinomialNB(alpha=.01))









    



10 loops, best of 3: 92.8 ms per loop



In [35]:

    
pred_multNB = apply_classifier(MultinomialNB(alpha=.01))

pred_multNB = np.vstack((df_test.loc[:, 'id'], pred_multNB)).T

pred_multNB.shape









    Out[35]:





(25000, 2)



In [38]:

    
np.savetxt('../results/pred.multinomialNB.csv', pred_multNB, fmt='%s,%1d', delimiter=',', header='id,sentiment')



In [ ]:

	id	sentiment	review
0	5814_8	1	With all this stuff going down at the moment w...
1	2381_9	1	\The Classic War of the Worlds\" by Timothy Hi...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...
3	3630_4	0	It must be assumed that those who praised this...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...

	id	review
0	12311_10	Naturally in a film who's main themes are of m...
1	8348_2	This movie is a disaster within a disaster fil...
2	5828_4	All in all, this is a movie for kids. We saw i...
3	7186_2	Afraid of the Dark left me with the impression...
4	12128_7	A very accurate depiction of small time mob li...