In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v


Sebastian Raschka 

CPython 3.4.2
IPython 2.3.1

Sections

- Reading the Training Dataset
- Text Preprocessing
- Model Selection
- Submission 1
- Hyperparameter Tuning

Reading the Training Dataset


In [2]:
import pandas as pd

df_train = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', quoting=3)
df_test = pd.read_csv('../data/testData.tsv', sep='\t', quoting=3)

df_train.tail()


Out[2]:
            id  sentiment                                             review
24995   3453_3          0  It seems like more consideration has gone into...
24996   5064_1          0  I don't believe they made this film. Completel...
24997  10905_3          0  Guy is a loser. Can't get girls, needs to buil...
24998  10194_3          0  This 30 minute documentary Buñuel made in the ...
24999   8478_8          1  I saw this movie as a child and it broke my he...

In [3]:
X_train = df_train['review']
y_train = df_train['sentiment']





Text Preprocessing


In [4]:
import pickle

# Load the precomputed stop word list and the sentiment-word whitelist
with open('./stopwords.p', 'rb') as f:
    stop_words = pickle.load(f)

with open('./whitelist/semantic_words.p', 'rb') as f:
    semantic_words = pickle.load(f)
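
In case the two pickle files ever need to be regenerated, a minimal sketch is shown below; it assumes NLTK's English stop word corpus and the positive/negative word lists from the opinion lexicon linked in the tokenizer cell further down. All file names and paths here are hypothetical, not taken from the original notebook.

import pickle
from nltk.corpus import stopwords  # requires: nltk.download('stopwords')

# Hypothetical sketch: pickle NLTK's English stop word list
stop_words = stopwords.words('english')
with open('./stopwords.p', 'wb') as f:
    pickle.dump(stop_words, f)

# Hypothetical sketch: build the whitelist from the positive/negative
# word lists of the opinion lexicon (file names assumed)
semantic_words = set()
for fname in ('./whitelist/positive-words.txt', './whitelist/negative-words.txt'):
    with open(fname, encoding='latin-1') as f:
        semantic_words.update(line.strip() for line in f
                              if line.strip() and not line.startswith(';'))

with open('./whitelist/semantic_words.p', 'wb') as f:
    pickle.dump(semantic_words, f)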



Transforming texts into bag-of-words models - trying different tokenizers


In [5]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()

# raw words
tokenizer = lambda text: text.split()

# words after Porter stemming 
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]

# Porter-stemmed words in whitelist
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]

# Snowball-stemmed words in whitelist
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]
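
To make the differences concrete, the tokenizers can be compared on a short made-up example sentence; this quick check is only illustrative and assumes the cell above has been run.

# Illustrative comparison of the tokenizers on a toy sentence
example = 'this movie was absolutely wonderful and the actors performed beautifully'

print(tokenizer(example))            # raw whitespace tokens
print(tokenizer_porter(example))     # Porter-stemmed tokens
print(tokenizer_snowball(example))   # Snowball-stemmed tokens
print(tokenizer_whitelist(example))  # only tokens found in the sentiment whitelist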



Looking at vocabulary sizes


In [6]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.sklearn import DenseTransformer

vect_1 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer)

vect_2 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_porter)
    
vect_3 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_snowball)  

vect_4 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_whitelist)  

vect_5 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_porter_wl)

vect_6 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_snowball_wl)

vect_7 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer)

vect_8 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_porter)
    
vect_9 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_snowball)

vect_10 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_whitelist)    

vect_11 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_porter_wl)

vect_12 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         max_features=5000,
                         tokenizer=tokenizer_snowball_wl)


pipelines = []
vectorizers = [vect_1, vect_2, vect_3, vect_4, vect_5, vect_6, vect_7, vect_8, vect_9, vect_10, vect_11, vect_12]
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('dense', DenseTransformer()),
                               ('clf', RandomForestClassifier(n_estimators=100))]))
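
Since the twelve vectorizers differ only in the vectorizer class (count vs. tf-idf) and the tokenizer, the block above could also be written more compactly; the following is an equivalent sketch that produces the same vectorizers in the same order and feeds the same pipeline loop.

# Equivalent, more compact construction of the 12 vectorizers (sketch)
preprocessor = lambda text: re.sub('[^a-zA-Z]', ' ', text.lower())
tokenizers = [tokenizer, tokenizer_porter, tokenizer_snowball,
              tokenizer_whitelist, tokenizer_porter_wl, tokenizer_snowball_wl]

vectorizers = [VecClass(binary=False,
                        stop_words=stop_words,
                        ngram_range=(1, 1),
                        preprocessor=preprocessor,
                        max_features=5000,
                        tokenizer=tok)
               for VecClass in (CountVectorizer, TfidfVectorizer)
               for tok in tokenizers]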

In [51]:
# Note: the vocabulary sizes below were recorded before max_features=5000
# was set on the vectorizers above, so they reflect the full vocabularies.

print('Vocabulary sizes\n')
labels = ['CountVec', 'CountVec porter', 'CountVec snowball', 'CountVec wl', 'CountVec porter+wl','CountVec snowball+wl',
          'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball', 'TfidfVec wl', 'TfidfVec porter+wl','TfidfVec snowball+wl',]

for label, v in zip(labels, vectorizers):
    v.fit(X_train)
    print('%s: %s' % (label, len(v.vocabulary_)))


Vocabulary sizes

CountVec: 73145
CountVec porter: 49632
CountVec snowball: 48958
CountVec wl: 5511
CountVec porter+wl: 3750
CountVec snowball+wl: 3531
TfidfVec: 73145
TfidfVec porter: 49632
TfidfVec snowball: 48958
TfidfVec wl: 5511
TfidfVec porter+wl: 3750
TfidfVec snowball+wl: 3531



Model Selection

Feature Extraction - Cross Validation Error


In [8]:
from sklearn import metrics
from sklearn import cross_validation

labels = ['CountVec', 'CountVec porter', 'CountVec snowball', 'CountVec wl', 'CountVec porter+wl','CountVec snowball+wl',
          'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball', 'TfidfVec wl', 'TfidfVec porter+wl','TfidfVec snowball+wl',]



d = {'Data':labels,
     'ROC AUC (%)':[],}

for i,clf in enumerate(pipelines):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=5)
    print('clf %s, %s: %s' % (i+1, labels[i], scores.mean()*100))
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))


clf 1, CountVec: 91.7881344
clf 2, CountVec porter: 91.9449152
clf 3, CountVec snowball: 91.9391536
clf 4, CountVec wl: 90.3942896
clf 5, CountVec porter+wl: 90.3207072
clf 6, CountVec snowball+wl: 90.2649024
clf 7, TfidfVec: 92.1736432
clf 8, TfidfVec porter: 92.157568
clf 9, TfidfVec snowball: 92.2451872
clf 10, TfidfVec wl: 90.7966944
clf 11, TfidfVec porter+wl: 90.5723472
clf 12, TfidfVec snowball+wl: 90.7430272

In [12]:
df_perform = pd.DataFrame(d)
df_perform = df_perform['ROC AUC (%)']
df_perform.index=(labels)
df_perform


Out[12]:
CountVec                91.79 (+/- 0.25)
CountVec porter         91.94 (+/- 0.11)
CountVec snowball       91.94 (+/- 0.11)
CountVec wl             90.39 (+/- 0.28)
CountVec porter+wl      90.32 (+/- 0.38)
CountVec snowball+wl    90.26 (+/- 0.29)
TfidfVec                92.17 (+/- 0.42)
TfidfVec porter         92.16 (+/- 0.19)
TfidfVec snowball       92.25 (+/- 0.32)
TfidfVec wl             90.80 (+/- 0.39)
TfidfVec porter+wl      90.57 (+/- 0.35)
TfidfVec snowball+wl    90.74 (+/- 0.39)
Name: ROC AUC (%), dtype: object

In [13]:
df_perform.to_csv('../results/rand_forest_featextr_1.csv')



ROC Curve


In [14]:
%matplotlib inline

In [20]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = Pipeline([('vect',   TfidfVectorizer(binary=False,
                                             stop_words=stop_words,
                                             ngram_range=(1,1),
                                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                             max_features = 5000,
                                             tokenizer=lambda text: [porter.stem(word) for word in text.split()]
                )),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])


cv = KFold(y_train.shape[0], n_folds=5, random_state=123)  # note: random_state only takes effect with shuffle=True

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# add the legend before saving so it is included in the exported figure
plt.legend(loc="lower right")
plt.savefig('../images/rand_forest_roc_tfidf_porter_1.eps', dpi=300)

plt.show()




Submission 1


In [21]:
clf_1 = Pipeline([('vect',   TfidfVectorizer(binary=False,
                                             stop_words=stop_words,
                                             ngram_range=(1,1),
                                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                             max_features=5000,
                                             tokenizer=lambda text: [porter.stem(word) for word in text.split()]
                )),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])

In [ ]:
X_test = df_test['review']

clf_1.fit(X_train, y_train)
result = clf_1.predict(X_test)

In [30]:
output = pd.DataFrame(data={'id':df_test['id'], 'sentiment':result} )
output.to_csv('../submissions/1_random_forest_tfidf_1.csv', index=False, quoting=3)



Hyperparameter Tuning


In [31]:
vect = TfidfVectorizer(binary=False,
                       stop_words=stop_words,
                       ngram_range=(1,1),
                       preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                       max_features=5000,
                       tokenizer=lambda text: [porter.stem(word) for word in text.split()])



Effect of the number of estimators


In [ ]:
pipe_1 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=50))])

pipe_2 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=200))])

pipe_3 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=400))])

n_estimators = [50, 200, 400]
for i, clf in enumerate([pipe_1, pipe_2, pipe_3]):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=5)
    print('clf %s, n_estimators=%s: %0.2f (+/- %0.2f)' % (i+1, n_estimators[i], scores.mean()*100, scores.std()*100))



GridSearch


In [35]:
X_train_feat = vect.fit_transform(X_train, y_train)
X_train_feat = X_train_feat.toarray()

In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report


clf_2 = RandomForestClassifier(n_estimators=50)


tuned_parameters = [
  {'criterion': ['gini', 'entropy'], 
   'max_features': ['auto', 'log2', 'sqrt'],
   'min_samples_split':[2,3], 
   'min_samples_leaf':[1,2]},
 ]


grid_search_1 = GridSearchCV(clf_2, 
                           tuned_parameters, 
                           n_jobs=2, 
                           scoring='roc_auc',
                           cv=5
                )

grid_search_1.fit(X_train_feat, y_train)

print("Best parameters set found on development set:")
print()
print(grid_search_1.best_estimator_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in grid_search_1.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
            % (mean_score, scores.std() / 2, params))
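
As a possible follow-up, the winning parameter combination can be inspected and reused directly; this is only a sketch and assumes the earlier cells (including the one defining X_test) have been run.

# Sketch: with refit=True (the GridSearchCV default), best_estimator_ is already
# refitted on the full training data and can be used for test-set predictions.
print(grid_search_1.best_params_)
print('Best CV ROC AUC: %.4f' % grid_search_1.best_score_)

X_test_feat = vect.transform(X_test).toarray()
test_pred = grid_search_1.best_estimator_.predict(X_test_feat)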