In [38]:
import os
import KaggleWord2VecUtility as util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

In [39]:
# Re-execute the already-imported helper module so that edits made to
# KaggleWord2VecUtility.py take effect without restarting the kernel.
# `reload` here is the Python 2 builtin.
reload(util)


Out[39]:
<module 'KaggleWord2VecUtility' from 'KaggleWord2VecUtility.py'>

In [41]:
train = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'labeledTrainData.tsv'), header=0, \
                delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'testData.tsv'), header=0, delimiter="\t", \
               quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange( 0, len(train["review"])):
    traindata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0,len(test["review"])):
    testdata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)

X = X_all[:lentrain]
X_test = X_all[lentrain:]


 Cleaning and parsing movie reviews...

vectorizing...  fitting pipeline... 

In [42]:
# Sanity check on the training slice of the TF-IDF matrix:
# (number of training reviews, number of n-gram features).
X.shape


Out[42]:
(25000, 309827)

In [60]:
# L2-penalised logistic regression solved in its dual formulation.
# C is the value being tuned; see the log below.  random_state is left unset.
logreg_params = {
    'penalty': 'l2',
    'dual': True,
    'tol': 0.0001,
    'C': 6,
    'fit_intercept': True,
    'intercept_scaling': 1.0,
    'class_weight': None,
    'random_state': None,
}
model = LogisticRegression(**logreg_params)

# Tuning log -- score obtained for each value of C.
#
# Earlier runs (the metric / evaluation setup for these is not recorded):
#   .8936   C=0.7
#   0.8952  C=0.95
#   0.8984  C=1.5
#   0.8992  C=2
#   0.9     C=5
#
# Mean ROC AUC over 20-fold cross-validation:
#   20 Fold CV Score:  0.963949312  C=5
#   20 Fold CV Score:  0.964153344  C=6
#   20 Fold CV Score:  0.964325504  C=7
#   20 Fold CV Score:  0.964437504  C=8
#   20 Fold CV Score:  0.964532224  C=9
#   20 Fold CV Score:  0.964656256  C=12
#   20 Fold CV Score:  0.964716544  C=15

Checking the model with cross-validation and writing out its predictions


In [61]:
print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc'))


20 Fold CV Score:  0.964153344

In [59]:
print "Retrain on all training data, predicting test labels...\n"
model.fit(X,y)
result = model.predict_proba(X_test)[:,1]
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file                                                                                                                                                        
output.to_csv(os.path.join(os.path.dirname('.'), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=3)

print "Wrote results to Bag_of_Words_model.csv"


Retrain on all training data, predicting test labels...

Wrote results to Bag_of_Words_model.csv

Learning Curves


In [48]:
from sklearn.cross_validation import train_test_split

# Set aside 5% of the labelled data as a fixed evaluation set for the
# learning curves; random_state=42 makes the split repeatable.
holdout_split = train_test_split(X, y, test_size=0.05, random_state=42)
X_train, X_tester, y_train, y_tester = holdout_split

In [49]:
def plot_errors(train_error, test_error):
    import matplotlib.pyplot as plt
    %matplotlib inline

    n = range(0, len(train_error))
    
    plt.plot(n, [1 - i for i in train_error] , 'g-', n, [1 - i for i in test_error], 'r-', label='error', linewidth=2)
    plt.show()

In [50]:
def learning_curves(clf_):
    """Fit ``clf_`` on growing fractions of the training split and plot the result.

    For each fraction p = 0.05, 0.10, ..., 0.95 a subset of
    ``(X_train, y_train)`` is drawn with ``train_test_split`` (fixed
    ``random_state=0``).  ``clf_`` is fitted on that subset and scored both on
    the subset itself and on the fixed evaluation set
    ``(X_tester, y_tester)``.  Both scores are printed at every step and the
    two score series are finally handed to ``plot_errors``.

    Parameters
    ----------
    clf_ : estimator providing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place at every step; on return it holds the fit from the
        last (largest) subset.

    Reads the notebook-level names ``X_train``, ``y_train``, ``X_tester`` and
    ``y_tester``.  Returns None.
    """
    step = 0.05
    n_steps = int(round(1.0 / step))

    train_scores = []
    test_scores = []
    # Drive the loop with an integer counter: repeatedly adding 0.05 to a
    # float and testing `< 1.0` makes the iteration count depend on rounding.
    for k in range(1, n_steps):
        p = k * step
        X_cur_train, _, y_cur_train, _ = train_test_split(
            X_train, y_train, test_size=(1 - p), random_state=0)
        clf_.fit(X_cur_train, y_cur_train)

        train_score = clf_.score(X_cur_train, y_cur_train)
        print('Training accuracy %s' % train_score)
        train_scores.append(train_score)

        test_score = clf_.score(X_tester, y_tester)
        print('Test accuracy %s' % test_score)
        test_scores.append(test_score)

    # plot_errors converts each score to an error (1 - score) before drawing.
    plot_errors(train_scores, test_scores)

In [55]:
learning_curves(model)


Training accuracy 1.0
Test accuracy 0.8136
Training accuracy 0.997894736842
Test accuracy 0.856
Training accuracy 0.995227400337
Test accuracy 0.8664
Training accuracy 0.992631578947
Test accuracy 0.8744
Training accuracy 0.990399191511
Test accuracy 0.8736
Training accuracy 0.989754385965
Test accuracy 0.8768
Training accuracy 0.987969201155
Test accuracy 0.8784
Training accuracy 0.986209074639
Test accuracy 0.8896
Training accuracy 0.984373537943
Test accuracy 0.8896
Training accuracy 0.983242105263
Test accuracy 0.8896
Training accuracy 0.982238554586
Test accuracy 0.8888
Training accuracy 0.980280701754
Test accuracy 0.8944
Training accuracy 0.979788819071
Test accuracy 0.8912
Training accuracy 0.979368421053
Test accuracy 0.8952
Training accuracy 0.978329216259
Test accuracy 0.8944
Training accuracy 0.977684210526
Test accuracy 0.8952
Training accuracy 0.976717689602
Test accuracy 0.8952
Training accuracy 0.976280701754
Test accuracy 0.8936
Training accuracy 0.975135183051
Test accuracy 0.8992

In [ ]: