In [1]:
import os
import KaggleWord2VecUtility as util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

In [2]:
reload(util)


Out[2]:
<module 'KaggleWord2VecUtility' from 'KaggleWord2VecUtility.pyc'>
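The KaggleWord2VecUtility module itself is not shown in this notebook. Its review_to_wordlist helper does roughly the following; this is a minimal sketch assuming the BeautifulSoup-based cleaning from the Kaggle "Bag of Words Meets Bags of Popcorn" tutorial, not the actual module, and the second argument is assumed here to toggle stopword removal (it is passed as False above, leaving stop words for the vectorizer to handle).

In [ ]:
# Hypothetical sketch of the helper used above, not the real KaggleWord2VecUtility
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
    # Strip HTML markup, keep letters only, lowercase, split into tokens
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words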

In [3]:
# Read the labelled training set and the unlabelled test set; the reviews are
# tab-separated and quoting=3 tells pandas to treat quote characters literally
train = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'labeledTrainData.tsv'), header=0, \
                delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'testData.tsv'), header=0, delimiter="\t", \
               quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
# Clean each review (strip HTML, tokenise) and re-join the tokens into a single
# space-separated string so TfidfVectorizer can consume it
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
# Word-level TF-IDF over unigrams and bigrams; min_df=3 drops terms seen in
# fewer than three documents and sublinear_tf replaces tf with 1 + log(tf)
tfv = TfidfVectorizer(min_df=3, max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
# Fit the vocabulary and IDF weights on train + test together, then slice the
# resulting sparse matrix back into its train and test portions
tfv.fit(X_all)
X_all = tfv.transform(X_all)

X = X_all[:lentrain]
X_test = X_all[lentrain:]


Cleaning and parsing movie reviews...

vectorizing...  fitting pipeline... 
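Note that the vectorizer's vocabulary and IDF weights are fit on the concatenation of the training and test reviews, a transductive trick that is common in Kaggle competitions (the test text is available, only its labels are not). A more conventional setup fits on the training text only; a rough sketch reusing the same parameters:

In [ ]:
# Sketch: fit the vocabulary and IDF weights on the training reviews only,
# then apply the same transform to the test reviews
tfv_train_only = TfidfVectorizer(min_df=3, max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words='english')
X_train_only = tfv_train_only.fit_transform(traindata)
X_test_only = tfv_train_only.transform(testdata)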

In [8]:
X.shape


Out[8]:
(25000, 309827)
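The 309,827 columns are the unigrams and bigrams that survived min_df=3. To sanity-check what the vectorizer learned, something like the following should work (a sketch; get_feature_names matches the older sklearn API used throughout this notebook):

In [ ]:
# Sketch: peek at a few learned vocabulary terms and the matrix sparsity
feature_names = tfv.get_feature_names()
print len(feature_names), feature_names[:10]
print "average non-zeros per training review:", X.nnz / float(X.shape[0])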

In [43]:
# dual=True is the sensible choice here since n_features (~310k) far exceeds
# n_samples (25k); C was tuned by hand, with scores from earlier runs recorded
# in the comments below
model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                         C=15, fit_intercept=True, intercept_scaling=1.0,
                         class_weight=None, random_state=None)
# 0.8936 C=0.7
# 0.8952 C=0.95
# 0.8984 C=1.5
# 0.8992 C=2
# 0.9    C=5

# 20 Fold CV Score:  0.964325504 C=7
# 20 Fold CV Score:  0.963949312 C=5
# 20 Fold CV Score:  0.964153344 C=6
# 20 Fold CV Score:  0.964437504 C=8
# 20 Fold CV Score:  0.964532224 C=9
# 20 Fold CV Score:  0.964656256 C=12
# 20 Fold CV Score:  0.964716544 C=15
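The C values above were tried by hand; the same search can be automated with a grid search scored by ROC AUC. A sketch using the same-era sklearn API as the rest of this notebook (sklearn.grid_search was later renamed sklearn.model_selection):

In [ ]:
# Sketch: automate the manual search over C with a cross-validated grid search
from sklearn.grid_search import GridSearchCV

grid = GridSearchCV(
    LogisticRegression(penalty='l2', dual=True),
    param_grid={'C': [1, 2, 5, 7, 9, 12, 15, 20]},
    scoring='roc_auc', cv=5)
grid.fit(X, y)
print grid.best_params_, grid.best_score_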

Checking the model with cross-validation, then writing out the test predictions


In [36]:
print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc'))


20 Fold CV Score:  0.965067392
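The mean alone hides the fold-to-fold spread; printing the full score array is a cheap sanity check (sketch):

In [ ]:
# Sketch: inspect the per-fold AUCs rather than only their mean
scores = cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc')
print scores.min(), scores.mean(), scores.max(), scores.std()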

In [37]:
print "Retrain on all training data, predicting test labels...\n"
model.fit(X,y)
result = model.predict_proba(X_test)[:,1]
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file                                                                                                                                                        
output.to_csv(os.path.join(os.path.dirname('.'), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=3)

print "Wrote results to Bag_of_Words_model.csv"


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-37-3156af7650eb> in <module>()
      1 print "Retrain on all training data, predicting test labels...\n"
      2 model.fit(X,y)
----> 3 result = model.predict_proba(X_test)[:,1]
      4 output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
      5 

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'
Retrain on all training data, predicting test labels...

(The traceback above is stale output from an earlier run in which model was a LinearSVC, which does not implement predict_proba; with the LogisticRegression defined above, the cell should run through and write Bag_of_Words_model.csv.)
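If you do swap in a classifier without predict_proba, the raw scores from decision_function rank the reviews just as well for an AUC-scored submission, or the classifier can be wrapped to produce calibrated probabilities. A sketch of both options (CalibratedClassifierCV assumes sklearn >= 0.16):

In [ ]:
# Sketch: two ways to get submission scores from a margin-based classifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svc = LinearSVC(C=1.0)
svc.fit(X, y)
result = svc.decision_function(X_test)           # raw margins are fine for ranking/AUC

calibrated = CalibratedClassifierCV(LinearSVC(C=1.0), method='sigmoid', cv=5)
calibrated.fit(X, y)
result = calibrated.predict_proba(X_test)[:, 1]  # calibrated probabilities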

Learning Curves


In [31]:
from sklearn.cross_validation import train_test_split
X_train, X_tester, y_train, y_tester = train_test_split(X, y, test_size=0.05, random_state=42)

In [32]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_errors(train_acc, test_acc):
    # The inputs are accuracies; plot error = 1 - accuracy for the training
    # subset (green) and the fixed hold-out set (red) at each step
    n = range(0, len(train_acc))
    plt.plot(n, [1 - i for i in train_acc], 'g-',
             n, [1 - i for i in test_acc], 'r-', linewidth=2)
    plt.show()

In [33]:
def learning_curves(clf_):
    # Train on increasing fractions (5%, 10%, ..., 95%) of the training split and
    # record accuracy on that subset and on the fixed hold-out set X_tester
    p = 0.05
    test_errors = []
    train_errors = []
    while p < 1.0:
        X_cur_train, _, y_cur_train, _ = train_test_split(X_train, y_train, test_size=(1 - p), random_state=0)
        clf_.fit(X_cur_train, y_cur_train)
        train_error = clf_.score(X_cur_train, y_cur_train)
        print('Training accuracy %s' % train_error)
        train_errors.append(train_error)
        test_error = clf_.score(X_tester, y_tester)
        print('Test accuracy %s' % test_error)
        test_errors.append(test_error)
        p += 0.05
    plot_errors(train_errors, test_errors)

In [34]:
learning_curves(model)


Training accuracy 1.0
Test accuracy 0.84
Training accuracy 1.0
Test accuracy 0.8624
Training accuracy 1.0
Test accuracy 0.8744
Training accuracy 0.999578947368
Test accuracy 0.8792
Training accuracy 0.999663129527
Test accuracy 0.8768
Training accuracy 0.999719298246
Test accuracy 0.8872
Training accuracy 0.999639076035
Test accuracy 0.8928
Training accuracy 0.999578903042
Test accuracy 0.8952
Training accuracy 0.999532141855
Test accuracy 0.8976
Training accuracy 0.999578947368
Test accuracy 0.8928
Training accuracy 0.999540652274
Test accuracy 0.8976
Training accuracy 0.999368421053
Test accuracy 0.8952
Training accuracy 0.999546544018
Test accuracy 0.9016
Training accuracy 0.999338345865
Test accuracy 0.9056
Training accuracy 0.999270154952
Test accuracy 0.9
Training accuracy 0.999105263158
Test accuracy 0.904
Training accuracy 0.999058800218
Test accuracy 0.9016
Training accuracy 0.99901754386
Test accuracy 0.9032
Training accuracy 0.998980586827
Test accuracy 0.9056
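sklearn also ships a ready-made version of this procedure; the following is roughly equivalent to the hand-rolled loop above (a sketch, using the pre-0.18 module path to match the other imports in this notebook):

In [ ]:
# Sketch: the same experiment via sklearn's built-in learning_curve helper
from sklearn.learning_curve import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    model, X, y, train_sizes=np.linspace(0.05, 1.0, 19), cv=5, scoring='accuracy')
print train_sizes
print train_scores.mean(axis=1)
print valid_scores.mean(axis=1)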

In [ ]: