In [1]:
import os
import KaggleWord2VecUtility as util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np
In [2]:
# Re-import the KaggleWord2VecUtility helper module (bound to `util` by the
# import cell) so that changes made to it on disk are picked up without
# restarting the kernel. `reload` is the Python 2 built-in.
reload(util)
Out[2]:
In [3]:
train = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'labeledTrainData.tsv'), header=0, \
delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'testData.tsv'), header=0, delimiter="\t", \
quoting=3)
y = train["sentiment"]
print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange( 0, len(train["review"])):
traindata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0,len(test["review"])):
testdata.append(" ".join(util.KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))
print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None,
strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
stop_words = 'english')
X_all = traindata + testdata
lentrain = len(traindata)
print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
X = X_all[:lentrain]
X_test = X_all[lentrain:]
In [8]:
# X is the training slice of the TF-IDF matrix; display its dimensions
# (rows = training reviews, columns = TF-IDF features).
X.shape
Out[8]:
In [43]:
# L2-regularised logistic regression. C is the value being tuned; every other
# argument is spelled out explicitly.
lr_params = dict(
    penalty='l2',
    dual=True,
    tol=0.0001,
    C=15,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
    random_state=None,
)
model = LogisticRegression(**lr_params)

# Tuning log: score obtained for each value of C that was tried.
#
# Earlier runs (evaluation setup not recorded with the numbers):
#   C=0.7   -> 0.8936
#   C=0.95  -> 0.8952
#   C=1.5   -> 0.8984
#   C=2     -> 0.8992
#   C=5     -> 0.9
#
# Runs copied from the "20 Fold CV Score" printout, in the order they were made:
#   C=7     -> 0.964325504
#   C=5     -> 0.963949312
#   C=6     -> 0.964153344
#   C=8     -> 0.964437504
#   C=9     -> 0.964532224
#   C=12    -> 0.964656256
#   C=15    -> 0.964716544
In [36]:
print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X, y, cv=20, scoring='roc_auc'))
In [37]:
print "Retrain on all training data, predicting test labels...\n"
model.fit(X,y)
result = model.predict_proba(X_test)[:,1]
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
# Use pandas to write the comma-separated output file
output.to_csv(os.path.join(os.path.dirname('.'), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=3)
print "Wrote results to Bag_of_Words_model.csv"
In [31]:
from sklearn.cross_validation import train_test_split
# Carve a fixed 5% hold-out set off the labelled data; the seed keeps the
# split identical between runs.
holdout_split = train_test_split(X, y, test_size=0.05, random_state=42)
X_train, X_tester, y_train, y_tester = holdout_split
In [32]:
def plot_errors(train_error, test_error):
import matplotlib.pyplot as plt
%matplotlib inline
n = range(0, len(train_error))
plt.plot(n, [1 - i for i in train_error] , 'g-', n, [1 - i for i in test_error], 'r-', label='error', linewidth=2)
plt.show()
In [33]:
def learning_curves(clf_):
    """Fit ``clf_`` on growing fractions of the training split and plot the result.

    For each fraction 5%, 10%, ..., 95% of the notebook-level ``X_train`` /
    ``y_train`` a seeded sub-sample is drawn, ``clf_`` is fitted on it, and its
    score is measured both on that sub-sample and on the notebook-level
    hold-out set ``X_tester`` / ``y_tester``. Both scores are printed for
    every step and the two series are finally passed to ``plot_errors``.

    Parameters
    ----------
    clf_ : estimator providing ``fit(X, y)`` and ``score(X, y)``
        Refitted in place at every step; after the call it is left fitted on
        the last (largest) sub-sample only.

    Returns
    -------
    None
    """
    n_steps = 20  # fractions are multiples of 1/20 = 0.05
    train_scores = []
    test_scores = []

    # Drive the loop with an integer counter. Accumulating `p += 0.05` in a
    # float makes both the number of iterations and the split fractions
    # depend on rounding drift.
    for step in range(1, n_steps):
        train_fraction = step / float(n_steps)
        X_cur_train, _, y_cur_train, _ = train_test_split(
            X_train, y_train, test_size=(1 - train_fraction), random_state=0)
        clf_.fit(X_cur_train, y_cur_train)

        train_score = clf_.score(X_cur_train, y_cur_train)
        print('Training accuracy %s' % train_score)
        train_scores.append(train_score)

        test_score = clf_.score(X_tester, y_tester)
        print('Test accuracy %s' % test_score)
        test_scores.append(test_score)

    # plot_errors receives the scores and draws them as 1 - score.
    plot_errors(train_scores, test_scores)
In [34]:
# Run the learning-curve experiment with the logistic-regression model.
# NOTE: learning_curves calls fit() on the estimator it is given, so `model`
# is left fitted on a sub-sample of the training split after this cell runs;
# refit it on the full data before using it for predictions again.
learning_curves(model)
In [ ]: