In [10]:
import logging
import os
import sys

import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

import KaggleWord2VecUtility as util
from word2vec import Word2Vec, Sent2Vec, LineSentence

In [12]:
# Configure root logging at INFO level; each record carries a timestamp, the
# emitting thread's name and the level.
LOG_FORMAT = '%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Record the argv this process was started with (arguments are formatted lazily
# by the logging module; the emitted text is unchanged).
logging.info("running %s", " ".join(sys.argv))

In [17]:
# Train a Word2Vec model on the snippets corpus and persist it twice: once via
# model.save() (re-loaded by later cells through input_file + '.model') and
# once via save_word2vec_format().
input_file = 'original_rt_snippets.txt'

# os.path.dirname('.') is the empty string, so this path is resolved relative
# to the current working directory.
corpus_path = os.path.join(os.path.dirname('.'), 'data/stanfordSentimentTreebank', input_file)

# NOTE(review): hyper-parameter meanings presumably follow the gensim-style
# Word2Vec signature (size = vector length, sg=0 = CBOW) -- confirm against
# the local word2vec module.
model = Word2Vec(
    LineSentence(corpus_path),
    size=100,
    window=5,
    sg=0,
    min_count=5,
    workers=8
)

model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

In [ ]:
# Build a Sent2Vec model over the lines of sent.txt, backed by the word model
# saved as input_file + '.model', and write its vectors to sent.txt.vec.
# NOTE: this rebinds `model`, replacing the Word2Vec instance from the
# training cell.
sent_file = 'sent.txt'
word_model_path = input_file + '.model'
model = Sent2Vec(LineSentence(sent_file), model_file=word_model_path)
model.save_sent2vec_format(sent_file + '.vec')
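Generate the training set: write the train and test reviews to `all_data.txt` (one review per line), then build sentence vectors for them.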

In [33]:
# Load the train and test review tables from the tab-separated Kaggle files.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are kept as
# ordinary data instead of being interpreted as field delimiters.
train = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname('.'), 'data', 'testData.tsv'),
                   header=0, delimiter='\t', quoting=3)

# Write one review per line as space-joined tokens: every train review first,
# then every test review. Later cells depend on this order to split the
# sentence vectors back into train and test parts.
# The `with` block guarantees the file is flushed and closed even if
# tokenising or writing a review raises part-way through.
with open('all_data.txt', 'w') as f:
    for reviews in (train["review"], test["review"]):
        for review in reviews:
            # NOTE(review): the second argument is presumably
            # remove_stopwords=False -- confirm in KaggleWord2VecUtility.
            words = util.KaggleWord2VecUtility.review_to_wordlist(review, False)
            f.write('{0}\n'.format(' '.join(words)))

In [34]:
# Build a Sent2Vec model over every line of all_data.txt, backed by the word
# model saved as input_file + '.model'. NOTE: this rebinds `model`.
all_data_sentences = LineSentence('all_data.txt')
model = Sent2Vec(all_data_sentences, model_file=input_file + '.model')

In [35]:
# Persist the current Sent2Vec model's vectors as text; the following cell
# reads this file back line by line.
train_test_vec_path = 'train_test.vec'
model.save_sent2vec_format(train_test_vec_path)

In [60]:
# Read train_test.vec back into a list of float vectors, one per line.
# The file is opened in a `with` block so the handle is always closed.
train_test = []
with open('train_test.vec') as vec_file:
    # The first line is skipped (presumably a count/dimension header written
    # by save_sent2vec_format -- confirm). The default keeps an empty file
    # from raising StopIteration.
    next(vec_file, None)
    for line in vec_file:
        # Drop the first whitespace-separated token (presumably the sentence
        # id/label -- confirm) and parse the remaining fields as floats.
        # A list comprehension is used so every row is a real list.
        train_test.append([float(value) for value in line.split()[1:]])

In [67]:
# all_data.txt was written with the train reviews first, so the first
# len_train vectors are taken as the training part and the rest as the test
# part.
# NOTE(review): assumes train_test holds exactly one vector per review, in
# file order -- confirm that Sent2Vec does not skip or reorder lines.
len_train = len(train['review'])
X_deep, X_test_deep = train_test[:len_train], train_test[len_train:]

In [70]:
# Append the sentence-vector values to each existing feature row, in place.
# NOTE(review): X and X_test are not defined in any visible cell; they
# presumably come from earlier kernel state (lists of per-review feature
# lists) -- confirm before a Restart & Run All.
# Re-running this cell extends every row again, so run it exactly once.
for row_idx in range(len(X)):
    X[row_idx].extend(X_deep[row_idx])
for row_idx in range(len(X_test)):
    X_test[row_idx].extend(X_test_deep[row_idx])

In [65]:
# Logistic-regression classifier with an L2 penalty, solved in its dual
# formulation. Every setting is spelled out so the full configuration is
# visible in one place; random_state=None means no fixed seed is supplied.
lr_params = dict(
    penalty='l2',
    dual=True,
    tol=0.0001,
    C=15,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
    random_state=None,
)
lr = LogisticRegression(**lr_params)

In [66]:
print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(lr, X, y, cv=20, scoring='roc_auc'))


20 Fold CV Score:  0.571346048

In [ ]: