In [10]:
import sys
import os
import logging
import KaggleWord2VecUtility as util
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from word2vec import Word2Vec, Sent2Vec, LineSentence

In [12]:
# Configure the root logger: INFO level, with timestamp, thread name and
# level name prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'
)
# Log the argument vector the interpreter/kernel was started with.
launch_command = " ".join(sys.argv)
logging.info("running %s" % launch_command)

In [17]:
# Train a word-vector model on the snippet corpus and persist it in two forms.
input_file = 'original_rt_snippets.txt'

# os.path.dirname('.') evaluates to '', so the corpus path is resolved
# relative to the current working directory.
corpus_path = os.path.join(os.path.dirname('.'), 'data/stanfordSentimentTreebank', input_file)
corpus = LineSentence(corpus_path)

# NOTE(review): sg=0 presumably selects CBOW as in gensim -- confirm against
# the local word2vec module.
model = Word2Vec(corpus, size=100, window=5, sg=0, min_count=5, workers=8)

# The '.model' file is what the Sent2Vec cells later load through model_file;
# the '.vec' file is a second export written via save_word2vec_format.
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

In [ ]:
# Build sentence vectors for sent.txt on top of the word model saved earlier
# as input_file + '.model'. Note that this rebinds the global `model`.
sent_file = 'sent.txt'
word_model_path = input_file + '.model'
sentences = LineSentence(sent_file)
model = Sent2Vec(sentences, model_file=word_model_path)
model.save_sent2vec_format(sent_file + '.vec')
Generate the training set: write each train and test review to all_data.txt as one line of space-separated words.

In [27]:
# Load the train and test review tables from the local data directory.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside reviews are kept as
# literal characters instead of being treated as field delimiters.
# os.path.dirname('.') evaluates to '', so data_dir is simply 'data'.
data_dir = os.path.join(os.path.dirname('.'), 'data')
train = pd.read_csv(os.path.join(data_dir, 'labeledTrainData.tsv'), header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(data_dir, 'testData.tsv'), header=0,
                   delimiter='\t', quoting=3)

# Write every review (train first, then test) to all_data.txt, one review per
# line, as space-separated words.
#
# Fixes relative to the previous version of this cell:
#   * f.write() accepts a single string; the old call passed the template and
#     the text as two arguments and raised TypeError. The template is now
#     applied with str.format().
#   * the file is opened in a `with` block so it is closed even if
#     review_to_wordlist raises part-way through.
with open('all_data.txt', 'w') as f:
    for reviews in (train["review"], test["review"]):
        for review in reviews:
            # NOTE(review): the second argument (False) is passed through
            # unchanged; presumably it disables stop-word removal -- confirm
            # against KaggleWord2VecUtility.
            words = util.KaggleWord2VecUtility.review_to_wordlist(review, False)
            f.write('{0}\n'.format(' '.join(words)))

In [22]:
# Build sentence vectors for the combined review file written above, reusing
# the word model saved as input_file + '.model'. Rebinds the global `model`.
all_data_sentences = LineSentence('all_data.txt')
model = Sent2Vec(all_data_sentences, model_file=input_file + '.model')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-f37265c7463c> in <module>()
      1 model = Sent2Vec(traindata, model_file=input_file + '.model')
----> 2 model.save_sent2vec_format(sent_file + '.vec')

NameError: name 'sent_file' is not defined

In [23]:
# Persist the sentence vectors currently held by `model`.
sent_vec_path = 'train.vec'
model.save_sent2vec_format(sent_vec_path)

In [ ]: