In [1]:
# Start the wall-clock timer before importing datapot, so the runtime printed
# in the last cell also includes the library's import time.
from time import time
start = time()
import datapot as dp
In [2]:
# The DataPot instance that the following cells configure, fit and use to
# transform the data. Note the name `datapot` is the instance; the module
# itself is only reachable through the alias `dp`.
datapot = dp.DataPot()
In [3]:
# Open the training reviews. The same handle is passed to detect(), fit() and
# transform() in later cells, so it has to stay open across cells.
# NOTE(review): it is never closed in this notebook, and the three consumers
# presumably rewind the file themselves -- confirm against datapot.
train_file = open('../data/imdbHTMLReviewsTrainData.jsonlines')
In [4]:
# Let DataPot inspect the training file; the return value is shown as the cell
# output. NOTE(review): presumably this assigns default transformers to each
# detected field (later cells remove some by index) -- confirm.
datapot.detect(train_file)
Out[4]:
To apply word2vec with the word-vector model pre-trained on the Google News corpus (3 billion running words), download the model here:
https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download
In [5]:
from datapot.transformer.text_transformer import Word2VecTransformer
In [6]:
# Attach a Word2Vec transformer to the 'review' field. The argument is a path
# relative to the notebook's working directory, so the downloaded model archive
# must be placed there.
datapot.add_transformer('review', Word2VecTransformer('GoogleNews-vectors-negative300.bin.gz'))
Out[6]:
In [7]:
from datapot.transformer.identity_transformer import IdentityTransformer
# Register an IdentityTransformer for the 'id' field; later cells read an 'id'
# column from the transformed frames to build the submission.
# NOTE(review): presumably it passes the value through unchanged -- confirm.
datapot.add_transformer('id', IdentityTransformer())
Out[7]:
In [8]:
# Display the DataPot object to inspect its state after the additions above.
datapot
Out[8]:
In [9]:
# Drop the transformer at index 0 from the 'sentiment' field and then from the
# 'review' field; the calls are chained, so remove_transformer must return an
# object that itself supports remove_transformer.
# NOTE(review): index 0 presumably refers to an auto-detected transformer, not
# the Word2Vec one added to 'review' earlier -- verify with the repr above.
datapot.remove_transformer('sentiment', 0).remove_transformer('review', 0)
Out[9]:
In [10]:
# Fit the configured transformers on the training file (same handle that was
# passed to detect()).
datapot.fit(train_file)
Out[10]:
In [11]:
# Turn the training file into a feature table; later cells treat `df` as a
# DataFrame with 'sentiment' and 'id' columns next to the feature columns.
df = datapot.transform(train_file)
In [12]:
# Preview the first rows of the transformed training data.
df.head()
Out[12]:
In [13]:
# Separate model inputs from the target: every column except the label and the
# row identifier is used as a feature.
X = df.drop(labels=['sentiment', 'id'], axis='columns')
y = df['sentiment']
In [14]:
from xgboost import XGBClassifier

# Build the classifier with the library's default settings, then train it on
# the training features and labels. The fitted estimator is whatever fit()
# returns, exactly as in a chained XGBClassifier().fit(X, y) call.
clf = XGBClassifier()
clf = clf.fit(X, y)
In [15]:
# Run the already-fitted DataPot over the held-out test reviews.
# The file is read from the same ../data/ directory as the training file
# opened earlier (the previous 'data/...' path resolved to a different
# location relative to the notebook), and the `with` block closes the handle
# as soon as transform() has returned instead of leaking it.
# NOTE(review): this assumes transform() fully materialises its result before
# returning -- the training cell already uses the result as a DataFrame.
with open('../data/imdbHTMLReviewsTestData.jsonlines') as test_f:
    test_df = datapot.transform(test_f)
In [16]:
# Keep the identifiers aside and strip the non-feature columns, the same two
# columns that were dropped when the training matrix was built.
test_id = test_df['id']
X_test = test_df.drop(labels=['id', 'sentiment'], axis='columns')
In [17]:
import pandas as pd

# Assemble the submission in a single step -- the test ids next to the
# predicted labels -- and write it to disk without the pandas index column.
answer_df = pd.DataFrame({
    'id': test_df.id,
    'sentiment': clf.predict(X_test),
})
answer_df.to_csv("baseline_answer.csv", index=False)
In [18]:
# Total wall-clock seconds since `start` was recorded in the first cell.
print(time() - start)