In [1]:
import numpy as np
import pandas as pd
from lxml import html

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer


Using gpu device 0: GRID K520

In [2]:
def clean(texts):
    # Strip HTML markup with lxml and normalize each review to lowercase, trimmed text.
    return [html.fromstring(text).text_content().lower().strip() for text in texts]
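
A quick sanity check (not from the original run) of what clean does to a snippet of raw review markup:

clean(['<p>This movie was <b>GREAT</b>! </p>'])
# -> ['this movie was great!']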

In [3]:
# Load the labeled training reviews (tab-separated) and clean them.
tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values

print("Training data loaded and cleaned.")

# Build the vocabulary (min_df=10, max_features=100000) and map each review
# to a sequence of integer token ids.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")

# Embedding -> single gated recurrent layer -> sigmoid output for binary sentiment.
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

# Binary cross-entropy cost, optimized with Adadelta.
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))


Training data loaded and cleaned.
Training data tokenized.
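
Each entry of trX should now be a sequence of integer token ids, one per word kept by the tokenizer; a quick spot-check (not part of the original run):

print(len(trX))      # number of training reviews
print(trX[0][:15])   # first 15 token ids of the first tokenized review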

In [5]:
tokenizer.n_features


Out[5]:
18589
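
So the vocabulary ends up at 18,589 token ids. The fit in the next cell trains on all of the labeled data and only reports training cost, so it gives no view of generalization; one option would be to carve out a held-out split first. A minimal sketch (not part of the original run, with an arbitrary 10% split):

n_valid = len(trY) // 10                        # hold out ~10% of the labeled reviews
idx = np.random.permutation(len(trY))
va_idx, fit_idx = idx[:n_valid], idx[n_valid:]
vaX = [trX[i] for i in va_idx]                  # trX is a plain list, so index it explicitly
vaY = trY[va_idx]
fitX = [trX[i] for i in fit_idx]
fitY = trY[fit_idx]
# model.fit(fitX, fitY, n_epochs=30), then compare model.predict(vaX) against vaY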

In [7]:
# Train on the full labeled set for 30 epochs.
model.fit(trX, trY, n_epochs=30)

# Load and clean the unlabeled test reviews, then map them through the fitted tokenizer.
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)

# Predicted probability that each test review is positive.
pr_teX = model.predict(teX).flatten()

# Write the Kaggle submission file: one row per review id with its predicted sentiment.
pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv('submission.csv', index=False, header=["id", "sentiment"])


Epoch 0 Seen 24621 samples Avg cost 0.5218 Time elapsed 205 seconds
Epoch 1 Seen 49242 samples Avg cost 0.3855 Time elapsed 414 seconds
Epoch 2 Seen 73863 samples Avg cost 0.2971 Time elapsed 656 seconds
Epoch 3 Seen 98484 samples Avg cost 0.2663 Time elapsed 897 seconds
Epoch 4 Seen 123105 samples Avg cost 0.2386 Time elapsed 1138 seconds
Epoch 5 Seen 147726 samples Avg cost 0.2117 Time elapsed 1381 seconds
Epoch 6 Seen 172347 samples Avg cost 0.1950 Time elapsed 1624 seconds
Epoch 7 Seen 196968 samples Avg cost 0.1830 Time elapsed 1854 seconds
Epoch 8 Seen 221589 samples Avg cost 0.1651 Time elapsed 2062 seconds
Epoch 9 Seen 246210 samples Avg cost 0.1551 Time elapsed 2268 seconds
Epoch 10 Seen 270831 samples Avg cost 0.1457 Time elapsed 2507 seconds
Epoch 11 Seen 295452 samples Avg cost 0.1305 Time elapsed 2743 seconds
Epoch 12 Seen 320073 samples Avg cost 0.1199 Time elapsed 2975 seconds
Epoch 13 Seen 344694 samples Avg cost 0.1115 Time elapsed 3208 seconds
Epoch 14 Seen 369315 samples Avg cost 0.0971 Time elapsed 3439 seconds
Epoch 15 Seen 393936 samples Avg cost 0.0854 Time elapsed 3679 seconds
Epoch 16 Seen 418557 samples Avg cost 0.0806 Time elapsed 3922 seconds
Epoch 17 Seen 443178 samples Avg cost 0.0742 Time elapsed 4159 seconds
Epoch 18 Seen 467799 samples Avg cost 0.0656 Time elapsed 4393 seconds
Epoch 19 Seen 492420 samples Avg cost 0.0625 Time elapsed 4628 seconds
Epoch 20 Seen 517041 samples Avg cost 0.0583 Time elapsed 4856 seconds
Epoch 21 Seen 541662 samples Avg cost 0.0497 Time elapsed 5064 seconds
Epoch 22 Seen 566283 samples Avg cost 0.0439 Time elapsed 5269 seconds
Epoch 23 Seen 590904 samples Avg cost 0.0427 Time elapsed 5506 seconds
Epoch 24 Seen 615525 samples Avg cost 0.0339 Time elapsed 5745 seconds
Epoch 25 Seen 640146 samples Avg cost 0.0360 Time elapsed 5987 seconds
Epoch 26 Seen 664767 samples Avg cost 0.0303 Time elapsed 6228 seconds
Epoch 27 Seen 689388 samples Avg cost 0.0292 Time elapsed 6470 seconds
Epoch 28 Seen 714009 samples Avg cost 0.0259 Time elapsed 6705 seconds
Epoch 29 Seen 738630 samples Avg cost 0.0215 Time elapsed 6915 seconds
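
The average training cost falls from 0.52 to about 0.02 over the 30 epochs (just under two hours on the GRID K520), but without a held-out split this only shows the network fitting its training data. A quick post-hoc sanity check, assuming model.predict accepts the same list-of-sequences format used above, could score a subsample of the training set:

# Sketch (not from the original run): accuracy on a 2,000-review training subsample.
pr_trX = model.predict(trX[:2000]).flatten()
print(np.mean((pr_trX > 0.5) == trY[:2000]))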