In [1]:
import sys

In [2]:
sys.path.append("/Users/dikien/Downloads/Passage/")  # make the locally cloned Passage repo importable

In [3]:
import numpy as np
import pandas as pd
from lxml import html
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer

In [4]:
def clean(texts):
    # Strip HTML markup with lxml, then lowercase and trim surrounding whitespace.
    return [html.fromstring(text).text_content().lower().strip() for text in texts]
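
A quick, illustrative check that clean() strips markup as expected; the sample string below is made up, not taken from the dataset:

In [ ]:
# Hypothetical example review; expect lowercased plain text with the tags removed.
clean(["<b>Great</b> movie!<br />Would watch again."])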

In [5]:
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t') 
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")


Training data loaded and cleaned.
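
Before tokenizing, a quick look at the raw table's shape and column names:

In [ ]:
# Inspect the loaded DataFrame: number of rows and the column names.
print(tr_data.shape)
print(tr_data.columns.tolist())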

In [6]:
print(trY[:5].shape)


(5,)

In [7]:
print(len(trX))
print(trY[0])


25000
1

In [8]:
# Keep tokens that appear in at least 10 reviews, cap the vocabulary at 100,000 words,
# and convert each review into a list of integer word indices.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")


Training data tokenized.

In [9]:
tokenizer.n_features


Out[9]:
18592
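
Each review is now a list of integer word indices into the tokenizer's vocabulary; a quick peek at the first encoded review shows the shape of the data the recurrent network will consume:

In [ ]:
# First ten token indices of the first review, and its length in tokens.
print(trX[0][:10])
print(len(trX[0]))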

In [ ]:
layers = [
    # Learn a 256-dimensional embedding for each word index in the tokenizer's vocabulary.
    Embedding(size=256, n_features=tokenizer.n_features),
    # Single gated recurrent layer; seq_output=False passes only the final state forward.
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
    # Sigmoid output unit giving the probability of positive sentiment.
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

# Binary cross-entropy loss with Adadelta updates; train for 10 passes over the data.
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trX, trY, n_epochs=10)
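
Training the gated recurrent network on 25,000 reviews can take a while, so it may be worth persisting the fitted model before moving on. A minimal sketch, assuming the save/load helpers from passage.utils; the filename is arbitrary:

In [ ]:
# Persist the trained model so prediction can be rerun without retraining.
from passage.utils import save, load

save(model, 'gru_sentiment.pkl')
# model = load('gru_sentiment.pkl')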

In [ ]:
te_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()

# Write the Kaggle submission: one predicted sentiment probability per review id.
pd.DataFrame({'id': ids, 'sentiment': pr_teX}, columns=['id', 'sentiment']).to_csv('submission.csv', index=False)
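
A final sanity check on the written file, reading back the header and the first few predictions:

In [ ]:
# Confirm the submission has the expected 'id' and 'sentiment' columns.
print(pd.read_csv('submission.csv').head())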