In [3]:
import sys
In [2]:
sys.path.append("/Users/dikien/Downloads/Passage/")
In [1]:
import numpy as np
import pandas as pd
from lxml import html
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer
In [2]:
def clean(texts):
    # Strip HTML markup, lowercase, and trim surrounding whitespace for each review
    return [html.fromstring(text).text_content().lower().strip() for text in texts]
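As a quick sanity check (a hypothetical cell, not part of the original run), passing a small HTML snippet through clean shows that text_content() drops the tags before lowercasing:

In [ ]:
# Hypothetical example: <b> and <br /> tags are stripped by text_content()
print(clean(["<b>Great</b> movie!<br /><br />Loved it."]))
# expected output: ['great movie!loved it.']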
In [3]:
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t')
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")
In [5]:
print(trY[:5].shape)
In [11]:
print(len(trX))
print(trY[0])
In [12]:
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)
print("Training data tokenized.")
In [13]:
tokenizer.n_features
Out[13]:
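To see what the tokenizer produced, a hypothetical inspection cell (not in the original) can look at the first encoded review; Passage's Tokenizer returns each review as a list of integer token indices:

In [ ]:
# Hypothetical inspection: tokenization keeps one entry per review,
# but each review is now a list of token indices rather than a string.
print(len(trX))
print(trX[0][:10])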
In [ ]:
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),  # 256-dim learned word embeddings
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),  # seq_output=False: only the final hidden state is passed on
    Dense(size=1, activation='sigmoid', init='orthogonal')  # single sigmoid unit for binary sentiment
]
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))  # 'bce' = binary cross-entropy
model.fit(trX, trY, n_epochs=10)
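The notebook trains on the full labeled set and never measures held-out accuracy. A minimal validation sketch, assuming scikit-learn is available (it is not imported anywhere in the original), would split off part of the training data and score AUC, the metric this Kaggle competition uses:

In [ ]:
# Hypothetical validation cell, assuming scikit-learn is installed.
# (Older sklearn versions expose train_test_split via sklearn.cross_validation.)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

trX_sub, vaX, trY_sub, vaY = train_test_split(trX, trY, test_size=0.2, random_state=42)

# Fresh layer objects so the validation model does not share weights with `model`.
val_layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]
val_model = RNN(layers=val_layers, cost='bce', updater=Adadelta(lr=0.5))
val_model.fit(trX_sub, trY_sub, n_epochs=10)
print(roc_auc_score(vaY, val_model.predict(vaX).flatten()))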
In [ ]:
te_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)  # same HTML cleanup as the training reviews
teX = tokenizer.transform(teX)         # reuse the fitted vocabulary; do not re-fit on test data
pr_teX = model.predict(teX).flatten()  # predicted probability of positive sentiment
pd.DataFrame({'id': ids, 'sentiment': pr_teX}).to_csv('submission.csv', index=False, columns=['id', 'sentiment'])
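A last hypothetical cell (not in the original) reads the file back to confirm the two-column id/sentiment format Kaggle expects:

In [ ]:
# Hypothetical sanity check of the submission file.
print(pd.read_csv('submission.csv').head())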