In [1]:
import sys

In [2]:
sys.path.append("/Users/dikien/Downloads/Passage/")  # make the locally cloned Passage repo importable

In [3]:
import numpy as np
import pandas as pd
from lxml import html
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer

In [4]:
def clean(texts):
    # Strip HTML markup with lxml, then lowercase and trim surrounding whitespace.
    return [html.fromstring(text).text_content().lower().strip() for text in texts]
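
A quick, illustrative check that clean() strips markup as expected; the sample string below is made up, not taken from the dataset:

In [ ]:
# Hypothetical example review; expect lowercased plain text with the tags removed.
clean(["<b>Great</b> movie!<br />Would watch again."])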

In [5]:
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t') 
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")


Training data loaded and cleaned.
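
Before tokenizing, a quick look at the raw table's shape and column names:

In [ ]:
# Inspect the loaded DataFrame: number of rows and the column names.
print(tr_data.shape)
print(tr_data.columns.tolist())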

In [6]:
print(trY[:5].shape)


(5,)

In [7]:
print(len(trX))
print(trY[0])


25000
1

In [8]:
# Keep tokens that appear in at least 10 reviews, cap the vocabulary at 100,000 words,
# and convert each review into a list of integer word indices.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")


Training data tokenized.

In [9]:
tokenizer.n_features


Out[9]:
18592
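
Each review is now a list of integer word indices into the tokenizer's vocabulary; a quick peek at the first encoded review shows the shape of the data the recurrent network will consume:

In [ ]:
# First ten token indices of the first review, and its length in tokens.
print(trX[0][:10])
print(len(trX[0]))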

In [ ]:
layers = [
    # Learn a 256-dimensional embedding for each word index in the tokenizer's vocabulary.
    Embedding(size=256, n_features=tokenizer.n_features),
    # Single gated recurrent layer; seq_output=False passes only the final state forward.
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
    # Sigmoid output unit giving the probability of positive sentiment.
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

# Binary cross-entropy loss with Adadelta updates; train for 10 passes over the data.
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trX, trY, n_epochs=10)
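
Training the gated recurrent network on 25,000 reviews can take a while, so it may be worth persisting the fitted model before moving on. A minimal sketch, assuming the save/load helpers from passage.utils; the filename is arbitrary:

In [ ]:
# Persist the trained model so prediction can be rerun without retraining.
from passage.utils import save, load

save(model, 'gru_sentiment.pkl')
# model = load('gru_sentiment.pkl')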

In [ ]:
te_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()

# Write the Kaggle submission: one predicted sentiment probability per review id.
pd.DataFrame({'id': ids, 'sentiment': pr_teX}, columns=['id', 'sentiment']).to_csv('submission.csv', index=False)
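
A final sanity check on the written file, reading back the header and the first few predictions:

In [ ]:
# Confirm the submission has the expected 'id' and 'sentiment' columns.
print(pd.read_csv('submission.csv').head())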