In [2]:
import sys

In [3]:
# Make the local Passage checkout importable.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
PASSAGE_DIR = "/Users/dikien/Downloads/Passage/"
if PASSAGE_DIR not in sys.path:
    sys.path.append(PASSAGE_DIR)

In [41]:
import numpy as np
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
import re
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer
from nltk.stem.porter import *

In [ ]:


In [5]:
# Read the training and test CSV files into DataFrames
train_csv = '/Users/dikien/Downloads/Search Results Relevance/train.csv'
test_csv = '/Users/dikien/Downloads/Search Results Relevance/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [6]:
# Keep the test ids aside as integers, then remove the id column from both
# frames: it is not needed from here on.
idx = test['id'].values.astype(int)
train, test = train.drop('id', axis=1), test.drop('id', axis=1)

In [7]:
# Preview the first three training rows
train.head(n=3)


Out[7]:
query product_title product_description median_relevance relevance_variance
0 bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a hea... 1 0.000
1 led christmas lights Set of 10 Battery Operated Multi LED Train Chr... Set of 10 Battery Operated Train Christmas Lig... 4 0.000
2 projector ViewSonic Pro8200 DLP Multimedia Projector NaN 4 0.471

In [9]:
# Preview the first three test rows
test.head(n=3)


Out[9]:
query product_title product_description
0 electric griddle Star-Max 48 in Electric Griddle NaN
1 phillips coffee maker Philips SENSEO HD7810 WHITE Single Serve Pod C... NaN
2 san francisco 49ers 2013 San Francisco 49ers Clock A 2013 San Francisco 49ers clock is the ultima...

In [18]:
# Some rows have no product_description: report, per column, whether any value is missing
print(train.isnull().any(axis=0))
print("=" * 100)
print(test.isnull().any(axis=0))


query                  False
product_title          False
product_description     True
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description     True
dtype: bool

In [19]:
# Fill missing (NaN) values with empty strings in both frames
train, test = train.fillna(""), test.fillna("")

In [20]:
# Repeat the per-column null report after the fill to confirm nothing is missing any more
print(train.isnull().any(axis=0))
print("=" * 100)
print(test.isnull().any(axis=0))


query                  False
product_title          False
product_description    False
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description    False
dtype: bool

In [42]:
# Shared Porter stemmer instance used by the text-preprocessing cells further down.
# Preprocessing plan for those cells: remove HTML, replace anything that is not a
# letter or digit, and prefix query and title tokens so they count as features
# distinct from description words (accounted for in stopwords tweak).
stemmer = PorterStemmer()

## Stemming helpers
class stemmerUtility(object):
    """Namespace for stemming helpers."""

    @staticmethod
    def stemPorter(review_text):
        """Porter-stem every word of every document.

        review_text: iterable of documents, each itself an iterable of words.
        Returns a list holding one list of stemmed words per document, in
        input order. A new PorterStemmer is created on each call.
        """
        porter = PorterStemmer()
        # A lemmatizer (e.g. wordnet.lemmatize, which can also take a part of
        # speech argument) could be used here in place of porter.stem.
        return [[porter.stem(word) for word in doc] for doc in review_text]

In [ ]:
# Build one cleaned, stemmed text string per training row.
# Per row: strip HTML from query, title and description; prefix every query
# token with "q" and every title token with "z" so they stay distinct from
# description words; replace every non-alphanumeric character with a space;
# Porter-stem each space-separated token.
s_data = []  # was never initialised, so the first append raised NameError

# The id column was dropped from `train` earlier in the notebook, so the row
# count is taken from the frame itself instead of `train.id`.
# NOTE(review): the [i] lookups are label-based and assume the default 0..n-1 index.
for i in range(len(train)):
    query_tokens = BeautifulSoup(train["query"][i]).get_text(" ").split(" ")
    title_tokens = BeautifulSoup(train.product_title[i]).get_text(" ").split(" ")
    description = BeautifulSoup(train.product_description[i]).get_text(" ")

    s = (" ").join(["q" + z for z in query_tokens]) + " " + (" ").join(["z" + z for z in title_tokens]) + " " + description
    s = re.sub("[^a-zA-Z0-9]", " ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s_data.append(s)

In [34]:
def clean1(text):
    """Strip HTML markup from `text` and return it lower-cased and trimmed.

    Empty or whitespace-only input returns "" instead of being passed to
    lxml: html.fromstring raises ParserError ("Document is empty") for such
    input, and this notebook fills missing descriptions with "".
    """
    if not text or not text.strip():
        return ""
    return html.fromstring(text).text_content().lower().strip()

In [49]:
print clean1("test,, ddd </html> <script>")
test = clean1("test,, ddd </html> <script>")
test=re.sub("[^a-zA-Z0-9]"," ", test)
print test


test,, ddd
test   ddd

In [51]:
test.split()


Out[51]:
['test', 'ddd']

In [54]:
# Scratch check of the full pipeline on a tiny HTML fragment:
# HTML stripping -> non-alphanumeric removal -> Porter stemming.
# The sample lives in its own variable so that the `test` DataFrame is not
# overwritten by a string.
sample_text = clean1("test,, ddd </html> <script>")
print(sample_text)
sample_text = re.sub("[^a-zA-Z0-9]", " ", sample_text)
print(sample_text)
sample_text = (" ").join([stemmer.stem(z) for z in sample_text.split()])
print(sample_text)


test,, ddd
test   ddd
test ddd

In [33]:
# Same fragment through BeautifulSoup, joining the text pieces with a single space
BeautifulSoup(markup="test,, ddd </html> <script>").get_text(separator=" ")


Out[33]:
u'test,, ddd   '

In [6]:
# Load the labelled review data (tab-separated) and clean every review text.
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t')
# No `clean` helper is defined anywhere in this notebook, so on a fresh kernel
# the original call raised NameError. clean1 (defined above) is applied to each
# review instead.
# NOTE(review): confirm this matches the helper the cell was originally run with.
trX = list(map(clean1, tr_data['review'].values))
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")


Training data loaded and cleaned.

In [11]:
# Sanity check: number of cleaned reviews and the label of the first one
print(len(trX))
print(trY[0])


25000
1

In [12]:
# Fit the Passage tokenizer on the cleaned training text and convert that text in the same call.
# NOTE(review): min_df / max_features presumably bound the vocabulary (minimum
# document frequency and size cap) — confirm against the Passage Tokenizer.
# NOTE(review): trX is rebound to the tokenizer output, so re-running this cell
# alone would feed already-tokenized data back in; re-run the loading cell first.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")


Training data tokenized.

In [13]:
# Feature count of the fitted tokenizer; the Embedding layer below receives it as n_features
tokenizer.n_features


Out[13]:
18592

In [ ]:
# Layer stack: Embedding -> GatedRecurrent -> Dense with one sigmoid-activated unit
embedding_layer = Embedding(size=256, n_features=tokenizer.n_features)
recurrent_layer = GatedRecurrent(
    size=512,
    activation='tanh',
    gate_activation='steeper_sigmoid',
    init='orthogonal',
    seq_output=False,
    p_drop=0.75,
)
output_layer = Dense(size=1, activation='sigmoid', init='orthogonal')
layers = [embedding_layer, recurrent_layer, output_layer]

# Assemble the RNN with the 'bce' cost and an Adadelta updater, then train for 10 epochs
updater = Adadelta(lr=0.5)
model = RNN(layers=layers, cost='bce', updater=updater)
model.fit(trX, trY, n_epochs=10)

In [ ]:
# Score the test reviews with the trained model and write the submission file.
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
# No `clean` helper is defined anywhere in this notebook, so on a fresh kernel
# the original call raised NameError. clean1 is applied to each review instead.
# NOTE(review): confirm this matches the helper used when the model was trained.
teX = list(map(clean1, te_data['review'].values))
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()

# Build the frame column by column so ids and predictions keep their own
# dtypes; stacking both in a single numpy array coerces them to one common type.
# `columns=` fixes the column order, which also yields the id,sentiment header.
submission = pd.DataFrame({"id": ids, "sentiment": pr_teX}, columns=["id", "sentiment"])
submission.to_csv('submission.csv', index=False)