In [1]:
import sys

In [2]:
sys.path.append("/Users/dikien/Downloads/Passage/")

In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer
from nltk.stem.porter import *

In [4]:
# Load the training file
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv')
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv')

In [5]:
# we dont need ID columns
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [6]:
train.head(3)


Out[6]:
query product_title product_description median_relevance relevance_variance
0 bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a hea... 1 0.000
1 led christmas lights Set of 10 Battery Operated Multi LED Train Chr... Set of 10 Battery Operated Train Christmas Lig... 4 0.000
2 projector ViewSonic Pro8200 DLP Multimedia Projector NaN 4 0.471

In [7]:
test.head(3)


Out[7]:
query product_title product_description
0 electric griddle Star-Max 48 in Electric Griddle NaN
1 phillips coffee maker Philips SENSEO HD7810 WHITE Single Serve Pod C... NaN
2 san francisco 49ers 2013 San Francisco 49ers Clock A 2013 San Francisco 49ers clock is the ultima...

In [8]:
# product_description은 없는 것이 있음
print train.isnull().any(0)
print "=" * 100
print test.isnull().any(0)


query                  False
product_title          False
product_description     True
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description     True
dtype: bool

In [9]:
# non values를 " "으로 채우기
train = train.fillna(" ")
test  = test.fillna(" ")

In [10]:
# product_description은 없는 것이 있음
print train.isnull().any(0)
print "=" * 100
print test.isnull().any(0)


query                  False
product_title          False
product_description    False
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description    False
dtype: bool

In [11]:
#remove html, remove non text or numeric, make query and title unique features for counts using prefix (accounted for in stopwords tweak)
stemmer = PorterStemmer()

## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also takes part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs

def clean(text):
#     text = html.fromstring(text).text_content().lower().strip()
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join([stemmer.stem(z) for z in text.split()])
    return text

In [12]:
print clean("jongwon . , kim ")


jongwon kim

In [13]:
# clean data
train['query'] = train['query'].apply(func=clean)
train['product_title'] = train['product_title'].apply(func=clean)
train['product_description'] = train['product_description'].apply(func=clean)

test['query'] = test['query'].apply(func=clean)
test['product_title'] = test['product_title'].apply(func=clean)
test['product_description'] = test['product_description'].apply(func=clean)


/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65527" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januarya/14146012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

In [14]:
def merge_rows(x):
    query = x[0]
    product_title = x[1]
    product_description  = x[2]
    return query + ' ' + product_title + ' ' + product_description

In [15]:
trainX = train[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
testX = test[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)

In [16]:
trainY = train["median_relevance"]

In [17]:
tokenizer = Tokenizer(min_df=10, max_features=100000)
trainX = tokenizer.fit_transform(trainX)

print("Training data tokenized.")


Training data tokenized.

In [18]:
tokenizer.n_features


Out[18]:
4354

In [19]:
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trainX, trainY, n_epochs=1)


Epoch 0 Seen 10058 samples Avg cost -842.1429 Time elapsed 410 seconds
/Users/dikien/anaconda/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility
  from scan_perform.scan_perform import *
Out[19]:
[array(0.7019436132583434),
 array(0.5191049687834116),
 array(0.29999605990878175),
 array(-0.04111086707732512),
 array(-1.0811351469475654),
 array(-20.40324655473114),
 array(-39.811147414137515),
 array(-60.45484734692913),
 array(-73.62591611849797),
 array(-82.85591777530604),
 array(-96.85875457925671),
 array(-103.88725965248537),
 array(-112.82843714719404),
 array(-107.05845253427695),
 array(-126.97924800622309),
 array(-139.61813563132978),
 array(-161.1675951836773),
 array(-171.79844654248967),
 array(-190.4508372990546),
 array(-187.95761881580418),
 array(-184.3901830466014),
 array(-202.17437268631667),
 array(-222.62368797243573),
 array(-216.37605081942294),
 array(-239.14372925180808),
 array(-246.33164539030076),
 array(-258.456589615913),
 array(-257.84030395989174),
 array(-265.5873604341535),
 array(-307.0250508323931),
 array(-312.59875878435895),
 array(-326.0943530150654),
 array(-350.9150553183064),
 array(-349.50489736275216),
 array(-394.25019074602704),
 array(-395.59523703966175),
 array(-372.9449799812586),
 array(-391.6042066923075),
 array(-386.7837677235303),
 array(-449.57913816298606),
 array(-445.74983823802063),
 array(-423.02496997401045),
 array(-442.52514561721415),
 array(-422.8428549128814),
 array(-470.0081719573781),
 array(-452.38266774384),
 array(-462.3578382878781),
 array(-492.43353334526375),
 array(-551.1819638230415),
 array(-515.0327412728755),
 array(-490.08661656919134),
 array(-550.5655522551784),
 array(-561.9678920155902),
 array(-573.3709893606693),
 array(-584.7745565723277),
 array(-544.8528599854922),
 array(-606.6518243805464),
 array(-618.0584692387246),
 array(-546.0929372640926),
 array(-652.1359905621199),
 array(-603.6267255672523),
 array(-692.348076598509),
 array(-673.5825436510574),
 array(-657.7753522529819),
 array(-640.66662325713),
 array(-673.7752158705628),
 array(-703.1944519167421),
 array(-752.7836857360646),
 array(-725.7266614066551),
 array(-706.8495371601231),
 array(-706.8882739285411),
 array(-747.4670171962605),
 array(-711.4335506857149),
 array(-736.6732837395127),
 array(-837.1876215062525),
 array(-881.9829437107819),
 array(-818.3519601187327),
 array(-790.5212723755815),
 array(-845.7709749770961),
 array(-828.4887026867088),
 array(-937.4262637475692),
 array(-909.4619999283681),
 array(-879.8792140748319),
 array(-908.9352556857299),
 array(-981.3242587208478),
 array(-926.3488061762941),
 array(-943.8911344165812),
 array(-974.3105006052916),
 array(-877.3314070495243),
 array(-854.4279059522335),
 array(-954.7794340661408),
 array(-998.5591413174944),
 array(-1083.5780259335625),
 array(-1042.539633498421),
 array(-1006.5016463830912),
 array(-955.0750885171528),
 array(-999.6276696019424),
 array(-1144.1060126449747),
 array(-1078.6522224496848),
 array(-1104.5377918485651),
 array(-1021.4355176359201),
 array(-1134.4440142582105),
 array(-1086.790428665959),
 array(-1135.099129776492),
 array(-1131.364030224608),
 array(-1226.8695093996805),
 array(-1076.9786658472062),
 array(-1055.449467357713),
 array(-1167.1744592579341),
 array(-1257.796754404289),
 array(-1189.9086169124362),
 array(-1038.6378433272969),
 array(-1243.2300414671322),
 array(-1238.3237416389545),
 array(-1241.3023216371882),
 array(-1235.6555002055186),
 array(-1119.3332959003249),
 array(-1273.4362622923559),
 array(-1207.0090985245192),
 array(-1347.27934753888),
 array(-1350.568917824928),
 array(-1504.035400929077),
 array(-1277.2967019698538),
 array(-1314.6170974087554),
 array(-1289.0410898861855),
 array(-1317.5087419497906),
 array(-1291.0723393695343),
 array(-1412.4759308633534),
 array(-1349.1386723946198),
 array(-1397.358716233288),
 array(-1455.9676802366898),
 array(-1333.4319260714942),
 array(-1430.1591624992957),
 array(-1528.8296877907753),
 array(-1668.8202622812614),
 array(-1554.5410235654213),
 array(-1317.4078180237448),
 array(-1607.5489574719084),
 array(-1407.7132822281985),
 array(-1529.6475799801617),
 array(-1571.7932024131026),
 array(-1676.7164948676623),
 array(-1585.5606766712458),
 array(-1471.091518881165),
 array(-1301.1575196046542),
 array(-1574.8631194979262),
 array(-1425.1666855085525),
 array(-1595.8662058723946),
 array(-1541.7589333544051),
 array(-1617.5157206869662),
 array(-1584.5397335300383),
 array(-1701.3386897012613),
 array(-1695.3616561477488),
 array(-1819.2940293366692),
 array(-1662.7866486385967),
 array(-1741.9976798336397),
 array(-1662.0787354129543),
 array(-1557.3322405541194)]

In [32]:
print model.layers[0].__dict__


{'settings': {'init': 'uniform', 'weights': None, 'n_features': 4354, 'size': 256}, 'wv': <TensorType(float64, matrix)>, 'init': <function uniform at 0x109a86a28>, 'params': [<TensorType(float64, matrix)>], 'input': <TensorType(int32, matrix)>, 'n_features': 4354, 'size': 256}

In [33]:
testX = tokenizer.transform(testX)
prediction = model.predict(testX)

In [43]:
prediction = prediction.astype('int32')

In [44]:
# Create your first submission file
submission = pd.DataFrame({"id": idx, "prediction": prediction.ravel()})
submission.to_csv("submission.csv", index=False)