notebook.community

Edit and run



In [2]:

    
import sys



In [3]:

    
# NOTE(review): hard-coded, machine-specific directory; presumably added so the
# `passage` imports in a later cell resolve — adjust for your environment.
sys.path.append("/Users/dikien/Downloads/Passage/")



In [41]:

    
import numpy as np
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
import re
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer
from nltk.stem.porter import *



In [ ]:



In [5]:

    
# Load the training and test sets into DataFrames.
# NOTE(review): absolute, machine-specific paths — adjust for your environment.
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv')
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv')



In [6]:

    
# Keep the test ids (cast to int) in `idx`, then drop the 'id' column from both frames.
# NOTE(review): this cell is not re-runnable — once 'id' is dropped, `test.id`
# and the drop calls fail on a second execution.
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)



In [7]:

    
# Preview the first three training rows.
train.head(3)









    Out[7]:






  
    
      
      query
      product_title
      product_description
      median_relevance
      relevance_variance
    
  
  
    
      0
      bridal shower decorations
      Accent Pillow with Heart Design - Red/Black
      Red satin accent pillow embroidered with a hea...
      1
      0.000
    
    
      1
      led christmas lights
      Set of 10 Battery Operated Multi LED Train Chr...
      Set of 10 Battery Operated Train Christmas Lig...
      4
      0.000
    
    
      2
      projector
      ViewSonic Pro8200 DLP Multimedia Projector
      NaN
      4
      0.471



In [9]:

    
# Preview the first three test rows.
test.head(3)









    Out[9]:






  
    
      
      query
      product_title
      product_description
    
  
  
    
      0
      electric griddle
      Star-Max 48 in Electric Griddle
      NaN
    
    
      1
      phillips coffee maker
      Philips SENSEO HD7810 WHITE Single Serve Pod C...
      NaN
    
    
      2
      san francisco 49ers
      2013 San Francisco 49ers Clock
      A 2013 San Francisco 49ers clock is the ultima...



In [18]:

    
# Report, per column, whether any value is missing in train and in test.
# Some rows have no product_description.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train.isnull().any(0))
print("=" * 100)
print(test.isnull().any(0))









    



query                  False
product_title          False
product_description     True
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description     True
dtype: bool



In [19]:

    
# Fill missing values with empty strings.
train = train.fillna("")
test  = test.fillna("")



In [20]:

    
# Re-check after fillna: every column should now report False (no missing values).
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train.isnull().any(0))
print("=" * 100)
print(test.isnull().any(0))









    



query                  False
product_title          False
product_description    False
median_relevance       False
relevance_variance     False
dtype: bool
====================================================================================================
query                  False
product_title          False
product_description    False
dtype: bool



In [42]:

    
# Text-cleaning plan: remove HTML, remove anything that is not a letter or digit,
# and prefix query/title tokens so they count as distinct features
# (accounted for in the stopwords tweak).
# Module-level Porter stemmer instance reused by the cleaning cells further down.
stemmer = PorterStemmer()

## Stemming functionality
class stemmerUtility(object):
    """Helpers for stemming tokenised documents."""

    @staticmethod
    def stemPorter(review_text):
        """Porter-stem every word of every document.

        `review_text` is iterated as a sequence of documents and each document
        as a sequence of words. Returns a list holding one list of stemmed
        words per document, in the original order.
        """
        # One stemmer per call, shared by all documents of that call.
        stem = PorterStemmer().stem
        return [[stem(token) for token in document] for document in review_text]



In [ ]:

    
# Build one cleaned, stemmed string per training row. Query tokens are prefixed
# with "q" and title tokens with "z" so the same word is a distinct feature
# depending on the field it came from; the description is appended unprefixed.
s_data = []  # must exist before the loop appends to it
# The 'id' column was dropped from `train` earlier, so `train.id` no longer
# exists; iterate over the number of rows instead.
for i in range(len(train)):
    query_tokens = BeautifulSoup(train["query"][i]).get_text(" ").split(" ")
    title_tokens = BeautifulSoup(train.product_title[i]).get_text(" ").split(" ")
    description = BeautifulSoup(train.product_description[i]).get_text(" ")
    s = " ".join([
        " ".join(["q" + z for z in query_tokens]),
        " ".join(["z" + z for z in title_tokens]),
        description,
    ])
    # Replace every character that is not a letter or digit with a space.
    s = re.sub("[^a-zA-Z0-9]", " ", s)
    s = " ".join([stemmer.stem(z) for z in s.split(" ")])
    s_data.append(s)



In [34]:

    
def clean1(text):
    """Return the text content of an HTML fragment, lower-cased and stripped.

    Empty or whitespace-only input returns "" directly: ``html.fromstring``
    raises a ParserError ("Document is empty") for such input, and this
    notebook produces empty strings by filling missing values with "".
    """
    if not text or not text.strip():
        return ""
    return html.fromstring(text).text_content().lower().strip()



In [49]:

    
# Try clean1 and the non-alphanumeric filter on a small HTML snippet.
# NOTE(review): this rebinds `test`, replacing the test DataFrame loaded earlier.
# The name is kept because the next cell calls test.split() on this string.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(clean1("test,, ddd </html> <script>"))
test = clean1("test,, ddd </html> <script>")
test = re.sub("[^a-zA-Z0-9]", " ", test)
print(test)









    



test,, ddd
test   ddd



In [51]:

    
# `test` here is the cleaned sample string from the previous cell (not the
# DataFrame); split() with no argument collapses runs of whitespace.
test.split()









    Out[51]:





['test', 'ddd']



In [54]:

    
# Same snippet as before, now followed by Porter stemming of each token.
# The scratch string lives in `sample` so this cell does not rebind `test`.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(clean1("test,, ddd </html> <script>"))
sample = clean1("test,, ddd </html> <script>")
sample = re.sub("[^a-zA-Z0-9]", " ", sample)
print(sample)
sample = " ".join([stemmer.stem(z) for z in sample.split()])
print(sample)









    



test,, ddd
test   ddd
test ddd



In [33]:

    
# Same snippet through BeautifulSoup for comparison with clean1; get_text(" ")
# is given a single space as its separator argument.
BeautifulSoup("test,, ddd </html> <script>").get_text(" ")









    Out[33]:





u'test,, ddd   '



In [6]:

    
# Load the labelled movie-review data (tab-separated) and split it into
# cleaned review texts (trX) and sentiment labels (trY).
# NOTE(review): `clean` is not defined in any visible cell (only `clean1` is);
# on a fresh kernel this raises NameError unless it is defined elsewhere — confirm.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t') 
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")









    



Training data loaded and cleaned.



In [11]:

    
# Show the number of training reviews and the first label.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# and matches the print(...) calls used in the neighbouring cells.
print(len(trX))
print(trY[0])



In [12]:

    
# Fit the Passage Tokenizer on the cleaned reviews and replace trX with the
# transformed result.
# NOTE(review): trX is overwritten, so re-running this cell would feed the
# already-transformed output back into fit_transform — run once per kernel.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)

print("Training data tokenized.")









    



Training data tokenized.



In [13]:

    
# Feature count reported by the fitted tokenizer; used as n_features of the
# Embedding layer in the model cell.
tokenizer.n_features









    Out[13]:





18592



In [ ]:

    
# Three-layer Passage model: Embedding -> GatedRecurrent -> Dense with a single
# sigmoid output, matching the single sentiment label per review in trY.
# NOTE(review): p_drop=0.75 looks like a high dropout rate — confirm it is intentional.
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

# cost='bce' is presumably binary cross-entropy — confirm against the Passage docs.
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
# Train for 10 epochs on the tokenized reviews and their labels.
model.fit(trX, trY, n_epochs=10)



In [ ]:

    
# Score the test reviews with the already-fitted tokenizer and model, then
# write the predictions to submission.csv.
# NOTE(review): `clean` is not defined in any visible cell — confirm it exists.
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)  # transform only: reuse the vocabulary fitted on the training data
pr_teX = model.predict(teX).flatten()

# Build the frame from named columns so each column keeps its own dtype.
# Stacking ids and predictions into one ndarray would coerce both to a single
# dtype (the float predictions become strings when ids are strings).
submission = pd.DataFrame({"id": ids, "sentiment": pr_teX}, columns=["id", "sentiment"])
submission.to_csv('submission.csv', index=False)

	query	product_title	product_description	median_relevance	relevance_variance
0	bridal shower decorations	Accent Pillow with Heart Design - Red/Black	Red satin accent pillow embroidered with a hea...	1	0.000
1	led christmas lights	Set of 10 Battery Operated Multi LED Train Chr...	Set of 10 Battery Operated Train Christmas Lig...	4	0.000
2	projector	ViewSonic Pro8200 DLP Multimedia Projector	NaN	4	0.471

	query	product_title	product_description
0	electric griddle	Star-Max 48 in Electric Griddle	NaN
1	phillips coffee maker	Philips SENSEO HD7810 WHITE Single Serve Pod C...	NaN
2	san francisco 49ers	2013 San Francisco 49ers Clock	A 2013 San Francisco 49ers clock is the ultima...