In [2]:
import sys
In [3]:
sys.path.append("/Users/dikien/Downloads/Passage/")
In [41]:
import numpy as np
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
import re
from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer
from nltk.stem.porter import PorterStemmer
In [5]:
# Load the training and test files
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv')
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv')
In [6]:
# we don't need the id columns; keep the test ids for the submission file
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
In [7]:
train.head(3)
Out[7]:
In [9]:
test.head(3)
Out[9]:
In [18]:
# some rows have no product_description
print train.isnull().any(0)
print "=" * 100
print test.isnull().any(0)
In [19]:
# fill missing values with ""
train = train.fillna("")
test = test.fillna("")
In [20]:
# verify that no missing values remain after fillna
print train.isnull().any(0)
print "=" * 100
print test.isnull().any(0)
In [42]:
# Remove HTML, drop non-alphanumeric characters, and prefix query and title
# tokens so they count as distinct features (accounted for in the stopwords tweak).
stemmer = PorterStemmer()

## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                #final_doc.append(wordnet.lemmatize(word))  # note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs
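A quick sanity check of the helper (illustrative cell, not part of the original run; the toy tokens are made up):
In [ ]:
# stemPorter expects a list of tokenized documents and stems every word.
docs = [["running", "ponies"], ["cats", "caresses"]]
print stemmerUtility.stemPorter(docs)
# roughly [['run', 'poni'], ['cat', 'caress']]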
In [ ]:
s_data = []
for i in range(len(train)):  # train.id was dropped above, so iterate over the row count
    # prefix query tokens with "q" and title tokens with "z" so they stay distinct
    s = (" ").join(["q" + z for z in BeautifulSoup(train["query"][i]).get_text(" ").split(" ")]) + " " + \
        (" ").join(["z" + z for z in BeautifulSoup(train.product_title[i]).get_text(" ").split(" ")]) + " " + \
        BeautifulSoup(train.product_description[i]).get_text(" ")
    s = re.sub("[^a-zA-Z0-9]", " ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s_data.append(s)
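To make the prefixing scheme concrete, a toy example (hypothetical query/title values, not from the data):
In [ ]:
# A query token "led" becomes "qled" and a title token "led" becomes "zled",
# so the same word is counted separately per field.
q = "led tv"
t = "samsung led tv"
print (" ").join(["q" + z for z in q.split(" ")]) + " " + (" ").join(["z" + z for z in t.split(" ")])
# qled qtv zsamsung zled ztv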
In [34]:
def clean1(text):
    # strip markup with lxml, lowercase, and trim surrounding whitespace
    return html.fromstring(text).text_content().lower().strip()
In [49]:
print clean1("test,, ddd </html> <script>")
sample = clean1("test,, ddd </html> <script>")  # use 'sample', not 'test', to avoid clobbering the test DataFrame
sample = re.sub("[^a-zA-Z0-9]", " ", sample)
print sample
In [51]:
sample.split()
Out[51]:
In [54]:
print clean1("test,, ddd </html> <script>")
sample = clean1("test,, ddd </html> <script>")
sample = re.sub("[^a-zA-Z0-9]", " ", sample)
print sample
sample = (" ").join([stemmer.stem(z) for z in sample.split()])
print sample
In [33]:
BeautifulSoup("test,, ddd </html> <script>").get_text(" ")
Out[33]:
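The sentiment cells below call a clean() helper that is never defined in this notebook; a minimal version, wrapping clean1 over a list of texts in the style of the Passage sentiment example, would be:
In [ ]:
# Minimal clean() used by the cells below: apply clean1 to each text.
def clean(texts):
    return [clean1(text) for text in texts]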
In [6]:
tr_data = pd.read_csv('/Users/dikien/Downloads/Bag of Words Meets Bags of Popcorn/labeledTrainData.tsv', delimiter='\t')
trX = clean(tr_data['review'].values)
trY = tr_data['sentiment'].values
print("Training data loaded and cleaned.")
In [11]:
print len(trX)
print trY[0]
In [12]:
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)
print("Training data tokenized.")
In [13]:
tokenizer.n_features
Out[13]:
In [ ]:
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]
model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trX, trY, n_epochs=10)
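Once the fit succeeds it can be worth persisting the model; the Passage README shows a save/load pair in passage.utils (treat the exact API as an assumption if your Passage version differs, and the filename is arbitrary):
In [ ]:
# Sketch: pickle the trained model with Passage's own helpers.
from passage.utils import save, load
save(model, 'rnn_sentiment.pkl')
model = load('rnn_sentiment.pkl')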
In [ ]:
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = clean(te_data['review'].values)
teX = tokenizer.transform(teX)
pr_teX = model.predict(teX).flatten()
# write the Kaggle submission file: one (id, predicted sentiment) row per review
pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv('submission.csv', index=False, header=["id", "sentiment"])
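The competition is scored by ROC AUC, so a held-out split gives a sanity check before submitting. A sketch (the 80/20 split, the seed, and the fresh layer stack are arbitrary choices, not from the original notebook):
In [ ]:
# Sketch: estimate ROC AUC on a 20% holdout before submitting.
from sklearn.metrics import roc_auc_score

perm = np.random.RandomState(42).permutation(len(trX))
trX_s = [trX[j] for j in perm]
trY_s = trY[perm]
n_valid = len(trX_s) // 5

# fresh layers so the validation model does not reuse the trained weights
va_layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]
va_model = RNN(layers=va_layers, cost='bce', updater=Adadelta(lr=0.5))
va_model.fit(trX_s[:-n_valid], trY_s[:-n_valid], n_epochs=10)
print roc_auc_score(trY_s[-n_valid:], va_model.predict(trX_s[-n_valid:]).flatten())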