In [36]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition, pipeline, metrics, grid_search
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
In [3]:
# Load the training and test files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# we don't need the id columns for modelling, but keep the test ids for the submission file
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
# fill missing values with a single space
train = train.fillna(" ")
test = test.fillna(" ")
# Remove HTML, drop non-alphanumeric characters, and stem every token.
# Query and title tokens get a "q"/"t" prefix so they become distinct features
# in the counts (accounted for in the stopwords tweak).
stemmer = PorterStemmer()
## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                # final_doc.append(wordnet.lemmatize(word))  # note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs

def clean(text):
    # strip HTML, keep only alphanumeric characters, stem every token
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    text = " ".join([stemmer.stem(z) for z in text.split()])
    return text

def cleanq(text):
    # same as clean(), but prefix every stemmed token with "q" (query tokens)
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    text = " ".join(["q" + stemmer.stem(z) for z in text.split()])
    return text

def cleant(text):
    # same as clean(), but prefix every stemmed token with "t" (title tokens)
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    text = " ".join(["t" + stemmer.stem(z) for z in text.split()])
    return text
# clean data
train['query'] = train['query'].apply(func=cleanq)
train['product_title'] = train['product_title'].apply(func=cleant)
train['product_description'] = train['product_description'].apply(func=clean)
test['query'] = test['query'].apply(func=cleanq)
test['product_title'] = test['product_title'].apply(func=cleant)
test['product_description'] = test['product_description'].apply(func=clean)
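As a quick sanity check, this is how the cleaning functions behave on a small made-up string (the string is purely illustrative, not from the competition data):

sample = "<b>Running shoes</b> for the gym!"
print(clean(sample))   # HTML stripped, punctuation removed, tokens stemmed
print(cleanq(sample))  # same tokens, each prefixed with "q"
print(cleant(sample))  # same tokens, each prefixed with "t"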
In [4]:
def merge_rows(x):
    # concatenate the cleaned query, title and description into one document per row
    return x['query'] + ' ' + x['product_title'] + ' ' + x['product_description']

trainX = train[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
trainY = train["median_relevance"]
testX = test[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
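The merged documents are plain strings, one per row; a quick look at the result (output depends on the data):

print(trainX.shape)          # one merged text string per training row
print(trainX.iloc[0][:80])   # first 80 characters of the first merged document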
In [5]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3, max_features=None, max_df=500,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
# Fit TFIDF
tfv.fit(trainX)
trainX = tfv.transform(trainX)
testX = tfv.transform(testX)
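The size of the fitted vocabulary determines the width of the matrices fed to the network below; presumably the 10158 used for the Embedding layer came from an inspection along these lines (the exact number depends on the data and the vectorizer settings):

print('vocabulary size:', len(tfv.vocabulary_))
print('trainX shape:', trainX.shape)   # (n_train_docs, n_features), scipy sparse matrix
print('testX shape:', testX.shape)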
In [50]:
# median_relevance takes values 1-4; shift to 0-3 and one-hot encode into 4 columns
y_train = np_utils.to_categorical(trainY.values - 1, 4)
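For reference, a minimal illustration of what this encoding does on a made-up label array:

example_labels = np.array([1, 4, 2])
print(np_utils.to_categorical(example_labels - 1, 4))
# one one-hot row per label: [1,0,0,0], [0,0,0,1], [0,1,0,0]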
In [72]:
max_features = 20000
maxlen = 100  # cut texts after this number of words (among the top max_features most common words)
batch_size = 10
nb_epoch = 5

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(trainX.toarray(), maxlen=maxlen)
X_test = sequence.pad_sequences(testX.toarray(), maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()
model.add(Embedding(10158, 256))   # input dimension 10158, 256-dimensional embeddings
model.add(LSTM(256, 128))          # old Keras 0.x signature: LSTM(input_dim, output_dim); try using a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(128, 4))           # 4 output units, one per relevance class
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")

print('Train...')
mm = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True)
score = model.evaluate(X_train, y_train, batch_size=batch_size)
print('Train score:', score)
classes = model.predict_classes(X_train, batch_size=batch_size)
acc = np_utils.accuracy(classes, y_train)
print('Train accuracy:', acc)
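The comments above suggest trying a GRU and a different optimizer. A minimal sketch of both changes, assuming the same old Keras 0.x layer signatures used in this cell (GRU takes the same (input_dim, output_dim) arguments as LSTM here):

model_gru = Sequential()
model_gru.add(Embedding(10158, 256))
model_gru.add(GRU(256, 128))       # swap the LSTM for a GRU
model_gru.add(Dropout(0.5))
model_gru.add(Dense(128, 4))
model_gru.add(Activation('sigmoid'))
# e.g. RMSprop with an explicit learning rate instead of 'adam'
model_gru.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), class_mode="binary")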
In [73]:
# mm is the History callback returned by fit(); it records per-epoch loss and accuracy
df = pd.DataFrame({'epoch': mm.epoch, 'loss': mm.loss, 'accuracy': mm.accuracy})
df.index = df['epoch']
In [79]:
df['accuracy'].plot()
Out[79]:
In [80]:
df['loss'].plot()
Out[80]:
In [97]:
prediction = model.predict_classes(X_test, batch_size=batch_size)
In [100]:
# collapse the per-class outputs into a single class index per row (0-3)
prediction = np_utils.probas_to_classes(prediction)
In [103]:
# Create the submission file; shift the class indices back to the original 1-4 relevance scale
submission = pd.DataFrame({"id": idx, "prediction": prediction + 1})
submission.to_csv("submission.csv", index=False)
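A quick sanity check on the submission file before uploading (purely illustrative; the column names must stay id and prediction):

print(submission.shape)
print(submission['prediction'].value_counts())   # should only contain the values 1-4
print(submission.head())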