In [36]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition, pipeline, metrics, grid_search

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

# needed for the pandas .plot() calls below
%matplotlib inline

In [3]:
# Load the training file
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# we don't need the ID columns as features; keep the test ids for the submission file
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# fill missing values with a single space
train = train.fillna(" ")
test  = test.fillna(" ")

# Remove HTML and non-alphanumeric characters, and prefix query/title tokens so they become
# distinct features for counting (accounted for in the stop-word tweak)
stemmer = PorterStemmer()

## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs

def clean(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join([stemmer.stem(z) for z in text.split()])
    return text

def cleanq(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join(["q" + stemmer.stem(z) for z in text.split()])
    return text

def cleant(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join(["t" + stemmer.stem(z) for z in text.split()])
    return text
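
# Illustrative only (not part of the original notebook): the prefixes turn the same
# word into different features depending on the field it came from, e.g. (roughly)
#   clean("<b>red</b> dolls")  -> "red doll"
#   cleanq("red dolls")        -> "qred qdoll"
#   cleant("red dolls")        -> "tred tdoll"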

# clean data
train['query'] = train['query'].apply(func=cleanq)
train['product_title'] = train['product_title'].apply(func=cleant)
train['product_description'] = train['product_description'].apply(func=clean)

test['query'] = test['query'].apply(func=cleanq)
test['product_title'] = test['product_title'].apply(func=cleant)
test['product_description'] = test['product_description'].apply(func=clean)


/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
(the same UserWarning is repeated for several other image URLs found in product_description)

In [4]:
def merge_rows(x):
    # concatenate query, title, and description into a single text field
    return x['query'] + ' ' + x['product_title'] + ' ' + x['product_description']

trainX = train[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
trainY = train["median_relevance"]

testX = test[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
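
A quick spot check (not part of the original notebook) to confirm the merged text keeps the q/t prefixes:

In [ ]:
print trainX.shape, testX.shape
print trainX.iloc[0][:200]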

In [5]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3,  max_features=None, max_df=500,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

# Fit TFIDF
tfv.fit(trainX)
trainX =  tfv.transform(trainX)
testX = tfv.transform(testX)
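
As a sanity check (not part of the original run), you can inspect the size of the fitted vocabulary and the shape of the resulting sparse matrices:

In [ ]:
print len(tfv.vocabulary_), "tf-idf features"
print trainX.shape, testX.shape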

In [50]:
y_train = np_utils.to_categorical(trainY.values - 1, 4)
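
The relevance labels run from 1 to 4, so they are shifted down by one before one-hot encoding. A tiny illustration (not from the original run):

In [ ]:
# to_categorical maps the shifted labels 0..3 to 4-column one-hot rows,
# e.g. label 1 -> [1, 0, 0, 0] and label 4 -> [0, 0, 0, 1]
print np_utils.to_categorical(np.array([1, 4]) - 1, 4)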

In [72]:
max_features=20000
maxlen = 100 # cut texts after this number of words (among top max_features most common words)
batch_size = 10
nb_epoch = 5

print "Pad sequences (samples x time)"
X_train = sequence.pad_sequences(trainX.toarray(), maxlen=maxlen)
X_test = sequence.pad_sequences(testX.toarray(), maxlen=maxlen)
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape

print('Build model...')
model = Sequential()
model.add(Embedding(10158, 256))  # old Keras 0.x API: Embedding(input_dim, output_dim)
model.add(LSTM(256, 128))         # LSTM(input_dim, output_dim); try using a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(128, 4))          # 4 output units, one per relevance level
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")

print "Train..."
mm = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True)
score = model.evaluate(X_train, y_train, batch_size=batch_size)  # note: this evaluates on the training data, despite the "Test score" label
print 'Test score:', score

classes = model.predict_classes(X_train, batch_size=batch_size)
acc = np_utils.accuracy(classes, y_train)
print 'accuracy:', acc


Pad sequences (samples x time)
X_train shape: (10158, 100)
X_test shape: (22513, 100)
Build model...
Train...
Train on 9142 samples, validate on 1016 samples
Epoch 0
9142/9142 [==============================] - 219s - loss: 0.4616 - acc.: 0.8013 - val. loss: 0.4423 - val. acc.: 0.8105
Epoch 1
9142/9142 [==============================] - 218s - loss: 0.4582 - acc.: 0.8030 - val. loss: 0.4425 - val. acc.: 0.8105
Epoch 2
9142/9142 [==============================] - 217s - loss: 0.4573 - acc.: 0.8030 - val. loss: 0.4428 - val. acc.: 0.8105
Epoch 3
9142/9142 [==============================] - 217s - loss: 0.4570 - acc.: 0.8031 - val. loss: 0.4420 - val. acc.: 0.8105
Epoch 4
9142/9142 [==============================] - 217s - loss: 0.4576 - acc.: 0.8030 - val. loss: 0.4424 - val. acc.: 0.8105
Epoch 5
9142/9142 [==============================] - 217s - loss: 0.4572 - acc.: 0.8030 - val. loss: 0.4422 - val. acc.: 0.8105
Epoch 6
9142/9142 [==============================] - 217s - loss: 0.4568 - acc.: 0.8030 - val. loss: 0.4423 - val. acc.: 0.8105
Epoch 7
9142/9142 [==============================] - 216s - loss: 0.4569 - acc.: 0.8030 - val. loss: 0.4429 - val. acc.: 0.8105
Epoch 8
9142/9142 [==============================] - 216s - loss: 0.4562 - acc.: 0.8030 - val. loss: 0.4422 - val. acc.: 0.8105
Epoch 9
9142/9142 [==============================] - 216s - loss: 0.4562 - acc.: 0.8030 - val. loss: 0.4423 - val. acc.: 0.8105
10158/10158 [==============================] - 32s - loss: 0.4530    
Test score: 0.452998043504

In [73]:
df = pd.DataFrame({'epoch' : mm.epoch, 'loss' : mm.loss, 'accuracy' : mm.accuracy})
df.index = df['epoch']

In [79]:
df['accuracy'].plot()


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x434f154d0>

In [80]:
df['loss'].plot()


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x447279d10>

In [97]:
prediction = model.predict_classes(X_test, batch_size=batch_size)


22513/22513 [==============================] - 92s    

In [100]:
# predict_classes with class_mode="binary" returns a thresholded 0/1 matrix (one column per class),
# so collapse it to a single class index (0-3) per row
prediction = np_utils.probas_to_classes(prediction)

In [103]:
# Create your submission file
# shift predictions back to the original 1-4 relevance scale (labels were shifted down by 1 for to_categorical)
submission = pd.DataFrame({"id": idx, "prediction": prediction + 1})
submission.to_csv("submission.csv", index=False)