In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import *
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, pipeline, metrics, grid_search
In [9]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a and rater_b are equal-length sequences (lists, tuples or numpy
    arrays) of integer ratings. min_rating / max_rating bound the rating
    scale; when omitted they are taken from the data.

    The result is a list of lists where conf_mat[i][j] is the number of
    items that rater_a scored (min_rating + i) and rater_b scored
    (min_rating + j).

    Raises AssertionError if the two sequences differ in length and
    IndexError if a rating lies outside [min_rating, max_rating].
    """
    assert(len(rater_a) == len(rater_b))
    # The bounds are computed per rater and then combined: "rater_a + rater_b"
    # would concatenate lists but add numpy arrays element-wise, which gives
    # wrong bounds for array input.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        row = a - min_rating
        col = b - min_rating
        # A negative offset would silently wrap around to the end of the
        # list; ratings above max_rating already raise IndexError below.
        if row < 0 or col < 0:
            raise IndexError("rating below min_rating")
        conf_mat[row][col] += 1
    return conf_mat
def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made

    ratings is a sequence of integer ratings. min_rating / max_rating bound
    the rating scale; when omitted they are taken from the data.

    The result is a list where entry i is the number of times the rating
    (min_rating + i) occurs.

    Raises IndexError if a rating lies outside [min_rating, max_rating].
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0] * num_ratings
    for r in ratings:
        offset = r - min_rating
        # A negative offset would silently wrap around and be counted in the
        # wrong bin; ratings above max_rating already raise IndexError below.
        if offset < 0:
            raise IndexError("rating below min_rating")
        hist_ratings[offset] += 1
    return hist_ratings
def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa

    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings. Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement). A kappa value of 0 is expected if all agreement is due to
    chance.

    quadratic_weighted_kappa(y, y_pred), where y and y_pred each correspond
    to a sequence of ratings. Both are converted to integer numpy arrays
    and must have the same length.

    The range of possible ratings is taken from the smallest and largest
    values found in y and y_pred.

    Returns 1.0 when both raters gave one identical rating to every item
    (the weighted formula would otherwise divide zero by zero).

    Raises AssertionError if the lengths differ, and ValueError (from
    min()) if the inputs are empty.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one distinct rating across both raters: every weight below
        # would be 0.0 / 0.0. The raters agree on every item.
        return 1.0
    num_scored_items = float(len(rater_a))
    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)
    # Largest possible squared distance, used to scale weights into [0, 1].
    max_squared_distance = pow(num_ratings - 1, 2.0)
    numerator = 0.0
    denominator = 0.0
    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected in cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / max_squared_distance
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items
    return (1.0 - numerator / denominator)
# Wrap the kappa metric as a scikit-learn scorer (a larger value is better)
kappa_scorer = metrics.make_scorer(
    quadratic_weighted_kappa,
    greater_is_better=True,
)
In [3]:
# Load the training and test files
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Keep the test ids (used later for the submission) before dropping the column
idx = test['id'].values.astype(int)
# The id columns are not needed as features; missing values are filled with " "
train = train.drop('id', axis=1).fillna(" ")
test = test.drop('id', axis=1).fillna(" ")
# Cleaning plan: remove html, remove anything that is not a letter or digit,
# and make query and title tokens unique features for counts by prefixing
# them (accounted for in stopwords tweak)
stemmer = PorterStemmer()
## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""

    @staticmethod
    def stemPorter(review_text):
        """Porter-stem every item of every document.

        review_text is an iterable of documents; each document is iterated
        item by item and every item is passed to PorterStemmer.stem
        (presumably each document is a list of word tokens -- a plain
        string would be stemmed character by character).

        Returns a list with one list of stems per document.
        """
        porter = PorterStemmer()
        return [[porter.stem(word) for word in doc] for doc in review_text]
def _clean_text(text, prefix=""):
    """Strip markup, keep only letters/digits, stem and prefix each token.

    text is the raw string (it may contain HTML); prefix is prepended to
    every stemmed token. Returns the tokens joined by single spaces.
    Relies on the module-level `stemmer`.
    """
    # The parser is named explicitly: without it BeautifulSoup picks
    # whichever parser happens to be installed, so the extracted text can
    # differ between machines.
    text = BeautifulSoup(text, "html.parser").get_text(" ")
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    return " ".join([prefix + stemmer.stem(z) for z in text.split()])


def clean(text):
    """Clean free text (used for product descriptions); tokens get no prefix."""
    return _clean_text(text)


def cleanq(text):
    """Clean a query; every token is prefixed with "q"."""
    return _clean_text(text, "q")


def cleant(text):
    """Clean a product title; every token is prefixed with "t"."""
    return _clean_text(text, "t")
# clean data: each text column of both frames gets its matching cleaner
for frame in (train, test):
    for column, cleaner in (('query', cleanq),
                            ('product_title', cleant),
                            ('product_description', clean)):
        frame[column] = frame[column].apply(func=cleaner)
In [4]:
def merge_rows(x):
    """Join the first three fields of a row into one space-separated string.

    x is a row holding, in order, the query, the product title and the
    product description (a pandas Series as passed by
    DataFrame.apply(axis=1), or any list/tuple). Fields are read by
    position, not by label.
    """
    # pandas objects expose .iloc for positional access; plain x[0] on a
    # label-indexed Series relies on a deprecated positional fallback.
    # Lists and tuples have no .iloc and are indexed directly.
    fields = getattr(x, 'iloc', x)
    query = fields[0]
    product_title = fields[1]
    product_description = fields[2]
    return query + ' ' + product_title + ' ' + product_description
# One merged text document per row; the target is the median relevance
trainX = train[['query', 'product_title', 'product_description']].apply(merge_rows, axis=1)
trainY = train["median_relevance"]
testX = test[['query', 'product_title', 'product_description']].apply(merge_rows, axis=1)
In [10]:
# TF-IDF over word unigrams and bigrams. The on/off options are passed as
# real booleans rather than the integer 1.
tfv = TfidfVectorizer(min_df=3, max_features=None, max_df=500,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')
# Fit TFIDF on the training text and transform both sets.
# NOTE: trainX / testX are rebound here from raw text to TF-IDF matrices, so
# this cell is not idempotent -- re-run the cell that builds trainX / testX
# before running it a second time.
trainX = tfv.fit_transform(trainX)
testX = tfv.transform(testX)
In [14]:
# Initialize SVD (n_components is supplied by the grid search below)
svd = TruncatedSVD(
    algorithm='randomized',
    n_iter=5,
    random_state=None,
    tol=0.0,
)
# Initialize the standard scaler
scl = StandardScaler(
    copy=True,
    with_mean=True,
    with_std=True,
)
# We will use SVM here (C and gamma are supplied by the grid search below)
svm_model = SVC(
    kernel='rbf',
    degree=3,
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=4000,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    random_state=None,
)
# Create the pipeline: SVD -> scaling -> SVM
clf = pipeline.Pipeline([
    ('svd', svd),
    ('scl', scl),
    ('svm', svm_model),
])
# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {
    'svd__n_components': list(range(100, 1001, 100)),
    'svm__C': range(10, 500, 20),
    'svm__gamma': [0.0, 0.1, 0.01, 0.001, 0.5, 0.05, 0.005],
}
In [15]:
# Initialize Grid Search Model
# cv must be at least 2: k-fold cross-validation needs at least one
# train/test split, so cv=1 makes fit() raise ValueError. Two folds is the
# cheapest valid setting; raise it for more stable score estimates.
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=2, n_jobs=1, iid=True, refit=True, cv=2)
In [17]:
# Run the grid search on the training features and relevance labels
model.fit(X=trainX, y=trainY)
Out[17]:
In [18]:
# Report the best cross-validated score found by the grid search
print("Best score: %0.3f" % model.best_score_)
# List the winning value of every parameter that was part of the grid
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [ ]:
# Full parameter dictionary of the best estimator. Written as a function
# call so it runs on both Python 2 and Python 3.
print(best_parameters)
In [5]:
#create sklearn pipeline, fit all, and predict test data
clf = Pipeline([
    ('v', TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')),
    ('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)),
    ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
    ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=-1, random_state=None)),
])
# This pipeline starts with its own TfidfVectorizer, so it needs raw text.
# trainX / testX are rebound to TF-IDF matrices by the vectorizer cell
# earlier in the notebook, so the merged text is rebuilt here from the
# cleaned frames. That way the cell works in a top-to-bottom run.
text_columns = ['query', 'product_title', 'product_description']
train_text = train[text_columns].apply(func=merge_rows, axis=1)
test_text = test[text_columns].apply(func=merge_rows, axis=1)
clf.fit(train_text, trainY)
prediction = clf.predict(test_text)
In [8]:
# Write the submission file: one row per test id with its predicted relevance
submission = pd.DataFrame(
    {"id": idx, "prediction": prediction},
    columns=["id", "prediction"],
)
submission.to_csv("submission.csv", index=False)