In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition, pipeline, metrics, grid_search

In [2]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between the two raters' ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa, a measure of inter-rater
    agreement between two raters that provide discrete numeric ratings.
    Potential values range from -1 (representing complete disagreement)
    to 1 (representing complete agreement).  A kappa value of 0 is
    expected if all agreement is due to chance.
    y and y_pred each correspond to a list of integer ratings of the
    same length; together they are assumed to cover the complete range
    of possible ratings.
    """
    rater_a = y
    rater_b = y_pred
    min_rating = None
    max_rating = None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

# Kappa Scorer 
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)
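
As a quick sanity check (not part of the original run), the metric behaves as expected on toy ratings: identical rating lists give a kappa of 1.0 and a fully reversed ordering gives -1.0.

# Optional sanity check of the kappa implementation (toy data, not from the competition)
print(quadratic_weighted_kappa([1, 2, 3, 4], [1, 2, 3, 4]))  # 1.0  (perfect agreement)
print(quadratic_weighted_kappa([1, 2, 3, 4], [4, 3, 2, 1]))  # -1.0 (complete disagreement)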

In [3]:
# Load the training file
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# we don't need the ID columns
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# Fill missing values with a single space
train = train.fillna(" ")
test  = test.fillna(" ")

# Remove HTML and non-alphanumeric characters; prefix query and title tokens so they become distinct features for counting (accounted for in the stopwords tweak)
stemmer = PorterStemmer()

## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs

def clean(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join([stemmer.stem(z) for z in text.split()])
    return text

def cleanq(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join(["q" + stemmer.stem(z) for z in text.split()])
    return text

def cleant(text):
    text = BeautifulSoup(text).get_text(" ")
    text = re.sub("[^a-zA-Z0-9]"," ", text)
    text = (" ").join(["t" + stemmer.stem(z) for z in text.split()])
    return text
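
# Hypothetical illustration of the cleaning functions on a made-up string (not from the data):
# HTML is stripped, non-alphanumeric characters are dropped, each token is Porter-stemmed,
# and cleanq/cleant prefix tokens with "q"/"t" so query and title terms stay distinguishable.
sample = "<b>Running shoes</b> for women!"
print(clean(sample))   # roughly: "run shoe for women"
print(cleanq(sample))  # roughly: "qrun qshoe qfor qwomen"
print(cleant(sample))  # roughly: "trun tshoe tfor twomen"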

# clean data
train['query'] = train['query'].apply(func=cleanq)
train['product_title'] = train['product_title'].apply(func=cleant)
train['product_description'] = train['product_description'].apply(func=clean)

test['query'] = test['query'].apply(func=cleanq)
test['product_title'] = test['product_title'].apply(func=cleant)
test['product_description'] = test['product_description'].apply(func=clean)


/home/ubuntu/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
(The same UserWarning is emitted several more times for other image URLs in the product descriptions.)

In [4]:
def merge_rows(x):
    # Concatenate query, product_title and product_description into one text field
    query = x['query']
    product_title = x['product_title']
    product_description = x['product_description']
    return query + ' ' + product_title + ' ' + product_description

trainX = train[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
trainY = train["median_relevance"]

testX = test[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
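
An equivalent way to build the merged text is plain column concatenation; a tiny optional alternative shown only for reference (trainX_alt is a hypothetical name and is not used below):

# Optional: same result as merge_rows via direct string concatenation of the columns
trainX_alt = train['query'] + ' ' + train['product_title'] + ' ' + train['product_description']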

In [5]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3,  max_features=None, max_df=500,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

# Fit TFIDF
tfv.fit(trainX)
trainX =  tfv.transform(trainX) 
testX = tfv.transform(testX)
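
An optional quick check (not in the original run) confirms that train and test share the same TF-IDF vocabulary width:

# Optional sanity check: both sparse matrices should have the same number of columns
print(trainX.shape)
print(testX.shape)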

In [6]:
# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [100, 200, 300, 400, 500, 600],
              'svm__C': [10]}

In [7]:
# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=2, n_jobs=6, iid=True, refit=True, cv=2)

In [8]:
# Fit Grid Search Model
model.fit(trainX, trainY)


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[Parallel(n_jobs=6)]: Done   1 out of  12 | elapsed:   21.1s remaining:  3.9min
[Parallel(n_jobs=6)]: Done   9 out of  12 | elapsed:  2.4min remaining:   47.5s
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed:  3.1min finished
[CV] svm__C=10, svd__n_components=100 ................................
[CV] svm__C=10, svd__n_components=100 ................................
[CV] svm__C=10, svd__n_components=200 ................................
[CV] svm__C=10, svd__n_components=200 ................................
[CV] svm__C=10, svd__n_components=300 ................................
[CV] svm__C=10, svd__n_components=300 ................................
[CV] ....................... svm__C=10, svd__n_components=100 -  21.0s
[CV] ....................... svm__C=10, svd__n_components=100 -  20.2s
[CV] ....................... svm__C=10, svd__n_components=200 -  41.0s
[CV] ....................... svm__C=10, svd__n_components=200 -  39.5s
[CV] ....................... svm__C=10, svd__n_components=300 - 1.1min
[CV] ....................... svm__C=10, svd__n_components=300 - 1.1min
[CV] svm__C=10, svd__n_components=400 ................................
[CV] svm__C=10, svd__n_components=400 ................................
[CV] svm__C=10, svd__n_components=500 ................................
[CV] svm__C=10, svd__n_components=500 ................................
[CV] svm__C=10, svd__n_components=600 ................................
[CV] svm__C=10, svd__n_components=600 ................................
[CV] ....................... svm__C=10, svd__n_components=400 - 1.7min
[CV] ....................... svm__C=10, svd__n_components=400 - 1.6min
[CV] ....................... svm__C=10, svd__n_components=500 - 1.8min
[CV] ....................... svm__C=10, svd__n_components=500 - 1.7min
[CV] ....................... svm__C=10, svd__n_components=600 - 2.0min
[CV] ....................... svm__C=10, svd__n_components=600 - 2.0min
Out[8]:
GridSearchCV(cv=2,
       estimator=Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=4000, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=6,
       param_grid={'svm__C': [10], 'svd__n_components': [100, 200, 300, 400, 500, 600]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring=make_scorer(quadratic_weighted_kappa), verbose=2)

In [9]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Best score: 0.528
Best parameters set:
	svd__n_components: 200
	svm__C: 10

In [10]:
# Second attempt

In [11]:
# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [200, 1000, 1500, 2000, 2500, 3000],
              'svm__C': [10]}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[Parallel(n_jobs=6)]: Done   1 out of  12 | elapsed:   41.4s remaining:  7.6min
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed: 23.5min finished
Best score: 0.526
Best parameters set:
	svd__n_components: 200
	svm__C: 10

In [12]:
# svd__n_components: 200 looks like the best choice
# Now let's optimize the svm__C parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [200],
              'svm__C': [10,20,30,40,50,60]}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[Parallel(n_jobs=6)]: Done   1 out of  12 | elapsed:   47.2s remaining:  8.6min
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed:  1.7min finished
Best score: 0.535
Best parameters set:
	svd__n_components: 200
	svm__C: 10

In [13]:
# Now let's optimize the svm__C parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [200],
              'svm__C': [10, 100, 200, 300, 400, 500]}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   46.5s
[Parallel(n_jobs=6)]: Done   2 out of  12 | elapsed:   48.3s remaining:  4.0min
[Parallel(n_jobs=6)]: Done  12 out of  12 | elapsed:  1.7min finished
Best score: 0.536
Best parameters set:
	svd__n_components: 200
	svm__C: 10

In [14]:
# Now let's optimize the svm__C parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0, n_components=200)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svm__C': [1,2,3,4,5,6,7,8,9,10]}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 10 candidates, totalling 20 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   46.8s
[Parallel(n_jobs=6)]: Done  10 out of  20 | elapsed:  1.6min remaining:  1.6min
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:  2.7min finished
Best score: 0.535
Best parameters set:
	svm__C: 9

In [16]:
# Now let's optimize the svm__C parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0, n_components=200)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svm__C': [9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9]}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 9 candidates, totalling 18 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   44.6s
[Parallel(n_jobs=6)]: Done   8 out of  18 | elapsed:  1.5min remaining:  1.9min
[Parallel(n_jobs=6)]: Done  18 out of  18 | elapsed:  2.2min finished
Best score: 0.536
Best parameters set:
	svm__C: 9.6

In [18]:
# Now let's optimize the svm__C parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0, n_components=200)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svm__C': np.random.uniform(low=9.5, high=9.7, size=30)}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 30 candidates, totalling 60 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   44.4s
[Parallel(n_jobs=6)]: Done  50 out of  60 | elapsed:  6.3min remaining:  1.3min
[Parallel(n_jobs=6)]: Done  60 out of  60 | elapsed:  7.1min finished
Best score: 0.542
Best parameters set:
	svm__C: 9.6149819013819808
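
Drawing svm__C values with np.random.uniform inside GridSearchCV is effectively a hand-rolled random search. A minimal sketch of the same idea with RandomizedSearchCV, assuming the clf pipeline and kappa_scorer defined above and the legacy sklearn.grid_search module used throughout this notebook:

# Sketch only: random search over C instead of enumerating sampled values in a grid
from scipy.stats import uniform
from sklearn.grid_search import RandomizedSearchCV

param_dist = {'svm__C': uniform(loc=9.5, scale=0.2)}  # samples C uniformly in [9.5, 9.7]
rnd_model = RandomizedSearchCV(estimator=clf, param_distributions=param_dist,
                               n_iter=30, scoring=kappa_scorer,
                               verbose=1, n_jobs=6, refit=True, cv=2)
# rnd_model.fit(trainX, trainY)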

In [19]:
# Let's optimize the svm__gamma parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0, n_components=200)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', C=9.6149819013819808, degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svm__gamma': [0.0, 0.1, 0.01, 0.001, 0.0001, 0.5, 0.05, 0.005, 0.0005, 0.3, 0.03, 0.003, 0.0003
                            ,0.7, 0.07, 0.007, 0.0007]}
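# Note: in the scikit-learn version used here (pre-0.17), gamma=0.0 is not "no kernel width"
# but the default behaviour, i.e. the SVC falls back to gamma = 1/n_features.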

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 17 candidates, totalling 34 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   46.2s
[Parallel(n_jobs=6)]: Done  34 out of  34 | elapsed:  4.9min finished
Best score: 0.536
Best parameters set:
	svm__gamma: 0.007

In [20]:
# Let's optimize the svm__gamma parameter

# Initialize SVD
svd = TruncatedSVD(algorithm='randomized', n_iter=5, random_state=None, tol=0.0, n_components=200)

# Initialize the standard scaler 
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

# We will use SVM here..
svm_model = SVC(kernel='rbf', C=9.6149819013819808, degree=3, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=4000, class_weight=None, verbose=False, max_iter=-1, random_state=None)

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svm__gamma': np.random.uniform(low=0.006, high=0.008, size=100)}

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=6, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(trainX, trainY)

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 100 candidates, totalling 200 fits
[Parallel(n_jobs=6)]: Done   1 jobs       | elapsed:   48.6s
[Parallel(n_jobs=6)]: Done  50 jobs       | elapsed:  6.7min
[Parallel(n_jobs=6)]: Done 200 out of 200 | elapsed: 24.5min finished
Best score: 0.544
Best parameters set:
	svm__gamma: 0.0063585387028907208

In [21]:
# Get best model
best_model = model.best_estimator_
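# Note: since the grid search was run with refit=True, best_estimator_ has already been
# refit on the full training data; the explicit fit() below simply retrains the same model.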

# Fit model with best parameters optimized for quadratic_weighted_kappa
best_model.fit(trainX, trainY)
preds = best_model.predict(testX)

In [30]:
# Create your submission file
submission = pd.DataFrame({"id": idx, "prediction": preds})
submission.to_csv("submission.csv", index=False)

In [29]:
# kaggle score : 0.59655

In [31]:
# Let's combine the two sets of predictions and average them!
# Kaggle score: 0.61139
s_data = train[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)
s_labels = train["median_relevance"]

t_data = test[['query', 'product_title', 'product_description']].apply(func=merge_rows, axis=1)

# Create the sklearn pipeline, fit it on all training data, and predict on the test data
clf = Pipeline([('v', TfidfVectorizer(min_df=5, max_df=500, max_features=None,
                                      strip_accents='unicode', analyzer='word',
                                      token_pattern=r'\w{1,}', ngram_range=(1, 2),
                                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                                      stop_words='english')),
                ('svd', TruncatedSVD(n_components=200, algorithm='randomized',
                                     n_iter=5, random_state=None, tol=0.0)),
                ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0,
                            shrinking=True, probability=False, tol=0.001,
                            cache_size=1000, class_weight=None, verbose=False,
                            max_iter=-1, random_state=None))])
clf.fit(s_data, s_labels)
t_labels = clf.predict(t_data)

import math
# Blend the two models: average the two integer predictions per row and round down
p3 = []
for i in range(len(preds)):
    x = (int(t_labels[i]) + preds[i]) / 2
    x = math.floor(x)
    p3.append(int(x))
    
submission = pd.DataFrame({"id": idx, "prediction": p3})
submission.to_csv("submission.csv", index=False)
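
For reference, the same floor-of-the-average blending can be written without the loop; a minimal NumPy sketch, assuming preds and t_labels are the integer prediction arrays built above (p3_vec is a hypothetical name):

# Sketch: vectorized equivalent of the blending loop above
p3_vec = np.floor((np.asarray(t_labels, dtype=int) + preds) / 2.0).astype(int)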