In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [2]:
df_sub = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/sampleSubmission.csv")
print df_sub.columns
print df_sub.dtypes
df_sub.head(3)


Index([u'id', u'prediction'], dtype='object')
id            int64
prediction    int64
dtype: object
Out[2]:
id prediction
0 3 3
1 6 3
2 9 3

In [3]:
df = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/train.csv", low_memory=False)
print df.columns


Index([u'id', u'query', u'product_title', u'product_description',
       u'median_relevance', u'relevance_variance'],
      dtype='object')

In [4]:
df.head(4)


Out[4]:
id query product_title product_description median_relevance relevance_variance
0 1 bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a hea... 1 0.000
1 2 led christmas lights Set of 10 Battery Operated Multi LED Train Chr... Set of 10 Battery Operated Train Christmas Lig... 4 0.000
2 4 projector ViewSonic Pro8200 DLP Multimedia Projector NaN 4 0.471
3 5 wine rack Concept Housewares WR-44526 Solid-Wood Ceiling... Like a silent and sturdy tree, the Southern En... 4 0.000

In [5]:
df = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/test.csv", low_memory=False)
print df.columns


Index([u'id', u'query', u'product_title', u'product_description'], dtype='object')

In [6]:
df.head(5)


Out[6]:
id query product_title product_description
0 3 electric griddle Star-Max 48 in Electric Griddle NaN
1 6 phillips coffee maker Philips SENSEO HD7810 WHITE Single Serve Pod C... NaN
2 9 san francisco 49ers 2013 San Francisco 49ers Clock A 2013 San Francisco 49ers clock is the ultima...
3 11 aveeno shampoo AVEENO 10.5FLOZ NRSH SHINE SH Water, Ammonium Lauryl Sulfate, Dimethicone, S...
4 12 flea and tick control for dogs Merial Frontline Plus Flea and Tick Control fo... NaN

In [7]:
# Trying out a script someone else wrote
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import decomposition, pipeline, metrics, grid_search

In [8]:
# Container lists for prefixed stop words and preprocessed train/test data and labels
sw=[]
s_data = []
s_labels = []
t_data = []
t_labels = []

In [9]:
# Stop-words tweak: add 'q'/'z'-prefixed variants so prefixed query/title tokens can be filtered as well (adds some overhead)
stop_words = ['http','www','img','border','0','1','2','3','4','5','6','7','8','9']
stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
for stw in stop_words:
    sw.append("q"+stw)
    sw.append("z"+stw)
stop_words = text.ENGLISH_STOP_WORDS.union(sw)
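
A quick check (a minimal sketch, not part of the original run) shows the prefixed variants that end up in the final set; they mirror the "q"/"z" prefixes applied to query and title tokens later in the notebook. Note that both TfidfVectorizer calls below pass stop_words='english', so this custom set is built but never actually handed to a vectorizer.

# Illustrative only: 'the' appears in both prefixed forms.
print "qthe" in stop_words, "zthe" in stop_words   # True True
print len(sw)                                      # twice the size of the union built above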

In [10]:
# Load the training and test files
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv')
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv')

In [11]:
# We don't need the id columns; keep the test ids for the submission file
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [12]:
# Create labels and drop the target-related columns from the training features
y = train.median_relevance.values
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)

In [13]:
print train.shape
print y.shape


(10158, 3)
(10158,)

In [14]:
train.head(3)


Out[14]:
query product_title product_description
0 bridal shower decorations Accent Pillow with Heart Design - Red/Black Red satin accent pillow embroidered with a hea...
1 led christmas lights Set of 10 Battery Operated Multi LED Train Chr... Set of 10 Battery Operated Train Christmas Lig...
2 projector ViewSonic Pro8200 DLP Multimedia Projector NaN

In [15]:
# Combine query and product_title into a single text string per row
traindata = list(train.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))
testdata = list(test.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))

In [16]:
traindata[0]


Out[16]:
'bridal shower decorations Accent Pillow with Heart Design - Red/Black'

In [17]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

In [18]:
# Fit TFIDF
tfv.fit(traindata)
X =  tfv.transform(traindata) 
X_test = tfv.transform(testdata)
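
A small sanity check (a sketch, not from the original run): the transform should yield one row per document and a shared vocabulary for train and test.

print X.shape[0] == len(traindata)        # True: one row per training document
print X_test.shape[0] == len(testdata)    # True: one row per test document
print X.shape[1] == X_test.shape[1]       # True: same feature space for both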

In [19]:
# Initialize SVD
svd = TruncatedSVD()

# Initialize the standard scaler 
scl = StandardScaler()

# We will use SVM here..
svm_model = SVC()

In [20]:
# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

In [21]:
# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components' : [400],
              'svm__C': [10]}
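
The grid above has a single candidate per parameter, so GridSearchCV is really just cross-validating one configuration. A genuine search would use a wider grid, e.g. the following (illustrative values only; these were not run here):

# Hypothetical wider grid -- values are illustrative, not tuned.
wider_param_grid = {'svd__n_components': [200, 300, 400],
                    'svm__C': [1, 5, 10]}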

In [22]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)
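
A quick behavioral check of the metric (a minimal sketch, not part of the original notebook): identical rating vectors give a kappa of exactly 1.0, and any disagreement pulls the score below that.

print quadratic_weighted_kappa([1, 2, 3, 4], [1, 2, 3, 4])        # 1.0 (perfect agreement)
print quadratic_weighted_kappa([1, 2, 3, 4], [1, 2, 3, 3]) < 1.0  # True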

In [23]:
# Kappa Scorer 
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [24]:
# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=1, iid=True, refit=True, cv=2)

In [25]:
# Fit Grid Search Model
model.fit(X, y)


Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] svm__C=10, svd__n_components=400 ................................
[CV] ....... svm__C=10, svd__n_components=400, score=0.552638 -  40.7s
[CV] svm__C=10, svd__n_components=400 ................................
[CV] ....... svm__C=10, svd__n_components=400, score=0.541797 -  41.8s
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   40.8s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:  1.4min

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min finished
Out[25]:
GridSearchCV(cv=2,
       estimator=Pipeline(steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'svm__C': [10], 'svd__n_components': [400]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring=make_scorer(quadratic_weighted_kappa), verbose=10)

In [26]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Best score: 0.547
Best parameters set:
	svd__n_components: 400
	svm__C: 10

In [31]:
best_parameters


Out[31]:
{'scl': StandardScaler(copy=True, with_mean=True, with_std=True),
 'scl__copy': True,
 'scl__with_mean': True,
 'scl__with_std': True,
 'svd': TruncatedSVD(algorithm='randomized', n_components=400, n_iter=5,
        random_state=None, tol=0.0),
 'svd__algorithm': 'randomized',
 'svd__n_components': 400,
 'svd__n_iter': 5,
 'svd__random_state': None,
 'svd__tol': 0.0,
 'svm': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False),
 'svm__C': 10,
 'svm__cache_size': 200,
 'svm__class_weight': None,
 'svm__coef0': 0.0,
 'svm__degree': 3,
 'svm__gamma': 0.0,
 'svm__kernel': 'rbf',
 'svm__max_iter': -1,
 'svm__probability': False,
 'svm__random_state': None,
 'svm__shrinking': True,
 'svm__tol': 0.001,
 'svm__verbose': False}

In [32]:
# Get best model
best_model = model.best_estimator_

# Fit model with best parameters optimized for quadratic_weighted_kappa
best_model.fit(X, y)
preds = best_model.predict(X_test)

In [35]:
print preds.shape
print preds[0:10]


(22513,)
[4 4 3 4 4 4 4 4 4 3]

In [36]:
# First attempt
submission = pd.DataFrame({"id": idx, "prediction": preds})
submission.to_csv("first_try.csv", index=False)
# Scored 0.56948, 409th place lol

In [39]:
submission.head(2)


Out[39]:
id prediction
0 3 4
1 6 4

In [40]:
#load data
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv').fillna("")
test  = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv').fillna("")

In [41]:
# Remove HTML, drop non-alphanumeric characters, and prefix query ('q') and title ('z') tokens so they count as distinct features (accounted for in the stop-words tweak above)
stemmer = PorterStemmer()
## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                #final_doc.append(wordnet.lemmatize(word)) #note that lemmatize() can also takes part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs


for i in range(len(train.id)):
    s=(" ").join(["q"+ z for z in BeautifulSoup(train["query"][i]).get_text(" ").split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(train.product_title[i]).get_text(" ").split(" ")]) + " " + BeautifulSoup(train.product_description[i]).get_text(" ")
    s=re.sub("[^a-zA-Z0-9]"," ", s)
    s= (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s_data.append(s)
    s_labels.append(str(train["median_relevance"][i]))
for i in range(len(test.id)):
    s=(" ").join(["q"+ z for z in BeautifulSoup(test["query"][i]).get_text().split(" ")]) + " " + (" ").join(["z"+ z for z in BeautifulSoup(test.product_title[i]).get_text().split(" ")]) + " " + BeautifulSoup(test.product_description[i]).get_text()
    s=re.sub("[^a-zA-Z0-9]"," ", s)
    s= (" ").join([stemmer.stem(z) for z in s.split(" ")])
    t_data.append(s)


/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65497012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65516012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/6552101" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januaryb/65527" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
/Users/dikien/anaconda/lib/python2.7/site-packages/bs4/__init__.py:189: UserWarning: "http://i104.photobucket.com/albums/m175/champions_on_display/wincraft2013/januarya/14146012.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

In [42]:
# Create sklearn pipeline, fit on all training data, and predict the test data
clf = Pipeline([('v',TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words = 'english')), 
('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)), 
('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), 
('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=-1, random_state=None))])
clf.fit(s_data, s_labels)
t_labels = clf.predict(t_data)

In [43]:
import math
p3 = []
for i in range(len(preds)):
    x = (int(t_labels[i]) + preds[i])/2
    x = math.floor(x)
    p3.append(int(x))
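
Since both operands are Python ints, the division above already floors in Python 2, making the math.floor call redundant; a vectorized equivalent (a sketch, assuming t_labels is the array of string labels returned by clf.predict) is:

p3_vec = ((t_labels.astype(int) + preds) // 2).astype(int)
assert list(p3_vec) == p3   # same result as the loop above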

In [44]:
# Create the second (blended) submission file
submission = pd.DataFrame({"id": idx, "prediction": p3})
submission.to_csv("second.csv", index=False)
# Scored 0.62491