In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
In [2]:
df_sub = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/sampleSubmission.csv")
print df_sub.columns
print df_sub.dtypes
df_sub.head(3)
Out[2]:
In [3]:
df = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/train.csv", low_memory=False)
print df.columns
In [4]:
df.head(4)
Out[4]:
In [5]:
df = pd.read_csv("/Users/dikien/Downloads/Search Results Relevance/test.csv", low_memory=False)
print df.columns
In [6]:
df.head(5)
Out[6]:
In [7]:
# Try out an approach someone else built
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from sklearn import decomposition, grid_search, metrics, pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
In [8]:
# array declarations
sw=[]
s_data = []
s_labels = []
t_data = []
t_labels = []
In [9]:
# Stop-words tweak (a bit more overhead): also cover the "q"/"z"-prefixed variants
stop_words = ['http','www','img','border','0','1','2','3','4','5','6','7','8','9']
stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
for stw in stop_words:
    sw.append("q" + stw)
    sw.append("z" + stw)
stop_words = text.ENGLISH_STOP_WORDS.union(sw)
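The prefixed copies matter because the preprocessing in cell [41] below marks query tokens with a leading "q" and title tokens with a leading "z"; without "qthe", "zand", and so on in the set, those prefixed stop words would slip past the filter. A quick sanity check (illustrative only; note that the vectorizers later in this notebook actually pass stop_words='english', so this custom set is not wired into them as written):
print "qthe" in stop_words, "zthe" in stop_words   # True True
print "the" in stop_words                          # True (the plain stop words are still covered)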
In [10]:
# Load the training and test files
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv')
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv')
In [11]:
# We don't need the ID columns as features, but keep the test ids for the submission file
idx = test.id.values.astype(int)
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
In [12]:
# create labels. drop useless columns
y = train.median_relevance.values
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)
In [13]:
print train.shape
print y.shape
In [14]:
train.head(3)
Out[14]:
In [15]:
# Do some lambda magic on the text columns: join query and product_title into one string per row
traindata = list(train.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))
testdata = list(test.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))
In [16]:
traindata[0]
Out[16]:
In [17]:
# the infamous tfidf vectorizer (Do you remember this one?)
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
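A small illustration (not part of the original script) of how this configuration tokenizes text: the permissive token_pattern keeps even single-character tokens, and ngram_range=(1, 2) adds adjacent word pairs on top of the unigrams. build_analyzer() can be called before fitting:
analyzer = tfv.build_analyzer()
print analyzer("bridal shower decorations accent pillow")
# expected: unigrams plus adjacent bigrams, e.g. [u'bridal', u'shower', ..., u'accent pillow']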
In [18]:
# Fit TFIDF
tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)
In [19]:
# Initialize SVD
svd = TruncatedSVD()
# Initialize the standard scaler
scl = StandardScaler()
# We will use SVM here..
svm_model = SVC()
In [20]:
# Create the pipeline
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])
In [21]:
# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components': [400],
              'svm__C': [10]}
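The double-underscore keys follow sklearn's pipeline convention: 'svd__n_components' sets n_components on the step named 'svd', and 'svm__C' sets C on the step named 'svm'. A wider grid could be searched at the cost of more fits; the values and the variable name below are purely illustrative:
wider_param_grid = {'svd__n_components': [200, 300, 400],
                    'svm__C': [1, 10, 12]}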
In [22]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings
def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings. Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement). A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings. These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating = None
    max_rating = None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))
    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)
    numerator = 0.0
    denominator = 0.0
    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items
    return (1.0 - numerator / denominator)
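A couple of quick sanity checks on the metric (not in the original script): perfect agreement gives a kappa of exactly 1.0, and any disagreement pulls the value below 1.
print quadratic_weighted_kappa([1, 2, 3, 4], [1, 2, 3, 4])   # 1.0
print quadratic_weighted_kappa([1, 2, 3, 4], [1, 2, 3, 3])   # < 1.0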
In [23]:
# Kappa Scorer
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)
In [24]:
# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, scoring=kappa_scorer,
                                 verbose=1, n_jobs=1, iid=True, refit=True, cv=2)
In [25]:
# Fit Grid Search Model
model.fit(X, y)
Out[25]:
In [26]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [31]:
best_parameters
Out[31]:
In [32]:
# Get best model
best_model = model.best_estimator_
# Fit model with best parameters optimized for quadratic_weighted_kappa
best_model.fit(X, y)
preds = best_model.predict(X_test)
In [35]:
print preds.shape
print preds[0:10]
In [36]:
# First attempt
submission = pd.DataFrame({"id": idx, "prediction": preds})
submission.to_csv("first_try.csv", index=False)
# Got 0.56948, 409th place haha
In [39]:
submission.head(2)
Out[39]:
In [40]:
# Reload the data, filling missing values with empty strings
train = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/train.csv').fillna("")
test = pd.read_csv('/Users/dikien/Downloads/Search Results Relevance/test.csv').fillna("")
In [41]:
# Remove HTML, strip non-alphanumeric characters, and make query and title tokens distinct
# count features by prefixing them with "q" and "z" (those prefixes are covered by the stop-words tweak above)
stemmer = PorterStemmer()
## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""
    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                # final_doc.append(wordnet.lemmatize(word))  # note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs
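# Note: stemmerUtility above is defined but never called below; the loops that follow
# stem tokens directly with the module-level `stemmer` instance.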
for i in range(len(train.id)):
    s = (" ").join(["q" + z for z in BeautifulSoup(train["query"][i]).get_text(" ").split(" ")]) + " " + \
        (" ").join(["z" + z for z in BeautifulSoup(train.product_title[i]).get_text(" ").split(" ")]) + " " + \
        BeautifulSoup(train.product_description[i]).get_text(" ")
    s = re.sub("[^a-zA-Z0-9]", " ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s_data.append(s)
    s_labels.append(str(train["median_relevance"][i]))

for i in range(len(test.id)):
    s = (" ").join(["q" + z for z in BeautifulSoup(test["query"][i]).get_text().split(" ")]) + " " + \
        (" ").join(["z" + z for z in BeautifulSoup(test.product_title[i]).get_text().split(" ")]) + " " + \
        BeautifulSoup(test.product_description[i]).get_text()
    s = re.sub("[^a-zA-Z0-9]", " ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    t_data.append(s)
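To see what this encoding produces, it can help to peek at one processed row (illustrative check, just printing data built above): query tokens carry a "q" prefix, title tokens a "z" prefix, the description is left unprefixed, and every token is Porter-stemmed.
print s_data[0][:120]
print s_labels[0]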
In [42]:
# Create the sklearn pipeline, fit it on all the training data, and predict the test data
clf = Pipeline([
    ('v', TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')),
    ('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)),
    ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
    ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False,
                tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=-1, random_state=None))])
clf.fit(s_data, s_labels)
t_labels = clf.predict(t_data)
In [43]:
import math
p3 = []
for i in range(len(preds)):
    x = (int(t_labels[i]) + preds[i]) / 2
    x = math.floor(x)
    p3.append(int(x))
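The same blend written with numpy (an equivalent, purely illustrative one-liner; it assumes t_labels is an array of digit strings, as produced by clf.predict on the string labels, and preds is an integer array):
p3_vec = np.floor((t_labels.astype(int) + preds) / 2.0).astype(int)
assert list(p3_vec) == p3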
In [44]:
# Create the second (blended) submission file
submission = pd.DataFrame({"id": idx, "prediction": p3})
submission.to_csv("second.csv", index=False)
# Scored 0.62491