In [1]:

    
import warnings
warnings.filterwarnings('ignore')
import re
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.mpl_style = 'default'
from nltk.tokenize import word_tokenize
from util3 import *

Load Files



In [2]:

    
df_train = pd.read_csv('./data/train.csv', encoding='ISO-8859-1')
df_test = pd.read_csv('./data/test.csv', encoding='ISO-8859-1')
df_desp = pd.read_csv('./data/product_descriptions.csv', encoding='ISO-8859-1')
df_attr = pd.read_csv('./data/attributes.csv', encoding='ISO-8859-1')
num_train = df_train.shape[0]

Attributes

Explore what are the useful attributes and how to represent them as features.



In [86]:

    
df_attr['name'].value_counts()[:30]









    Out[86]:





Bullet02                       86248
Bullet03                       86226
MFG Brand Name                 86220
Bullet04                       86174
Bullet01                       85940
Product Width (in.)            61137
Bullet05                       60528
Product Height (in.)           54698
Product Depth (in.)            53652
Product Weight (lb.)           45175
Bullet06                       44901
Color Family                   41508
Bullet07                       34349
Material                       31499
Color/Finish                   28540
Bullet08                       26645
Certifications and Listings    24583
Bullet09                       20567
Assembled Height (in.)         18299
Assembled Width (in.)          18263
Assembled Depth (in.)          18198
Product Length (in.)           16705
Bullet10                       14763
Indoor/Outdoor                 12939
Bullet11                       11784
Commercial / Residential        9530
Bullet12                        8795
ENERGY STAR Certified           8420
Hardware Included               7462
Package Quantity                6904
Name: name, dtype: int64



In [3]:

    
df_attr.dropna(inplace=True)

Among the top 30 attributes, some seem to be not very useful. They are:

Certifications and Listings
Package Quantity
Hardware Included

Spend some time to manually inspect the dataset.



In [4]:

    
def filter_str(df, s, col='search_term'):
    return df[df[col].str.lower().str.contains(s)]



In [124]:

    
filter_str(df, 'hardware')









    Out[124]:






  
    
      
      id
      product_title
      product_uid
      relevance
      search_term
      product_description
      brand
      bullet
      bullet_count
      color
      material
      flag_commercial
      flag_residential
    
  
  
    
      375
      1208
      Liberty 3-3/4 in. Steel Bar Pull (25-Pack)
      100209
      2.67
      bathroom hardware knobs and pulls
      Sleek and sophisticated, this design makes a c...
      Liberty
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      1416
      4355
      Martha Stewart Living 3-3/4 in. Bar Cabinet Ha...
      100748
      2.33
      3/4' hardware
      The Martha Stewart Living Country 3-3/4 in. Po...
      Martha Stewart Living
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      1989
      6161
      Defiant Hartford Satin Nickel Entry Knob
      101061
      2.67
      door lock hardware
      Featuring a lifetime guarantee, Defiant meets ...
      Defiant
      
      0.0
      
      Stainless steel Metal
      -1.0
      -1.0
    
    
      2180
      6735
      Martha Stewart Living 3-3/4 in. Dowel Cabinet ...
      101158
      3.00
      3/4' hardware
      The Martha Stewart Living Country 3-3/4 in. Be...
      Martha Stewart Living
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2295
      7047
      Oz-Post T4-850 4 in. Square Wood Post Anchor (...
      101200
      1.33
      oz metal fence hardware
      Oz-Post is one of the best ways to secure a wo...
      Oz-Post
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2337
      7169
      Liberty 2-3/4 in. or 3 in. Newton Cabinet Hard...
      101223
      2.00
      3/4' hardware
      The Liberty 2-3/4 or 3 in. Satin Nickel Dual-M...
      Liberty
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2339
      7172
      Liberty 2-3/4 in. or 3 in. Newton Cabinet Hard...
      101223
      3.00
      kitchen cabinet drawer center-mount hardware
      The Liberty 2-3/4 or 3 in. Satin Nickel Dual-M...
      Liberty
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2706
      8370
      HDX 4 ft. x 100 ft. 14-Gauge Welded Wire
      101419
      2.00
      wire fences hardware
      The HDX 4 ft. x 100 ft. Welded Wire is made of...
      HDX
      Silver
      1.0
      Silver
      Metal
      -1.0
      -1.0
    
    
      2965
      9189
      Liberty 2-1/2 in. or 3 in. Dark Oil Rubbed Bro...
      101566
      2.00
      kitchen hardware
      Use the Liberty Hardware 2-1/2 or 3 in. Dual M...
      Liberty
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2975
      9221
      Oz-Post Steel 2 Wood Fence Bracket Project Pac...
      101571
      2.33
      wap around hardware
      The WAP-238 from OZCO is a galvanized bracket ...
      Oz-Post
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      2982
      9244
      HDX 1/4 in. x 2 ft. x 5 ft. Hardware Cloth
      101575
      2.00
      wire fences hardware
      A lightweight, flexible and economical wire me...
      HDX
      Silver
      1.0
      Silver
      Metal
      -1.0
      -1.0
    
    
      3300
      10232
      Schlage Plymouth Double Cylinder Antique Brass...
      101755
      3.00
      interior door hardware by schlage
      The Plymouth front entry features a simple cur...
      Schlage
      
      0.0
      
      Solid Brass
      -1.0
      -1.0
    
    
      3421
      10576
      Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...
      101831
      2.00
      kitchen cabinet drawer center-mount hardware
      Sleek and sophisticated, this design makes a c...
      Liberty
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      3422
      10577
      Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...
      101831
      2.00
      kitchen cabinte hardware blue knob
      Sleek and sophisticated, this design makes a c...
      Liberty
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      3423
      10578
      Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...
      101831
      3.00
      liberty campaign hardware
      Sleek and sophisticated, this design makes a c...
      Liberty
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      3775
      11728
      Veranda 1-1/2 oz. Vinyl Fence Cement
      102046
      2.67
      vinyl fence hardware
      Veranda PVC cement glue is specifically design...
      Veranda
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      3792
      11788
      HDX 28 in. x 50 ft. Garden Fence
      102057
      2.00
      wire fences hardware
      If rabbits and other varmints are getting to y...
      HDX
      Silver
      1.0
      Silver
      Metal
      -1.0
      -1.0
    
    
      3933
      12218
      YARDGARD 5 ft. x 50 ft. 14-Gauge Vinyl Galvani...
      102139
      1.67
      wire fences hardware
      Welded wire is a general purpose fence providi...
      YARDGARD
      Green
      1.0
      Green
      Metal
      -1.0
      -1.0
    
    
      4159
      13026
      Tenax 3 ft. x 15 ft. Plastic Black Hardware Net
      102276
      2.33
      wire fences hardware
      Replaces metal hardware net under must applica...
      Tenax
      Black Black
      2.0
      Black Black
      Plastic
      -1.0
      -1.0
    
    
      4613
      14452
      Liberty 3-3/4 in. Steel Bar Cabinet Hardware Pull
      102520
      1.33
      liberty campaign hardware
      Sleek and sophisticated, this design makes a c...
      Liberty
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      4844
      15184
      Oz-Post Steel 2 Wood Fence Bracket WAP-OZ
      102651
      2.67
      oz metal fence hardware
      The remarkable WAP-OZ fence bracket from OZCO ...
      Oz-Post
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      5260
      16453
      Martha Stewart Living 3 in. Bedford Nickel Cyl...
      102905
      2.00
      kitchen hardware
      The Martha Stewart Living Modern 3 in. Bedford...
      Martha Stewart Living
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      6770
      21202
      Hickory Hardware Oil-Rubbed Bronze Surface Sel...
      103960
      2.00
      hickory hardware 469999035
      Update your cabinetry with the Hickory Hardwar...
      Hickory Hardware
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      6835
      21419
      Defiant Springfield Satin Nickel Mushroom Hand...
      104017
      3.00
      front door hardware
      Reinforce your entry door with the Defiant Spr...
      Defiant
      
      0.0
      
      Solid Brass
      -1.0
      -1.0
    
    
      6909
      21626
      HDX 1 in. x 4 ft. x 50 ft. Poultry Netting
      104079
      2.00
      wire fences hardware
      The HDX 1 in. x 4 ft. x 50 ft. 20-Gauge Galvan...
      HDX
      
      0.0
      
      Galvanized Steel
      -1.0
      -1.0
    
    
      6923
      21680
      Everbilt Anti-Sag Gate Kit
      104092
      2.00
      fence gate hardware eyebolt
      The Everbilt Anti-Sag Gate Kit is ideal to eli...
      Everbilt
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      7209
      22570
      ClosetMaid Preloaded Wall Brackets for SuperSl...
      104340
      2.00
      shelves wood hardware
      This set of 2 ClosetMaid Wall Brackets is desi...
      ClosetMaid
      White White
      2.0
      White White
      Resin
      -1.0
      -1.0
    
    
      7363
      23045
      Veranda Aluminum Rail Bracket for Vinyl Fencin...
      104469
      3.00
      vinyl fence hardware
      Choose Veranda's 2 in. x 3 in. Fence Rail Brac...
      Veranda
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      7461
      23327
      Veranda Shadowbox White Vinyl Fence Bracket Kit
      104547
      3.00
      vinyl fence hardware
      Veranda vinyl fencing is The Home Depot's prem...
      Veranda
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      7475
      23381
      Liberty Satin Nickel 1-3/8 in. Large Football ...
      104569
      3.00
      bathroom hardware knobs and pulls
      Use the Liberty 1-3/8 in. Satin Nickel Large F...
      nobrand
      
      0.0
      
      
      -1.0
      -1.0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      220714
      220324
      PartsmasterPro Universal Designer Lever Handle...
      205663
      NaN
      porcelain handle hardware
      Replacing your faucets handles makes your fauc...
      PartsmasterPro
      Chrome Chrome
      2.0
      Chrome Chrome
      Metal
      -1.0
      -1.0
    
    
      222464
      222465
      Stanley-National Hardware 4-1/2 in. Template H...
      207497
      NaN
      hardware template
      Whether for home, farm, builder or industrial ...
      Stanley-National Hardware
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      223589
      223590
      Prime-Line 3/8 in. Bi-Fold Door Pivot Set
      208443
      NaN
      bi fold door hardware
      This Prime-Line Products 3/8 in. Bifold Door P...
      Prime-Line
      
      0.0
      
      Plastic
      -1.0
      -1.0
    
    
      223642
      223643
      Barton Kramer Bi-Fold Jamb Bracket Closet Door...
      208491
      NaN
      closet hardware
      This bi-fold closet door jamb bracket is desig...
      Barton Kramer
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      223643
      223644
      Barton Kramer Bi-Fold Jamb Bracket Closet Door...
      208491
      NaN
      hardware brackets
      This bi-fold closet door jamb bracket is desig...
      Barton Kramer
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      223774
      223775
      Stainless Glide Stainless Steel Top Mount Roll...
      208596
      NaN
      stainless steel hardware
      The Stainless Glide Stainless Steel Rolling Do...
      Stainless Glide
      Stainless Steel
      1.0
      Stainless Steel
      Stainless Steel
      -1.0
      -1.0
    
    
      224771
      224772
      Canine Hardware 30 in. x 39 in. Pet Travel Bed
      209453
      NaN
      murphy bed hardware
      This travel bed's plush look and feel will als...
      Canine Hardware
      
      0.0
      
      Other
      -1.0
      -1.0
    
    
      226311
      226312
      Martha Stewart Living Bedford 3 in. Nickel Can...
      210797
      NaN
      hardware
      The Martha Stewart Living 3 in. Bedford Nickel...
      Martha Stewart Living
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      226361
      226362
      Prime-Line Bi-Fold Door Top Guide Wheel, 5/8 i...
      210841
      NaN
      bi fold door hardware
      This bi-fold door pivot is constructed from st...
      Prime-Line
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      226826
      226827
      Stainless Glide Stainless Steel Dual Wheel Str...
      211265
      NaN
      stainless steel hardware
      The Stainless Glide Stainless Steel Rolling Do...
      Stainless Glide
      Stainless Steel
      1.0
      Stainless Steel
      Stainless Steel
      -1.0
      -1.0
    
    
      226881
      226882
      Prime-Line Solid Brass Pocket Door Combination...
      211314
      NaN
      pocket door hardware
      This mortise latch unit features recessed grip...
      Prime-Line
      
      0.0
      
      Solid Brass
      -1.0
      -1.0
    
    
      227150
      227151
      Young House Love 3 in. Vintage Style Cocoa Bro...
      211554
      NaN
      liberty campaign hardware
      From Liberty Hardware and Young House Love, th...
      Young House Love
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      228862
      228863
      Continental Home Hardware 1-3/4 in. Satin Nick...
      213100
      NaN
      3/4' hardware
      Thomasville  Hardware brings a customized hard...
      Continental Home Hardware
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      229742
      229743
      Stanley-National Hardware 6 in. Professional C...
      213907
      NaN
      gate hardware kit
      Stanley-National Hardware has been a leading m...
      Stanley-National Hardware
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      229999
      230000
      Prime-Line 3/4 in. Flat Nylon Wheel Bi-Fold Do...
      214140
      NaN
      pocket door hardware
      This wardrobe door roller is constructed from ...
      Prime-Line
      
      0.0
      
      Other
      -1.0
      -1.0
    
    
      230047
      230048
      Kwikset Arlington Single Cylinder Antique Bras...
      214184
      NaN
      front door hardware
      Step up to designer styles and superior securi...
      Kwikset
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      230825
      230826
      PlayStar All Star Build It Yourself Gold Plays...
      214889
      NaN
      lumber hardware
      The All Star XP Gold Design has 12 sq. ft. of ...
      PlayStar
      
      0.0
      
      Galvanized Steel Metal Plastic/Metal
      -1.0
      -1.0
    
    
      231360
      231361
      Barton Kramer Johnson Hardware Bi-Fold Door Bo...
      215386
      NaN
      hardware brackets
      This bi-fold door bottom pivot and bracket is ...
      Barton Kramer
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      232910
      232911
      Prime-Line Closet Pole Sockets, 1-3/8 in., Pla...
      216836
      NaN
      closet hardware
      These pole sockets are constructed from sturdy...
      Prime-Line
      
      0.0
      
      Plastic
      -1.0
      -1.0
    
    
      233124
      233125
      Richelieu Hardware 4-15/16 in. Furniture Leg
      217037
      NaN
      murphy bed hardware
      An ideal way to update your furniture, the Ric...
      Richelieu Hardware
      Matte Black
      1.0
      Matte Black
      Other
      -1.0
      -1.0
    
    
      233302
      233303
      Hickory Hardware Studio Collection 1 in. Oil-R...
      217204
      NaN
      hickory hardware studio
      This Hickory hardware Studio Collection 1 in. ...
      Hickory Hardware
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      234873
      234874
      Liberty 3-3/4 in. Plaza Cabinet Hardware Pull
      218685
      NaN
      3/4' hardware
      The Liberty 3-3/4 in. Brushed Satin-Nickel Pla...
      Liberty
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      235233
      235234
      National Hardware Vinyl Fence Gate Kit in Whit...
      219029
      NaN
      vinyl fence hardware
      Whether for home, farm, builder or industrial ...
      National Hardware
      
      0.0
      
      Steel
      -1.0
      -1.0
    
    
      235587
      235588
      Richelieu Hardware 8 in. x 10 in. White Enamel...
      219367
      NaN
      hardware brackets
      Onward products offer unique and creative hard...
      Richelieu Hardware
      White White
      2.0
      White White
      Metal
      -1.0
      -1.0
    
    
      235701
      235702
      Liberty Cabinet Drawer Hardware Installation T...
      219477
      NaN
      hardware template
      This Liberty Cabinet Hardware Installation Tem...
      Liberty
      
      0.0
      
      Plastic
      -1.0
      -1.0
    
    
      236106
      236107
      Rustica Hardware 42 in. x 84 in. Modern Range ...
      219865
      NaN
      sliding cabinet door hardware
      As unique as your fingerprints are to you, so ...
      nobrand
      
      0.0
      
      
      -1.0
      -1.0
    
    
      237028
      237029
      Everbilt Heavy Duty 36 in. Pocket Door Frame Set
      220759
      NaN
      pocket door hardware
      The Everbilt 36 in. Heavy Duty Pocket Door Fra...
      Everbilt
      
      0.0
      
      Aluminum
      -1.0
      -1.0
    
    
      237151
      237152
      Hickory Hardware Studio 1-1/4 in. Oil Rubbed B...
      220878
      NaN
      hickory hardware studio
      Bold style and functionality are combined in t...
      Hickory Hardware
      
      0.0
      
      Metal
      -1.0
      -1.0
    
    
      237739
      237740
      Johnson Hardware 111FD Series 72 in. Track and...
      221455
      NaN
      bi fold door hardware
      The Johnson Hardware 111FD Series 72 in. Track...
      Johnson Hardware
      
      0.0
      
      Aluminum
      -1.0
      -1.0
    
    
      239861
      239862
      Liberty 1-1/4 in. Hollow Cabinet Hardware Knob
      223535
      NaN
      hardware knob
      The clean lines of this knob fit several desig...
      Liberty
      
      0.0
      
      Ceramic
      -1.0
      -1.0
    
  

742 rows × 13 columns



In [151]:

    
filter_str(df_attr, 'energy star certified', 'name')['value'].value_counts()









    Out[151]:





No     6939
Yes    1481
Name: value, dtype: int64

Brands



In [5]:

    
df_brand = df_attr[df_attr.name == 'MFG Brand Name'][['product_uid', 'value']].rename(columns={'value': 'brand'})

Bullets



In [7]:

    
bullet = dict()
bullet_count = dict()
df_attr['about_bullet'] = df_attr['name'].str.lower().str.contains('bullet')
for idx, row in df_attr[df_attr['about_bullet']].iterrows():
    pid = row['product_uid']
    value = row['value']
    bullet.setdefault(pid, '')
    bullet_count.setdefault(pid, 0)
    bullet[pid] = bullet[pid] + ' ' + str(value)
    bullet_count[pid] = bullet_count[pid] + 1
df_bullet = pd.DataFrame.from_dict(bullet, orient='index').reset_index()
df_bullet_count = pd.DataFrame.from_dict(bullet_count, orient='index').reset_index().astype(np.float)
df_bullet.columns = ['product_uid', 'bullet']
df_bullet_count.columns = ['product_uid', 'bullet_count']

Color



In [8]:

    
color = dict()
df_attr['about_color'] = df_attr['name'].str.lower().str.contains('color')
for idx, row in df_attr[df_attr['about_color']].iterrows():
    pid = row['product_uid']
    value = row['value']
    color.setdefault(pid, '')
    color[pid] = color[pid] + ' ' + str(value)
df_color = pd.DataFrame.from_dict(color, orient='index').reset_index()
df_color.columns = ['product_uid', 'color']

Material



In [9]:

    
material = dict()
df_attr['about_material'] = df_attr['name'].str.lower().str.contains('material')
for idx, row in df_attr[df_attr['about_material']].iterrows():
    pid = row['product_uid']
    value = row['value']
    material.setdefault(pid, '')
    material[pid] = material[pid] + ' ' + str(value)
df_material = pd.DataFrame.from_dict(material, orient='index').reset_index()
df_material.columns = ['product_uid', 'material']

Commercial / Residential Flag



In [10]:

    
comres_index = df_attr['name'].str.lower().str.contains('commercial / residential')



In [11]:

    
df_attr[comres_index]['value'].value_counts()









    Out[11]:





Commercial / Residential    5011
Residential                 4337
Commercial                   182
Name: value, dtype: int64



In [12]:

    
flag_comres = dict()
df_attr['about_comres'] = df_attr['name'].str.lower().str.contains('commercial / residential')
for idx, row in df_attr[df_attr['about_comres']].iterrows():
    pid = row['product_uid']
    value = row['value']
    flag_comres.setdefault(pid, [0, 0])
    if 'Commercial' in str(value):
        flag_comres[pid][0] = 1
    if 'Residential' in str(value):
        flag_comres[pid][1] = 1
df_comres = pd.DataFrame.from_dict(flag_comres, orient='index').reset_index().astype(np.float)
df_comres.columns = ['product_uid', 'flag_commercial', 'flag_residential']

Indoor/Outdoor Flag



In [13]:

    
filter_str(df_attr, 'indoor/outdoor', 'name')['value'].value_counts()









    Out[13]:





Indoor                      7527
Indoor/Outdoor              3925
Outdoor                     1204
Indoor,Outdoor               256
Indoor/Outdoor (Covered)      47
Name: value, dtype: int64



In [14]:

    
flag_inoutdoor = dict()
df_attr['about_intoutdoor'] = df_attr['name'].str.lower().str.contains('indoor/outdoor')
for idx, row in df_attr[df_attr['about_intoutdoor']].iterrows():
    pid = row['product_uid']
    value = row['value']
    flag_inoutdoor.setdefault(pid, [0, 0])
    if 'Indoor' in str(value):
        flag_inoutdoor[pid][0] = 1
    if 'Outdoor' in str(value):
        flag_inoutdoor[pid][1] = 1
df_inoutdoor = pd.DataFrame.from_dict(flag_inoutdoor, orient='index').reset_index().astype(np.float)
df_inoutdoor.columns = ['product_uid', 'flag_indoor', 'flag_outdoor']



In [15]:

    
df_inoutdoor['flag_indoor'].value_counts()









    Out[15]:





1    11755
0     1203
Name: flag_indoor, dtype: int64

Energy Star Flag



In [16]:

    
filter_str(df_attr, 'energy star certified', 'name')['value'].value_counts()









    Out[16]:





No     6939
Yes    1481
Name: value, dtype: int64



In [17]:

    
flag_estar = dict()
df_attr['about_estar'] = df_attr['name'].str.lower().str.contains('energy star certified')
for idx, row in df_attr[df_attr['about_estar']].iterrows():
    pid = row['product_uid']
    value = row['value']
    flag_estar.setdefault(pid, 0)
    if 'Yes' in str(value):
        flag_estar[pid] = 1
df_estar = pd.DataFrame.from_dict(flag_estar, orient='index').reset_index().astype(np.float)
df_estar.columns = ['product_uid', 'flag_estar']



In [18]:

    
df_estar['flag_estar'].value_counts()









    Out[18]:





0    6939
1    1481
Name: flag_estar, dtype: int64

Join (this will rebuild df, besure to finish all the following operations before training)



In [19]:

    
df = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df = pd.merge(df, df_desp, how='left', on='product_uid')
df = pd.merge(df, df_brand, how='left', on='product_uid')
df = pd.merge(df, df_bullet, how='left', on='product_uid')
df = pd.merge(df, df_bullet_count, how='left', on='product_uid')
df = pd.merge(df, df_color, how='left', on='product_uid')
df = pd.merge(df, df_material, how='left', on='product_uid')
df = pd.merge(df, df_comres, how='left', on='product_uid')
df = pd.merge(df, df_inoutdoor, how='left', on='product_uid')
df = pd.merge(df, df_estar, how='left', on='product_uid')

Fill NAs



In [20]:

    
df['brand'].fillna('nobrand', inplace=True)
df['bullet'].fillna('', inplace=True)
df['bullet_count'].fillna(0, inplace=True)
df['color'].fillna('', inplace=True)
df['material'].fillna('', inplace=True)
df['flag_commercial'].fillna(-1, inplace=True)
df['flag_residential'].fillna(-1, inplace=True)
df['flag_indoor'].fillna(-1, inplace=True)
df['flag_outdoor'].fillna(-1, inplace=True)
df['flag_estar'].fillna(-1, inplace=True)

Relevance Distribution



In [21]:

    
sns.countplot(x='relevance', data=df)









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x17060ba8>



In [22]:

    
df['majority_relevance'] = df['relevance'].map(lambda x: x in [1.0, 1.33, 1.67, 2.0, 2.33, 2.67, 3.0])
def majoritize(df):
    return df[df['majority_relevance'] == 1]

External Data Utilization

Fix Typos



In [23]:

    
df['search_term'] = df['search_term'].map(correct_typo)

Pre-Stemming Attributes Features



In [24]:

    
df['match_commercial'] = (df['search_term'].str.lower().str.contains('commercial') & df['flag_commercial']).astype(np.float)



In [25]:

    
sum(df['match_commercial'])









    Out[25]:





350.0



In [26]:

    
df['match_residential'] = (df['search_term'].str.lower().str.contains('residential') & df['flag_residential']).astype(np.float)



In [27]:

    
sum(df['match_residential'])









    Out[27]:





56.0



In [28]:

    
def filter_estar(df):
    return df['search_term'].str.lower().str.contains('energy star') |\
    df['search_term'].str.lower().str.contains('energy efficient')



In [29]:

    
df['match_estar'] = (filter_estar(df) & df['flag_residential']).astype(np.float)



In [30]:

    
sum(df['match_estar'])









    Out[30]:





60.0



In [31]:

    
df['match_indoor'] = (df['search_term'].str.lower().str.contains('indoor') & df['flag_indoor']).astype(np.float)



In [32]:

    
sum(df['match_indoor'])









    Out[32]:





591.0



In [33]:

    
df['match_outdoor'] = (df['search_term'].str.lower().str.contains('outdoor') & df['flag_outdoor']).astype(np.float)



In [34]:

    
sum(df['match_outdoor'])









    Out[34]:





3509.0



In [35]:

    
df['match_outdoor'].describe()









    Out[35]:





count    240760.000000
mean          0.014575
std           0.119843
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: match_outdoor, dtype: float64

Stemming & Tokenizing



In [36]:

    
df['search_term'] = df['search_term'].map(lambda x: str_stem(x))
df['product_title'] = df['product_title'].map(lambda x: str_stem(x))
df['product_description'] = df['product_description'].map(lambda x: str_stem(x))
df['brand'] = df['brand'].map(lambda x: str_stem(x))
df['bullet'] = df['bullet'].map(lambda x: str_stem(x))



In [37]:

    
df['color'] = df['color'].map(lambda x: str_stem(x))
df['material'] = df['material'].map(lambda x: str_stem(x))



In [38]:

    
df['tokens_search_term'] = df['search_term'].map(lambda x: x.split())
df['tokens_product_title'] = df['product_title'].map(lambda x: x.split())
df['tokens_product_description'] = df['product_description'].map(lambda x: x.split())
df['tokens_brand'] = df['brand'].map(lambda x: x.split())
df['tokens_bullet'] = df['bullet'].map(lambda x: x.split())

Using a proper tokenizer is much slower and does not bring substantial improvment.



In [ ]:

    
# df['tokens_search_term'] = df['search_term'].map(lambda x: word_tokenize(x))
# df['tokens_product_title'] = df['product_title'].map(lambda x: word_tokenize(x))
# df['tokens_product_description'] = df['product_description'].map(lambda x: word_tokenize(x))
# df['tokens_brand'] = df['brand'].map(lambda x: word_tokenize(x))

Meta-Features

Length



In [45]:

    
df['len_search_term'] = df['tokens_search_term'].map(lambda x: len(x))
df['len_product_title'] = df['tokens_product_title'].map(lambda x: len(x))
df['len_product_description'] = df['tokens_product_description'].map(lambda x: len(x))
df['len_brand'] = df['tokens_brand'].map(lambda x: len(x))
df['len_bullet'] = df['tokens_bullet'].map(lambda x: len(x))

Post-Stemming Attributes Features



In [39]:

    
def match_color(st, colors):
    for w in st:
        if w in colors:
            return True
    return False



In [40]:

    
df['match_color'] = df.apply(lambda x: match_color(x['tokens_search_term'], x['color']), axis=1).astype(np.float)



In [41]:

    
sum(df['match_color'])









    Out[41]:





19251.0



In [42]:

    
def match_material(st, materials):
    for w in st:
        if w in materials:
            return True
    return False



In [43]:

    
df['match_material'] = df.apply(lambda x: match_material(x['tokens_search_term'], x['material']), axis=1).astype(np.float)



In [44]:

    
sum(df['match_material'])









    Out[44]:





13907.0

Flag & Count & Ratio



In [64]:

    
df['flag_st_in_pt'] = df.apply(lambda x: int(x['search_term'] in x['product_title']), axis=1)
df['flag_st_in_pd'] = df.apply(lambda x: int(x['search_term'] in x['product_description']), axis=1)
df['flag_st_in_br'] = df.apply(lambda x: int(x['search_term'] in x['brand']), axis=1)
df['flag_st_in_bl'] = df.apply(lambda x: int(x['search_term'] in x['bullet']), axis=1)



In [65]:

    
df['num_st_in_pt'] = \
    df.apply(lambda x: len(set(x['tokens_search_term']).intersection(set(x['tokens_product_title']))), axis=1)
df['num_st_in_pd'] = \
    df.apply(lambda x: len(set(x['tokens_search_term']).intersection(set(x['tokens_product_description']))), axis=1)
df['num_st_in_br'] = \
    df.apply(lambda x: len(set(x['tokens_search_term']).intersection(set(x['tokens_brand']))), axis=1)
df['num_st_in_bl'] = \
    df.apply(lambda x: len(set(x['tokens_search_term']).intersection(set(x['tokens_bullet']))), axis=1)



In [66]:

    
df['ratio_st_in_pt'] = \
    df.apply(lambda x: x['num_st_in_pt'] / float(x['len_search_term']), axis=1)
df['ratio_st_in_pd'] = \
    df.apply(lambda x: x['num_st_in_pd'] / float(x['len_search_term']), axis=1)
df['ratio_st_in_br'] = \
    df.apply(lambda x: x['num_st_in_br'] / float(x['len_search_term']), axis=1)
df['ratio_st_in_bl'] = \
    df.apply(lambda x: x['num_st_in_bl'] / float(x['len_search_term']), axis=1)



In [75]:

    
sns.set_palette("husl")



In [76]:

    
sns.boxplot(x='relevance', y='ratio_st_in_pt', data=majoritize(df))









    Out[76]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a154240>



In [77]:

    
sns.boxplot(x='relevance', y='ratio_st_in_pd', data=majoritize(df))









    Out[77]:





<matplotlib.axes._subplots.AxesSubplot at 0x1a46ffd0>



In [78]:

    
# not very useful
sns.boxplot(x='relevance', y='ratio_st_in_br', data=majoritize(df))









    Out[78]:





<matplotlib.axes._subplots.AxesSubplot at 0x1ab36b38>



In [79]:

    
# not very useful
sns.boxplot(x='relevance', y='ratio_st_in_bl', data=majoritize(df))









    Out[79]:





<matplotlib.axes._subplots.AxesSubplot at 0x1b161dd8>

Positioned Word Matching



In [46]:

    
df['len_search_term'].max()









    Out[46]:





12



In [170]:

    
def match_pos(row, col, pos):
    if pos >= row['len_search_term'] or pos >= row['len_'+col]:
        return 0
    else:
        return int(row['tokens_search_term'][pos] in row[col])



In [171]:

    
for i in range(10):
    df[str(i)+'th_word_in_pt'] = df.apply(lambda x: match_pos(x, 'product_title', i), axis=1)



In [172]:

    
for i in range(10):
    df[str(i)+'th_word_in_pd'] = df.apply(lambda x: match_pos(x, 'product_description', i), axis=1)



In [173]:

    
for i in range(10):
    df[str(i)+'th_word_in_bl'] = df.apply(lambda x: match_pos(x, 'bullet', i), axis=1)

Encode Brand Feature



In [80]:

    
brands = pd.unique(df.brand.ravel())
brand_encoder = {}
index = 1000
for brand in brands:
    brand_encoder[brand] = index
    index += 10
brand_encoder['nobrand'] = 500
df['brand_encoded'] = df['brand'].map(lambda x: brand_encoder.get(x, 500))



In [81]:

    
pid_with_attr_material = pd.unique(df_material.product_uid.ravel())
material_encoder = {}
for pid in pid_with_attr_material:
    material_encoder[pid] = 1
df['flag_attr_has_material'] = df['product_uid'].map(lambda x: material_encoder.get(x, 0)).astype(np.float)



In [82]:

    
pid_with_attr_color = pd.unique(df_color.product_uid.ravel())
color_encoder = {}
for pid in pid_with_attr_color:
    color_encoder[pid] = 1
df['flag_attr_has_color'] = df['product_uid'].map(lambda x: color_encoder.get(x, 0)).astype(np.float)

Encode Attributes Feature



In [83]:

    
pids_with_attr = pd.unique(df_attr.product_uid.ravel())
attr_encoder = {}
for pid in pids_with_attr:
    attr_encoder[pid] = 1
df['flag_has_attr'] = df['product_uid'].map(lambda x: attr_encoder.get(x, 0)).astype(np.float)



In [85]:

    
sns.boxplot(x='flag_has_attr', y='relevance', data=majoritize(df))









    Out[85]:





<matplotlib.axes._subplots.AxesSubplot at 0x1bad6ba8>

Distance Metrics

BOW



In [87]:

    
from sklearn.feature_extraction.text import CountVectorizer



In [115]:

    
cv = CountVectorizer(stop_words='english', max_features=1000)



In [116]:

    
cv.fit(df['search_term'] + ' ' + df['product_title'] + ' ' + df['product_description'] + ' ' + df['bullet'])









    Out[116]:





CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)



In [122]:

    
cv_of_st = cv.transform(df['search_term'])
cv_of_pt = cv.transform(df['product_title'])
cv_of_pd = cv.transform(df['product_description'])
cv_of_bl = cv.transform(df['bullet'])

BOW based Cosine Similarity



In [101]:

    
from sklearn.metrics.pairwise import cosine_similarity



In [129]:

    
cv_cos_sim_st_pt = [cosine_similarity(cv_of_st[i], cv_of_pt[i])[0][0] for i in range(cv_of_st.shape[0])]
cv_cos_sim_st_pd = [cosine_similarity(cv_of_st[i], cv_of_pd[i])[0][0] for i in range(cv_of_st.shape[0])]
cv_cos_sim_st_bl = [cosine_similarity(cv_of_st[i], cv_of_bl[i])[0][0] for i in range(cv_of_st.shape[0])]



In [132]:

    
df['cv_cos_sim_st_pt'] = cv_cos_sim_st_pt
df['cv_cos_sim_st_pd'] = cv_cos_sim_st_pd
df['cv_cos_sim_st_bl'] = cv_cos_sim_st_bl



In [128]:

    
sns.boxplot(x='relevance', y='cv_cos_sim_st_pt', data=majoritize(df))









    Out[128]:





<matplotlib.axes._subplots.AxesSubplot at 0x10bfccc88>



In [133]:

    
sns.boxplot(x='relevance', y='cv_cos_sim_st_pd', data=majoritize(df))









    Out[133]:





<matplotlib.axes._subplots.AxesSubplot at 0x10ce62748>



In [134]:

    
sns.boxplot(x='relevance', y='cv_cos_sim_st_bl', data=majoritize(df))









    Out[134]:





<matplotlib.axes._subplots.AxesSubplot at 0x111ed0550>

TF-IDF



In [135]:

    
from sklearn.feature_extraction.text import TfidfVectorizer



In [136]:

    
tiv = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', max_features=1000)
tiv.fit(df['search_term'] + ' ' + df['product_title'] + ' ' + df['product_description'] + ' ' + df['bullet'])









    Out[136]:





TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)



In [137]:

    
tiv_of_st = tiv.transform(df['search_term'])
tiv_of_pt = tiv.transform(df['product_title'])
tiv_of_pd = tiv.transform(df['product_description'])
tiv_of_bl = tiv.transform(df['bullet'])

TF-IDF based Cosine Similarity



In [ ]:

    
tiv_cos_sim_st_pt = [cosine_similarity(tiv_of_st[i], tiv_of_pt[i])[0][0] for i in range(tiv_of_st.shape[0])]
tiv_cos_sim_st_pd = [cosine_similarity(tiv_of_st[i], tiv_of_pd[i])[0][0] for i in range(tiv_of_st.shape[0])]
tiv_cos_sim_st_bl = [cosine_similarity(tiv_of_st[i], tiv_of_bl[i])[0][0] for i in range(tiv_of_st.shape[0])]



In [ ]:

    
df['tiv_cos_sim_st_pt'] = tiv_cos_sim_st_pt
df['tiv_cos_sim_st_pd'] = tiv_cos_sim_st_pd
df['tiv_cos_sim_st_bl'] = tiv_cos_sim_st_bl



In [ ]:

    
sns.boxplot(x='relevance', y='tiv_cos_sim_st_pt', data=majoritize(df))



In [ ]:

    
sns.boxplot(x='relevance', y='tiv_cos_sim_st_pd', data=majoritize(df))



In [ ]:

    
sns.boxplot(x='relevance', y='tiv_cos_sim_st_bl', data=majoritize(df))

Jaccard Similarity



In [138]:

    
def jaccard(A, B):
    C = A.intersection(B)
    return float(len(C)) / (len(A) + len(B) - len(C))



In [139]:

    
df['jaccard_st_pt'] = df.apply(lambda x: jaccard(set(x['tokens_search_term']), set(x['tokens_product_title'])), axis=1)
df['jaccard_st_pd'] = df.apply(lambda x: jaccard(set(x['tokens_search_term']), set(x['tokens_product_description'])), axis=1)    
df['jaccard_st_br'] = df.apply(lambda x: jaccard(set(x['tokens_search_term']), set(x['tokens_brand'])), axis=1)
df['jaccard_st_bl'] = df.apply(lambda x: jaccard(set(x['tokens_search_term']), set(x['tokens_bullet'])), axis=1)



In [140]:

    
sns.boxplot(x='relevance', y='jaccard_st_pt', data=majoritize(df))









    Out[140]:





<matplotlib.axes._subplots.AxesSubplot at 0x1954d6748>



In [141]:

    
sns.boxplot(x='relevance', y='jaccard_st_pd', data=majoritize(df))









    Out[141]:





<matplotlib.axes._subplots.AxesSubplot at 0x14f953ef0>



In [142]:

    
sns.boxplot(x='relevance', y='jaccard_st_br', data=majoritize(df))









    Out[142]:





<matplotlib.axes._subplots.AxesSubplot at 0x191a0dac8>



In [143]:

    
sns.boxplot(x='relevance', y='jaccard_st_bl', data=majoritize(df))









    Out[143]:





<matplotlib.axes._subplots.AxesSubplot at 0x160b02c50>

Edit Distance



In [154]:

    
from nltk.metrics import edit_distance



In [186]:

    
def calc_edit_distance(row, col):
    dists = [min([edit_distance(w, x) for x in row['tokens_'+col]]) for w in row['tokens_search_term']]
    return (min(dists), sum(dists)) if dists



In [187]:

    
df['edit_dist_st_pt_raw'] = df.apply(lambda x: calc_edit_distance(x, 'product_title'), axis=1)



In [189]:

    
df['edit_dist_st_pt_min'] = df['edit_dist_st_pt_raw'].map(lambda x: x[0])
df['edit_dist_st_pt_avg'] = df['edit_dist_st_pt_raw'].map(lambda x: x[1]) / df['len_search_term']



In [194]:

    
sns.boxplot(x='relevance', y='edit_dist_st_pt_avg', data=majoritize(df))









    Out[194]:





<matplotlib.axes._subplots.AxesSubplot at 0x163222cf8>



In [ ]:

    
df['edit_dist_st_pd_raw'] = df.apply(lambda x: calc_edit_distance(x, 'product_description'), axis=1)



In [214]:

    
df['edit_dist_st_pd_min'] = df['edit_dist_st_pd_raw'].map(lambda x: x[0])
df['edit_dist_st_pd_avg'] = df['edit_dist_st_pd_raw'].map(lambda x: x[1]) / df['len_search_term']



In [215]:

    
sns.boxplot(x='relevance', y='edit_dist_st_pd_avg', data=majoritize(df))









    Out[215]:





<matplotlib.axes._subplots.AxesSubplot at 0x171a6d6a0>



In [216]:

    
df.drop(['edit_dist_st_pt_raw', 'edit_dist_st_pd_raw'], axis=1, inplace=True)



In [ ]:

    
# df['edit_dist_st_bl_raw'] = df.apply(lambda x: calc_edit_distance(x, 'bullet'), axis=1)
# df['edit_dist_st_br_raw'] = df.apply(lambda x: calc_edit_distance(x, 'brand'), axis=1)
# df['edit_dist_st_bl_min'] = df['edit_dist_st_bl_raw'].map(lambda x: x[0])
# df['edit_dist_st_bl_avg'] = df['edit_dist_st_bl_raw'].map(lambda x: x[1]) / df['len_search_term']
# df['edit_dist_st_br_min'] = df['edit_dist_st_br_raw'].map(lambda x: x[0])
# df['edit_dist_st_br_avg'] = df['edit_dist_st_br_raw'].map(lambda x: x[1]) / df['len_search_term']

Latent Semantic Space

By SVD-decomposing BOW / TF-IDF matrix, we obtain features that can be used to capture different query/product groups.



In [144]:

    
from sklearn.decomposition import TruncatedSVD



In [145]:

    
tsvd = TruncatedSVD(n_components=10, random_state=2016)

tSVD for BOW



In [146]:

    
st_bow_tsvd = tsvd.fit_transform(cv_of_st)
for i in range(st_bow_tsvd.shape[1]):
    df['st_bow_tsvd'+str(i)] = st_bow_tsvd[:,i]



In [147]:

    
pt_bow_tsvd = tsvd.fit_transform(cv_of_pt)
for i in range(pt_bow_tsvd.shape[1]):
    df['pt_bow_tsvd'+str(i)] = pt_bow_tsvd[:,i]



In [148]:

    
pd_bow_tsvd = tsvd.fit_transform(cv_of_pd)
for i in range(pd_bow_tsvd.shape[1]):
    df['pd_bow_tsvd'+str(i)] = pd_bow_tsvd[:,i]



In [149]:

    
bl_bow_tsvd = tsvd.fit_transform(cv_of_bl)
for i in range(bl_bow_tsvd.shape[1]):
    df['bl_bow_tsvd'+str(i)] = bl_bow_tsvd[:,i]

tSVD for TF-IDF



In [150]:

    
st_tfidf_tsvd = tsvd.fit_transform(tiv_of_st)
for i in range(st_tfidf_tsvd.shape[1]):
    df['st_tfidf_tsvd_'+str(i)] = st_tfidf_tsvd[:,i]



In [151]:

    
pt_tfidf_tsvd = tsvd.fit_transform(tiv_of_pt)
for i in range(pt_tfidf_tsvd.shape[1]):
    df['pt_tfidf_tsvd_'+str(i)] = pt_tfidf_tsvd[:,i]



In [152]:

    
pd_tfidf_tsvd = tsvd.fit_transform(tiv_of_pd)
for i in range(pd_tfidf_tsvd.shape[1]):
    df['pd_tfidf_tsvd_'+str(i)] = pd_tfidf_tsvd[:,i]



In [153]:

    
bl_tfidf_tsvd = tsvd.fit_transform(tiv_of_bl)
for i in range(bl_tfidf_tsvd.shape[1]):
    df['bl_tfidf_tsvd_'+str(i)] = bl_tfidf_tsvd[:,i]

Append

This part is due to some mess I made. Just ignore it.



In [156]:

    
append = pd.read_csv('df_lev_dist_more_jaccard.csv', encoding='ISO-8859-1')



In [157]:

    
cols_to_append = [
    'query_in_title',
    'query_in_description',
    'query_last_word_in_title',
    'query_last_word_in_description',
    'word_in_title',
    'word_in_description',
    'word_in_brand',
    'ratio_title',
    'ratio_description',
    'ratio_brand',
    'lev_dist_to_product_title_min',
    'lev_dist_to_product_title_max',
    'lev_dist_to_product_title_sum',
    'lev_dist_to_product_description_min',
    'lev_dist_to_product_description_max',
    'lev_dist_to_product_description_sum'
]



In [158]:

    
for x in cols_to_append:
    df['old_'+x] = append[x]

Export



In [217]:

    
cols_to_drop = [
    #'product_uid',
    'search_term',
    'product_title',
    'product_description',
    'brand',
    'bullet',
    'color',
    'material',    
    'tokens_search_term',
    'tokens_product_title',
    'tokens_product_description',
    'tokens_brand',
    'tokens_bullet',
    'majority_relevance'
]
export_df = df.drop(cols_to_drop, axis=1)
print('Number of Features: ', len(export_df.columns.tolist()) - 2)









    



Number of Features:  172



In [218]:

    
export_df.head(3)









    Out[218]:






  
    
      
      id
      product_uid
      relevance
      bullet_count
      flag_commercial
      flag_residential
      flag_indoor
      flag_outdoor
      flag_estar
      match_commercial
      match_residential
      match_estar
      match_indoor
      match_outdoor
      match_color
      match_material
      len_search_term
      len_product_title
      len_product_description
      len_brand
      len_bullet
      0th_word_in_pd
      1th_word_in_pd
      2th_word_in_pd
      3th_word_in_pd
      4th_word_in_pd
      5th_word_in_pd
      6th_word_in_pd
      7th_word_in_pd
      8th_word_in_pd
      9th_word_in_pd
      0th_word_in_bl
      1th_word_in_bl
      2th_word_in_bl
      3th_word_in_bl
      4th_word_in_bl
      5th_word_in_bl
      6th_word_in_bl
      7th_word_in_bl
      8th_word_in_bl
      9th_word_in_bl
      0th_word_in_pt
      1th_word_in_pt
      2th_word_in_pt
      3th_word_in_pt
      4th_word_in_pt
      5th_word_in_pt
      6th_word_in_pt
      7th_word_in_pt
      8th_word_in_pt
      9th_word_in_pt
      flag_st_in_pt
      flag_st_in_pd
      flag_st_in_br
      flag_st_in_bl
      num_st_in_pt
      num_st_in_pd
      num_st_in_br
      num_st_in_bl
      ratio_st_in_pt
      ratio_st_in_pd
      ratio_st_in_br
      ratio_st_in_bl
      brand_encoded
      flag_attr_has_material
      flag_attr_has_color
      flag_has_attr
      cv_cos_sim_st_pt
      cv_cos_sim_st_pd
      cv_cos_sim_st_bl
      jaccard_st_pt
      jaccard_st_pd
      jaccard_st_br
      jaccard_st_bl
      st_bow_tsvd0
      st_bow_tsvd1
      st_bow_tsvd2
      st_bow_tsvd3
      st_bow_tsvd4
      st_bow_tsvd5
      st_bow_tsvd6
      st_bow_tsvd7
      st_bow_tsvd8
      st_bow_tsvd9
      pt_bow_tsvd0
      pt_bow_tsvd1
      pt_bow_tsvd2
      pt_bow_tsvd3
      pt_bow_tsvd4
      pt_bow_tsvd5
      pt_bow_tsvd6
      pt_bow_tsvd7
      pt_bow_tsvd8
      pt_bow_tsvd9
      pd_bow_tsvd0
      pd_bow_tsvd1
      pd_bow_tsvd2
      pd_bow_tsvd3
      pd_bow_tsvd4
      pd_bow_tsvd5
      pd_bow_tsvd6
      pd_bow_tsvd7
      pd_bow_tsvd8
      pd_bow_tsvd9
      bl_bow_tsvd0
      bl_bow_tsvd1
      bl_bow_tsvd2
      bl_bow_tsvd3
      bl_bow_tsvd4
      bl_bow_tsvd5
      bl_bow_tsvd6
      bl_bow_tsvd7
      bl_bow_tsvd8
      bl_bow_tsvd9
      st_tfidf_tsvd_0
      st_tfidf_tsvd_1
      st_tfidf_tsvd_2
      st_tfidf_tsvd_3
      st_tfidf_tsvd_4
      st_tfidf_tsvd_5
      st_tfidf_tsvd_6
      st_tfidf_tsvd_7
      st_tfidf_tsvd_8
      st_tfidf_tsvd_9
      pt_tfidf_tsvd_0
      pt_tfidf_tsvd_1
      pt_tfidf_tsvd_2
      pt_tfidf_tsvd_3
      pt_tfidf_tsvd_4
      pt_tfidf_tsvd_5
      pt_tfidf_tsvd_6
      pt_tfidf_tsvd_7
      pt_tfidf_tsvd_8
      pt_tfidf_tsvd_9
      pd_tfidf_tsvd_0
      pd_tfidf_tsvd_1
      pd_tfidf_tsvd_2
      pd_tfidf_tsvd_3
      pd_tfidf_tsvd_4
      pd_tfidf_tsvd_5
      pd_tfidf_tsvd_6
      pd_tfidf_tsvd_7
      pd_tfidf_tsvd_8
      pd_tfidf_tsvd_9
      bl_tfidf_tsvd_0
      bl_tfidf_tsvd_1
      bl_tfidf_tsvd_2
      bl_tfidf_tsvd_3
      bl_tfidf_tsvd_4
      bl_tfidf_tsvd_5
      bl_tfidf_tsvd_6
      bl_tfidf_tsvd_7
      bl_tfidf_tsvd_8
      bl_tfidf_tsvd_9
      old_query_in_title
      old_query_in_description
      old_query_last_word_in_title
      old_query_last_word_in_description
      old_word_in_title
      old_word_in_description
      old_word_in_brand
      old_ratio_title
      old_ratio_description
      old_ratio_brand
      old_lev_dist_to_product_title_min
      old_lev_dist_to_product_title_max
      old_lev_dist_to_product_title_sum
      old_lev_dist_to_product_description_min
      old_lev_dist_to_product_description_max
      old_lev_dist_to_product_description_sum
      edit_dist_st_pt_min
      edit_dist_st_pt_avg
      edit_dist_st_pd_min
      edit_dist_st_pd_avg
    
  
  
    
      0
      2
      100001
      3.0
      7
      -1
      -1
      -1
      -1
      -1
      0
      0
      0
      0
      0
      0
      0
      2
      6
      135
      3
      58
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      1
      0
      1
      0.5
      0.5
      0
      0.5
      1000
      1
      0
      1
      0.316228
      0.197814
      0.096225
      0.142857
      0.010101
      0
      0.020833
      0.003626
      0.004019
      0.000797
      0.006102
      0.008658
      -0.000548
      0.006355
      -0.002796
      0.007951
      0.005769
      0.045605
      0.024936
      0.026892
      0.028064
      0.049714
      0.092839
      -0.078192
      -0.067007
      0.050241
      0.049224
      3.313323
      -1.080023
      0.246664
      0.035078
      1.213365
      0.344366
      -0.339457
      -0.778180
      1.311161
      -0.855836
      2.016449
      -0.805119
      1.716349
      -0.538509
      1.614636
      0.482738
      0.415191
      -0.226891
      -0.103242
      -0.652989
      0.003235
      0.001598
      0.004377
      0.004809
      0.003127
      0.001640
      -0.000262
      0.010862
      0.003171
      -0.000317
      0.040224
      -0.002031
      -0.002297
      -0.031202
      0.025376
      0.031313
      0.013203
      0.000737
      -0.007120
      0.007481
      0.224923
      -0.004951
      0.010509
      -0.007065
      -0.100198
      -0.059775
      -0.120487
      -0.041719
      0.115705
      -0.026294
      0.190741
      -0.004078
      0.002348
      -0.025698
      -0.087801
      0.007791
      -0.176249
      -0.059840
      0.028053
      -0.022842
      0
      0
      0
      0
      1
      1
      0
      0.5
      0.5
      0.00
      3
      7
      63
      3
      16
      321
      0
      3.0
      0
      2.0
    
    
      1
      3
      100001
      2.5
      7
      -1
      -1
      -1
      -1
      -1
      0
      0
      0
      0
      0
      0
      1
      2
      6
      135
      3
      58
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0.0
      0.0
      0
      0.0
      1000
      1
      0
      1
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0
      0.000000
      0.001644
      0.001633
      0.001574
      0.003842
      0.002912
      0.001569
      0.005649
      -0.003359
      0.003245
      0.004215
      0.045605
      0.024936
      0.026892
      0.028064
      0.049714
      0.092839
      -0.078192
      -0.067007
      0.050241
      0.049224
      3.313323
      -1.080023
      0.246664
      0.035078
      1.213365
      0.344366
      -0.339456
      -0.778180
      1.311161
      -0.855836
      2.016449
      -0.805119
      1.716349
      -0.538509
      1.614636
      0.482739
      0.415191
      -0.226891
      -0.103242
      -0.652989
      0.001812
      0.002332
      0.003688
      0.001637
      0.004279
      0.001851
      0.000037
      0.011753
      0.004452
      -0.000231
      0.040224
      -0.002031
      -0.002297
      -0.031202
      0.025376
      0.031313
      0.013203
      0.000737
      -0.007120
      0.007481
      0.224923
      -0.004951
      0.010509
      -0.007065
      -0.100198
      -0.059775
      -0.120487
      -0.041719
      0.115705
      -0.026294
      0.190741
      -0.004078
      0.002348
      -0.025698
      -0.087801
      0.007791
      -0.176249
      -0.059840
      0.028053
      -0.022842
      0
      0
      0
      0
      1
      1
      0
      0.5
      0.5
      0.00
      3
      7
      67
      3
      18
      337
      2
      4.0
      1
      2.5
    
    
      2
      9
      100002
      3.0
      10
      -1
      -1
      -1
      -1
      -1
      0
      0
      0
      0
      0
      0
      0
      2
      12
      169
      4
      109
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0.0
      0.5
      0
      0.5
      1010
      0
      1
      1
      0.000000
      0.243332
      0.212000
      0.000000
      0.008621
      0
      0.012048
      0.006960
      0.000900
      0.005602
      0.005818
      -0.000712
      0.003775
      -0.004111
      0.010913
      0.014781
      -0.027090
      0.052576
      0.029210
      0.058717
      -0.054237
      0.038212
      0.104605
      0.018579
      -0.144403
      -0.038084
      0.025078
      3.088265
      -0.928897
      -0.779630
      -0.235617
      1.022412
      -6.070258
      -0.177288
      1.629826
      1.669294
      1.009680
      2.418942
      -0.412886
      1.744545
      -1.499353
      -3.124326
      -3.041471
      2.646309
      1.225488
      0.994887
      2.065224
      0.001645
      0.004407
      0.006753
      -0.001348
      0.007957
      0.000271
      -0.005846
      0.015219
      0.042325
      0.009750
      0.046026
      -0.003515
      0.005800
      -0.003947
      -0.017649
      0.021120
      0.043252
      0.012502
      0.010272
      -0.001656
      0.280601
      0.264614
      -0.309222
      0.395171
      0.065995
      0.230039
      0.065034
      -0.131033
      0.016887
      -0.094675
      0.256366
      -0.025741
      0.073359
      0.695745
      0.252360
      -0.039998
      -0.089243
      -0.013464
      0.089968
      0.028354
      0
      0
      0
      0
      1
      1
      1
      0.5
      0.5
      0.25
      2
      7
      120
      2
      18
      337
      2
      2.5
      0
      1.0



In [219]:

    
export_df.to_csv('./df.csv')



In [220]:

    
df.to_csv('./df_full.csv')

	id	product_title	product_uid	relevance	search_term	product_description	brand	bullet	bullet_count	color	material	flag_commercial	flag_residential
375	1208	Liberty 3-3/4 in. Steel Bar Pull (25-Pack)	100209	2.67	bathroom hardware knobs and pulls	Sleek and sophisticated, this design makes a c...	Liberty		0.0		Steel	-1.0	-1.0
1416	4355	Martha Stewart Living 3-3/4 in. Bar Cabinet Ha...	100748	2.33	3/4' hardware	The Martha Stewart Living Country 3-3/4 in. Po...	Martha Stewart Living		0.0		Metal	-1.0	-1.0
1989	6161	Defiant Hartford Satin Nickel Entry Knob	101061	2.67	door lock hardware	Featuring a lifetime guarantee, Defiant meets ...	Defiant		0.0		Stainless steel Metal	-1.0	-1.0
2180	6735	Martha Stewart Living 3-3/4 in. Dowel Cabinet ...	101158	3.00	3/4' hardware	The Martha Stewart Living Country 3-3/4 in. Be...	Martha Stewart Living		0.0		Metal	-1.0	-1.0
2295	7047	Oz-Post T4-850 4 in. Square Wood Post Anchor (...	101200	1.33	oz metal fence hardware	Oz-Post is one of the best ways to secure a wo...	Oz-Post		0.0		Metal	-1.0	-1.0
2337	7169	Liberty 2-3/4 in. or 3 in. Newton Cabinet Hard...	101223	2.00	3/4' hardware	The Liberty 2-3/4 or 3 in. Satin Nickel Dual-M...	Liberty		0.0		Metal	-1.0	-1.0
2339	7172	Liberty 2-3/4 in. or 3 in. Newton Cabinet Hard...	101223	3.00	kitchen cabinet drawer center-mount hardware	The Liberty 2-3/4 or 3 in. Satin Nickel Dual-M...	Liberty		0.0		Metal	-1.0	-1.0
2706	8370	HDX 4 ft. x 100 ft. 14-Gauge Welded Wire	101419	2.00	wire fences hardware	The HDX 4 ft. x 100 ft. Welded Wire is made of...	HDX	Silver	1.0	Silver	Metal	-1.0	-1.0
2965	9189	Liberty 2-1/2 in. or 3 in. Dark Oil Rubbed Bro...	101566	2.00	kitchen hardware	Use the Liberty Hardware 2-1/2 or 3 in. Dual M...	Liberty		0.0		Metal	-1.0	-1.0
2975	9221	Oz-Post Steel 2 Wood Fence Bracket Project Pac...	101571	2.33	wap around hardware	The WAP-238 from OZCO is a galvanized bracket ...	Oz-Post		0.0		Metal	-1.0	-1.0
2982	9244	HDX 1/4 in. x 2 ft. x 5 ft. Hardware Cloth	101575	2.00	wire fences hardware	A lightweight, flexible and economical wire me...	HDX	Silver	1.0	Silver	Metal	-1.0	-1.0
3300	10232	Schlage Plymouth Double Cylinder Antique Brass...	101755	3.00	interior door hardware by schlage	The Plymouth front entry features a simple cur...	Schlage		0.0		Solid Brass	-1.0	-1.0
3421	10576	Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...	101831	2.00	kitchen cabinet drawer center-mount hardware	Sleek and sophisticated, this design makes a c...	Liberty		0.0		Steel	-1.0	-1.0
3422	10577	Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...	101831	2.00	kitchen cabinte hardware blue knob	Sleek and sophisticated, this design makes a c...	Liberty		0.0		Steel	-1.0	-1.0
3423	10578	Liberty 6-2/7 in. Steel Bar Cabinet Hardware A...	101831	3.00	liberty campaign hardware	Sleek and sophisticated, this design makes a c...	Liberty		0.0		Steel	-1.0	-1.0
3775	11728	Veranda 1-1/2 oz. Vinyl Fence Cement	102046	2.67	vinyl fence hardware	Veranda PVC cement glue is specifically design...	Veranda		0.0		Metal	-1.0	-1.0
3792	11788	HDX 28 in. x 50 ft. Garden Fence	102057	2.00	wire fences hardware	If rabbits and other varmints are getting to y...	HDX	Silver	1.0	Silver	Metal	-1.0	-1.0
3933	12218	YARDGARD 5 ft. x 50 ft. 14-Gauge Vinyl Galvani...	102139	1.67	wire fences hardware	Welded wire is a general purpose fence providi...	YARDGARD	Green	1.0	Green	Metal	-1.0	-1.0
4159	13026	Tenax 3 ft. x 15 ft. Plastic Black Hardware Net	102276	2.33	wire fences hardware	Replaces metal hardware net under must applica...	Tenax	Black Black	2.0	Black Black	Plastic	-1.0	-1.0
4613	14452	Liberty 3-3/4 in. Steel Bar Cabinet Hardware Pull	102520	1.33	liberty campaign hardware	Sleek and sophisticated, this design makes a c...	Liberty		0.0		Steel	-1.0	-1.0
4844	15184	Oz-Post Steel 2 Wood Fence Bracket WAP-OZ	102651	2.67	oz metal fence hardware	The remarkable WAP-OZ fence bracket from OZCO ...	Oz-Post		0.0		Metal	-1.0	-1.0
5260	16453	Martha Stewart Living 3 in. Bedford Nickel Cyl...	102905	2.00	kitchen hardware	The Martha Stewart Living Modern 3 in. Bedford...	Martha Stewart Living		0.0		Metal	-1.0	-1.0
6770	21202	Hickory Hardware Oil-Rubbed Bronze Surface Sel...	103960	2.00	hickory hardware 469999035	Update your cabinetry with the Hickory Hardwar...	Hickory Hardware		0.0		Steel	-1.0	-1.0
6835	21419	Defiant Springfield Satin Nickel Mushroom Hand...	104017	3.00	front door hardware	Reinforce your entry door with the Defiant Spr...	Defiant		0.0		Solid Brass	-1.0	-1.0
6909	21626	HDX 1 in. x 4 ft. x 50 ft. Poultry Netting	104079	2.00	wire fences hardware	The HDX 1 in. x 4 ft. x 50 ft. 20-Gauge Galvan...	HDX		0.0		Galvanized Steel	-1.0	-1.0
6923	21680	Everbilt Anti-Sag Gate Kit	104092	2.00	fence gate hardware eyebolt	The Everbilt Anti-Sag Gate Kit is ideal to eli...	Everbilt		0.0		Steel	-1.0	-1.0
7209	22570	ClosetMaid Preloaded Wall Brackets for SuperSl...	104340	2.00	shelves wood hardware	This set of 2 ClosetMaid Wall Brackets is desi...	ClosetMaid	White White	2.0	White White	Resin	-1.0	-1.0
7363	23045	Veranda Aluminum Rail Bracket for Vinyl Fencin...	104469	3.00	vinyl fence hardware	Choose Veranda's 2 in. x 3 in. Fence Rail Brac...	Veranda		0.0		Metal	-1.0	-1.0
7461	23327	Veranda Shadowbox White Vinyl Fence Bracket Kit	104547	3.00	vinyl fence hardware	Veranda vinyl fencing is The Home Depot's prem...	Veranda		0.0		Metal	-1.0	-1.0
7475	23381	Liberty Satin Nickel 1-3/8 in. Large Football ...	104569	3.00	bathroom hardware knobs and pulls	Use the Liberty 1-3/8 in. Satin Nickel Large F...	nobrand		0.0			-1.0	-1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
220714	220324	PartsmasterPro Universal Designer Lever Handle...	205663	NaN	porcelain handle hardware	Replacing your faucets handles makes your fauc...	PartsmasterPro	Chrome Chrome	2.0	Chrome Chrome	Metal	-1.0	-1.0
222464	222465	Stanley-National Hardware 4-1/2 in. Template H...	207497	NaN	hardware template	Whether for home, farm, builder or industrial ...	Stanley-National Hardware		0.0		Steel	-1.0	-1.0
223589	223590	Prime-Line 3/8 in. Bi-Fold Door Pivot Set	208443	NaN	bi fold door hardware	This Prime-Line Products 3/8 in. Bifold Door P...	Prime-Line		0.0		Plastic	-1.0	-1.0
223642	223643	Barton Kramer Bi-Fold Jamb Bracket Closet Door...	208491	NaN	closet hardware	This bi-fold closet door jamb bracket is desig...	Barton Kramer		0.0		Steel	-1.0	-1.0
223643	223644	Barton Kramer Bi-Fold Jamb Bracket Closet Door...	208491	NaN	hardware brackets	This bi-fold closet door jamb bracket is desig...	Barton Kramer		0.0		Steel	-1.0	-1.0
223774	223775	Stainless Glide Stainless Steel Top Mount Roll...	208596	NaN	stainless steel hardware	The Stainless Glide Stainless Steel Rolling Do...	Stainless Glide	Stainless Steel	1.0	Stainless Steel	Stainless Steel	-1.0	-1.0
224771	224772	Canine Hardware 30 in. x 39 in. Pet Travel Bed	209453	NaN	murphy bed hardware	This travel bed's plush look and feel will als...	Canine Hardware		0.0		Other	-1.0	-1.0
226311	226312	Martha Stewart Living Bedford 3 in. Nickel Can...	210797	NaN	hardware	The Martha Stewart Living 3 in. Bedford Nickel...	Martha Stewart Living		0.0		Metal	-1.0	-1.0
226361	226362	Prime-Line Bi-Fold Door Top Guide Wheel, 5/8 i...	210841	NaN	bi fold door hardware	This bi-fold door pivot is constructed from st...	Prime-Line		0.0		Steel	-1.0	-1.0
226826	226827	Stainless Glide Stainless Steel Dual Wheel Str...	211265	NaN	stainless steel hardware	The Stainless Glide Stainless Steel Rolling Do...	Stainless Glide	Stainless Steel	1.0	Stainless Steel	Stainless Steel	-1.0	-1.0
226881	226882	Prime-Line Solid Brass Pocket Door Combination...	211314	NaN	pocket door hardware	This mortise latch unit features recessed grip...	Prime-Line		0.0		Solid Brass	-1.0	-1.0
227150	227151	Young House Love 3 in. Vintage Style Cocoa Bro...	211554	NaN	liberty campaign hardware	From Liberty Hardware and Young House Love, th...	Young House Love		0.0		Metal	-1.0	-1.0
228862	228863	Continental Home Hardware 1-3/4 in. Satin Nick...	213100	NaN	3/4' hardware	Thomasville Hardware brings a customized hard...	Continental Home Hardware		0.0		Metal	-1.0	-1.0
229742	229743	Stanley-National Hardware 6 in. Professional C...	213907	NaN	gate hardware kit	Stanley-National Hardware has been a leading m...	Stanley-National Hardware		0.0		Steel	-1.0	-1.0
229999	230000	Prime-Line 3/4 in. Flat Nylon Wheel Bi-Fold Do...	214140	NaN	pocket door hardware	This wardrobe door roller is constructed from ...	Prime-Line		0.0		Other	-1.0	-1.0
230047	230048	Kwikset Arlington Single Cylinder Antique Bras...	214184	NaN	front door hardware	Step up to designer styles and superior securi...	Kwikset		0.0		Metal	-1.0	-1.0
230825	230826	PlayStar All Star Build It Yourself Gold Plays...	214889	NaN	lumber hardware	The All Star XP Gold Design has 12 sq. ft. of ...	PlayStar		0.0		Galvanized Steel Metal Plastic/Metal	-1.0	-1.0
231360	231361	Barton Kramer Johnson Hardware Bi-Fold Door Bo...	215386	NaN	hardware brackets	This bi-fold door bottom pivot and bracket is ...	Barton Kramer		0.0		Steel	-1.0	-1.0
232910	232911	Prime-Line Closet Pole Sockets, 1-3/8 in., Pla...	216836	NaN	closet hardware	These pole sockets are constructed from sturdy...	Prime-Line		0.0		Plastic	-1.0	-1.0
233124	233125	Richelieu Hardware 4-15/16 in. Furniture Leg	217037	NaN	murphy bed hardware	An ideal way to update your furniture, the Ric...	Richelieu Hardware	Matte Black	1.0	Matte Black	Other	-1.0	-1.0
233302	233303	Hickory Hardware Studio Collection 1 in. Oil-R...	217204	NaN	hickory hardware studio	This Hickory hardware Studio Collection 1 in. ...	Hickory Hardware		0.0		Metal	-1.0	-1.0
234873	234874	Liberty 3-3/4 in. Plaza Cabinet Hardware Pull	218685	NaN	3/4' hardware	The Liberty 3-3/4 in. Brushed Satin-Nickel Pla...	Liberty		0.0		Metal	-1.0	-1.0
235233	235234	National Hardware Vinyl Fence Gate Kit in Whit...	219029	NaN	vinyl fence hardware	Whether for home, farm, builder or industrial ...	National Hardware		0.0		Steel	-1.0	-1.0
235587	235588	Richelieu Hardware 8 in. x 10 in. White Enamel...	219367	NaN	hardware brackets	Onward products offer unique and creative hard...	Richelieu Hardware	White White	2.0	White White	Metal	-1.0	-1.0
235701	235702	Liberty Cabinet Drawer Hardware Installation T...	219477	NaN	hardware template	This Liberty Cabinet Hardware Installation Tem...	Liberty		0.0		Plastic	-1.0	-1.0
236106	236107	Rustica Hardware 42 in. x 84 in. Modern Range ...	219865	NaN	sliding cabinet door hardware	As unique as your fingerprints are to you, so ...	nobrand		0.0			-1.0	-1.0
237028	237029	Everbilt Heavy Duty 36 in. Pocket Door Frame Set	220759	NaN	pocket door hardware	The Everbilt 36 in. Heavy Duty Pocket Door Fra...	Everbilt		0.0		Aluminum	-1.0	-1.0
237151	237152	Hickory Hardware Studio 1-1/4 in. Oil Rubbed B...	220878	NaN	hickory hardware studio	Bold style and functionality are combined in t...	Hickory Hardware		0.0		Metal	-1.0	-1.0
237739	237740	Johnson Hardware 111FD Series 72 in. Track and...	221455	NaN	bi fold door hardware	The Johnson Hardware 111FD Series 72 in. Track...	Johnson Hardware		0.0		Aluminum	-1.0	-1.0
239861	239862	Liberty 1-1/4 in. Hollow Cabinet Hardware Knob	223535	NaN	hardware knob	The clean lines of this knob fit several desig...	Liberty		0.0		Ceramic	-1.0	-1.0

	id	product_uid	relevance	bullet_count	flag_commercial	flag_residential	flag_indoor	flag_outdoor	flag_estar	match_material	len_search_term	len_product_title	len_product_description	len_brand	len_bullet	0th_word_in_pd	0th_word_in_bl	1th_word_in_bl	0th_word_in_pt	num_st_in_pt	num_st_in_pd	num_st_in_bl	ratio_st_in_pt	ratio_st_in_pd	ratio_st_in_bl	brand_encoded	flag_attr_has_material	flag_attr_has_color	flag_has_attr	cv_cos_sim_st_pt	cv_cos_sim_st_pd	cv_cos_sim_st_bl	jaccard_st_pt	jaccard_st_pd	jaccard_st_bl	st_bow_tsvd0	st_bow_tsvd1	st_bow_tsvd2	st_bow_tsvd3	st_bow_tsvd4	st_bow_tsvd5	st_bow_tsvd6	st_bow_tsvd7	st_bow_tsvd8	st_bow_tsvd9	pt_bow_tsvd0	pt_bow_tsvd1	pt_bow_tsvd2	pt_bow_tsvd3	pt_bow_tsvd4	pt_bow_tsvd5	pt_bow_tsvd6	pt_bow_tsvd7	pt_bow_tsvd8	pt_bow_tsvd9	pd_bow_tsvd0	pd_bow_tsvd1	pd_bow_tsvd2	pd_bow_tsvd3	pd_bow_tsvd4	pd_bow_tsvd5	pd_bow_tsvd6	pd_bow_tsvd7	pd_bow_tsvd8	pd_bow_tsvd9	bl_bow_tsvd0	bl_bow_tsvd1	bl_bow_tsvd2	bl_bow_tsvd3	bl_bow_tsvd4	bl_bow_tsvd5	bl_bow_tsvd6	bl_bow_tsvd7	bl_bow_tsvd8	bl_bow_tsvd9	st_tfidf_tsvd_0	st_tfidf_tsvd_1	st_tfidf_tsvd_2	st_tfidf_tsvd_3	st_tfidf_tsvd_4	st_tfidf_tsvd_5	st_tfidf_tsvd_6	st_tfidf_tsvd_7	st_tfidf_tsvd_8	st_tfidf_tsvd_9	pt_tfidf_tsvd_0	pt_tfidf_tsvd_1	pt_tfidf_tsvd_2	pt_tfidf_tsvd_3	pt_tfidf_tsvd_4	pt_tfidf_tsvd_5	pt_tfidf_tsvd_6	pt_tfidf_tsvd_7	pt_tfidf_tsvd_8	pt_tfidf_tsvd_9	pd_tfidf_tsvd_0	pd_tfidf_tsvd_1	pd_tfidf_tsvd_2	pd_tfidf_tsvd_3	pd_tfidf_tsvd_4	pd_tfidf_tsvd_5	pd_tfidf_tsvd_6	pd_tfidf_tsvd_7	pd_tfidf_tsvd_8	pd_tfidf_tsvd_9	bl_tfidf_tsvd_0	bl_tfidf_tsvd_1	bl_tfidf_tsvd_2	bl_tfidf_tsvd_3	bl_tfidf_tsvd_4	bl_tfidf_tsvd_5	bl_tfidf_tsvd_6	bl_tfidf_tsvd_7	bl_tfidf_tsvd_8	bl_tfidf_tsvd_9	old_word_in_title	old_word_in_description	old_word_in_brand	old_ratio_title	old_ratio_description	old_ratio_brand	old_lev_dist_to_product_title_min	old_lev_dist_to_product_title_max	old_lev_dist_to_product_title_sum	old_lev_dist_to_product_description_min	old_lev_dist_to_product_description_max	old_lev_dist_to_product_description_sum	edit_dist_st_pt_min	edit_dist_st_pt_avg	edit_dist_st_pd_min	edit_dist_st_pd_avg
0	2	100001	3.0	7	-1	-1	-1	-1	-1	0	2	6	135	3	58	1	1	0	1	1	1	1	0.5	0.5	0.5	1000	1	0	1	0.316228	0.197814	0.096225	0.142857	0.010101	0.020833	0.003626	0.004019	0.000797	0.006102	0.008658	-0.000548	0.006355	-0.002796	0.007951	0.005769	0.045605	0.024936	0.026892	0.028064	0.049714	0.092839	-0.078192	-0.067007	0.050241	0.049224	3.313323	-1.080023	0.246664	0.035078	1.213365	0.344366	-0.339457	-0.778180	1.311161	-0.855836	2.016449	-0.805119	1.716349	-0.538509	1.614636	0.482738	0.415191	-0.226891	-0.103242	-0.652989	0.003235	0.001598	0.004377	0.004809	0.003127	0.001640	-0.000262	0.010862	0.003171	-0.000317	0.040224	-0.002031	-0.002297	-0.031202	0.025376	0.031313	0.013203	0.000737	-0.007120	0.007481	0.224923	-0.004951	0.010509	-0.007065	-0.100198	-0.059775	-0.120487	-0.041719	0.115705	-0.026294	0.190741	-0.004078	0.002348	-0.025698	-0.087801	0.007791	-0.176249	-0.059840	0.028053	-0.022842	1	1	0	0.5	0.5	0.00	3	7	63	3	16	321	0	3.0	0	2.0
1	3	100001	2.5	7	-1	-1	-1	-1	-1	1	2	6	135	3	58	1	1	0	1	0	0	0	0.0	0.0	0.0	1000	1	0	1	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.001644	0.001633	0.001574	0.003842	0.002912	0.001569	0.005649	-0.003359	0.003245	0.004215	0.045605	0.024936	0.026892	0.028064	0.049714	0.092839	-0.078192	-0.067007	0.050241	0.049224	3.313323	-1.080023	0.246664	0.035078	1.213365	0.344366	-0.339456	-0.778180	1.311161	-0.855836	2.016449	-0.805119	1.716349	-0.538509	1.614636	0.482739	0.415191	-0.226891	-0.103242	-0.652989	0.001812	0.002332	0.003688	0.001637	0.004279	0.001851	0.000037	0.011753	0.004452	-0.000231	0.040224	-0.002031	-0.002297	-0.031202	0.025376	0.031313	0.013203	0.000737	-0.007120	0.007481	0.224923	-0.004951	0.010509	-0.007065	-0.100198	-0.059775	-0.120487	-0.041719	0.115705	-0.026294	0.190741	-0.004078	0.002348	-0.025698	-0.087801	0.007791	-0.176249	-0.059840	0.028053	-0.022842	1	1	0	0.5	0.5	0.00	3	7	67	3	18	337	2	4.0	1	2.5
2	9	100002	3.0	10	-1	-1	-1	-1	-1	0	2	12	169	4	109	1	1	1	1	0	1	1	0.0	0.5	0.5	1010	0	1	1	0.000000	0.243332	0.212000	0.000000	0.008621	0.012048	0.006960	0.000900	0.005602	0.005818	-0.000712	0.003775	-0.004111	0.010913	0.014781	-0.027090	0.052576	0.029210	0.058717	-0.054237	0.038212	0.104605	0.018579	-0.144403	-0.038084	0.025078	3.088265	-0.928897	-0.779630	-0.235617	1.022412	-6.070258	-0.177288	1.629826	1.669294	1.009680	2.418942	-0.412886	1.744545	-1.499353	-3.124326	-3.041471	2.646309	1.225488	0.994887	2.065224	0.001645	0.004407	0.006753	-0.001348	0.007957	0.000271	-0.005846	0.015219	0.042325	0.009750	0.046026	-0.003515	0.005800	-0.003947	-0.017649	0.021120	0.043252	0.012502	0.010272	-0.001656	0.280601	0.264614	-0.309222	0.395171	0.065995	0.230039	0.065034	-0.131033	0.016887	-0.094675	0.256366	-0.025741	0.073359	0.695745	0.252360	-0.039998	-0.089243	-0.013464	0.089968	0.028354	1	1	1	0.5	0.5	0.25	2	7	120	2	18	337	2	2.5	0	1.0