In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin
from xgboost import XGBClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import warnings
import pickle
import re
% matplotlib inline
In [70]:
class HtmlTransformer(TransformerMixin):
    """Strip HTML tags from each document in a text Series, replacing them with spaces."""

    # Non-greedy match of anything between '<' and '>'.
    _TAG_RE = re.compile(r'<.*?>')

    def transform(self, X, **transform_params):
        # Substitute every tag with a single space so adjacent words stay separated.
        cleaned = X.apply(lambda doc: self._TAG_RE.sub(' ', doc))
        return pd.Series(cleaned)

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveCamelCaseTransformer(TransformerMixin):
    """Insert a space at camelCase boundaries (e.g. 'redShirt' -> 'red Shirt')."""

    def transform(self, X, **transform_params):
        def split_camel(text):
            # Break after a lowercase letter that precedes an uppercase one,
            # and between an acronym and a following capitalized word.
            return re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', text)

        return pd.Series(X.apply(split_camel))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveSymsTransformer(TransformerMixin):
    """Replace every character other than letters, digits, whitespace, or '.' with a space.

    Fix: the original character class was ``[^A-za-z0-9\\s\\.]`` — the range
    ``A-z`` also spans the ASCII punctuation between 'Z' and 'a'
    (``[ \\ ] ^ _ `` ` ``), so those symbols were silently kept.
    ``A-Za-z`` matches letters only, which is the evident intent.
    """

    def transform(self, X, **transform_params):
        # Raw string avoids invalid-escape warnings; '.' needs no escape in a class.
        return pd.Series(X.apply(lambda x: re.sub(r'[^A-Za-z0-9\s.]', ' ', x)))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class TokenizeTransformer(TransformerMixin):
    """Split each document into a list of tokens using NLTK's word_tokenize."""

    def transform(self, X, **transform_params):
        # word_tokenize is already a one-argument callable; no lambda needed.
        tokenized = X.apply(word_tokenize)
        return pd.Series(tokenized)

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class LemmatizeTransformer(TransformerMixin):
    """Lowercase and WordNet-lemmatize every token in each tokenized document."""

    def transform(self, X, **transform_params):
        # One lemmatizer instance shared across the whole Series.
        lemmatizer = WordNetLemmatizer()

        def lemmatize_all(tokens):
            return [lemmatizer.lemmatize(tok.lower()) for tok in tokens]

        return pd.Series(X.apply(lemmatize_all))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveStopWordsTransformer(TransformerMixin):
    """Drop English stopwords and join the surviving tokens back into one string."""

    def transform(self, X, **transform_params):
        # Set membership keeps the per-token lookup O(1).
        stop_words = set(stopwords.words('english'))

        def drop_stops(tokens):
            kept = [tok for tok in tokens if tok not in stop_words]
            return ' '.join(kept)

        return pd.Series(X.apply(drop_stops))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
In [71]:
# Load the two pre-trained XGBoost classifiers: one fit on product long
# descriptions (PLD), one on product names (PN).
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load these .plk files from a trusted source.
with open('models/MODEL_XG_Boost_Product_Long_Description.plk','rb') as f:
    XGB_MODEL_PLD = pickle.load(f)
with open('models/MODEL_XG_Boost_Product_Name.plk','rb') as f:
    XGB_MODEL_PN = pickle.load(f)
In [109]:
# Read the test set and keep only the identifier plus the two text columns
# the models consume.
df = pd.read_csv('data/test.tsv', sep='\t')
wanted_cols = {'item_id', 'Product Long Description', 'Product Name'}
unwanted = [col for col in df.columns if col not in wanted_cols]
df = df.drop(unwanted, axis=1)
df.head(7)
Out[109]:
In [110]:
# Sanity check: (rows, columns) of the trimmed test set.
df.shape
Out[110]:
In [111]:
# The text columns contain a few NaNs, but the models expect strings, so
# fill every missing value with a fixed placeholder document.
# Fix: the original hardcoded the six NaN row indices
# ([2335, 4098, 6132, 8174, 9421, 10209]), which silently writes over the
# wrong rows if the data file ever changes; fillna targets exactly the
# missing entries instead, whatever their positions.
text = 'This is a super fake text block that I am writing to replace the NaNs. \
I should have account for these 6 NaNs in the test set. Next time, I will account for this!'
labels = ['Product Long Description','Product Name']
for label in labels:
    df[label] = df[label].fillna(text)
In [74]:
# Predict a tag from each text field independently; the two prediction
# arrays are reconciled into a single tag per item further down.
# NOTE(review): the raw text Series are passed straight to .predict(), so
# the pickled models presumably embed the vectorization/preprocessing
# pipeline — confirm against the training notebook.
X_test1 = df['Product Name']
X_test2 = df['Product Long Description']
y_pred1 = XGB_MODEL_PN.predict(X_test1)
y_pred2 = XGB_MODEL_PLD.predict(X_test2)
In [128]:
# Spot-check one prediction from the Product Name model.
y_pred1[5]
Out[128]:
In [127]:
# Spot-check the same row's prediction from the Product Long Description model.
y_pred2[5]
Out[127]:
In [113]:
# Reconcile the two per-field predictions into one tag per item: when both
# models agree, keep the shared label; otherwise build a combined
# '[tag1, tag2]' label from the numeric ids extracted from each prediction.
# Fixes: use raw strings for the regex ('\d' in a plain string is an invalid
# escape in modern Python), compile it once outside the loop, and drop the
# unused item_id loop variable.
tag_re = re.compile(r'\d+\d')  # a run of two or more consecutive digits
pred_bin = []
for indx in range(len(df)):
    if y_pred1[indx] == y_pred2[indx]:
        pred = y_pred1[indx]
    else:
        # NOTE(review): assumes every predicted label contains at least two
        # consecutive digits — tag1[0]/tag2[0] raise IndexError otherwise.
        tag1 = tag_re.findall(y_pred1[indx])
        tag2 = tag_re.findall(y_pred2[indx])
        pred = '['+tag1[0]+', '+tag2[0]+']'
    pred_bin.append(pred)
In [117]:
# Assemble the submission frame (item_id -> reconciled tag) and write it
# out as tab-separated values.
pred_df = pd.DataFrame({'item_id': list(df.item_id), 'tag': pred_bin})
pred_df.to_csv('tags.tsv', sep='\t', index=False)
pred_df.head()
Out[117]:
In [120]:
# Distribution of final tags — shows how often the two models agreed
# versus produced a combined '[tag1, tag2]' label.
pred_df.tag.value_counts()
Out[120]: