RUN MODEL


In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin
from xgboost import XGBClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import warnings
import pickle
import re
% matplotlib inline

TRANSFORMERS


In [70]:
class HtmlTransformer(TransformerMixin):
    """Pipeline step that blanks out HTML-style tags in every document.

    Each non-greedy ``<...>`` span is replaced by a single space; the rest of
    the text is left as it is.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` with the tag-free version of each string in ``X``.

        ``X`` must provide ``.apply`` (a pandas Series of strings in this notebook).
        """
        tag_pattern = re.compile(r'<.*?>')

        def strip_tags(document):
            return tag_pattern.sub(' ', document)

        return pd.Series(X.apply(strip_tags))
    
class RemoveCamelCaseTransformer(TransformerMixin):
    """Pipeline step that breaks camelCase words apart with spaces.

    A space is inserted after a lowercase letter that is directly followed by
    an uppercase letter, and after an uppercase letter that is directly
    followed by an uppercase+lowercase pair (e.g. ``"XMLParser"`` becomes
    ``"XML Parser"``).
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` with camelCase boundaries split in each string of ``X``."""
        boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')

        def split_camel_case(document):
            # Re-emit the matched letter followed by a space.
            return boundary.sub(r'\1 ', document)

        return pd.Series(X.apply(split_camel_case))
    
class RemoveSymsTransformer(TransformerMixin):
    """Pipeline step that replaces symbol characters with spaces.

    Every character that is not an ASCII letter, a digit, whitespace or a
    period is replaced by a single space.
    """

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` with symbols blanked out in each string of ``X``.

        ``X`` must provide ``.apply`` (a pandas Series of strings in this notebook).
        """
        # The character class previously read ``A-za-z``; the ``A-z`` range
        # also covers the ASCII symbols [ \ ] ^ _ ` and therefore let them
        # through. ``A-Za-z`` restricts the letters to the intended set.
        # NOTE(review): a model fitted with the old pattern saw those six
        # symbols (mainly ``_``) preserved in its input; confirm whether the
        # pickled models should be refitted with this pattern.
        allowed = re.compile(r'[^A-Za-z0-9\s\.]')
        return pd.Series(X.apply(lambda x: allowed.sub(' ', x)))

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
    
class TokenizeTransformer(TransformerMixin):
    """Pipeline step that turns each document string into a list of tokens."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` whose entries are ``word_tokenize`` results for ``X``."""
        token_lists = X.apply(word_tokenize)
        return pd.Series(token_lists)

class LemmatizeTransformer(TransformerMixin):
    """Pipeline step that lowercases and lemmatizes every token of each document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` of token lists, each token lowercased then lemmatized.

        Each entry of ``X`` is iterated as a sequence of token strings.
        """
        lemmatizer = WordNetLemmatizer()

        def lemmatize_tokens(tokens):
            # Lowercasing happens before the lemmatizer sees the token.
            return [lemmatizer.lemmatize(tok.lower()) for tok in tokens]

        return pd.Series(X.apply(lemmatize_tokens))

class RemoveStopWordsTransformer(TransformerMixin):
    """Pipeline step that drops English stop words and re-joins the tokens."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a ``pd.Series`` of space-joined strings without stop-word tokens.

        Each entry of ``X`` is iterated as a sequence of token strings.
        """
        stop_words = set(stopwords.words('english'))

        def drop_stop_words(tokens):
            kept = (tok for tok in tokens if tok not in stop_words)
            return ' '.join(kept)

        return pd.Series(X.apply(drop_stop_words))

LOAD MODELS


In [71]:
def _load_pickled_model(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

# Presumably these are fitted pipelines built from the transformer classes
# defined in this notebook, in which case those classes must already be
# defined when the files are unpickled; confirm against the training code.
XGB_MODEL_PLD = _load_pickled_model('models/MODEL_XG_Boost_Product_Long_Description.plk')
XGB_MODEL_PN = _load_pickled_model('models/MODEL_XG_Boost_Product_Name.plk')

LOAD TEST SET


In [109]:
# Read the tab-separated test set and keep only the columns used below.
wanted_columns = {'item_id', 'Product Long Description', 'Product Name'}
df = pd.read_csv('data/test.tsv', sep='\t')
unused_columns = [col for col in df.columns if col not in wanted_columns]
# Rebind instead of dropping in place so re-running the cell is harmless.
df = df.drop(unused_columns, axis=1)
df.head(7)


Out[109]:
item_id Product Long Description Product Name
0 10593 Universal Flat TV Mount, , For Use With 32 to ... PEERLESS SUF651 TV Mount, 37-75 in Ultra-Thin,...
1 10594 <ul><li>Lots of plastic cable ties with self-l... 100 Pcs Toothed Flexible Marker Cable Zip Wire...
2 10595 Chris and Erin leave Australia to buy and reno... House Hunters Renovation: Down Under to Over T...
3 10596 Tripp Lite Protect It! Three-Outlet Travel-Siz... TRAVELER3USB Surge Suppressor Notebook 3 Outle...
4 10597 RapidRun&reg; is designed to be the standard f... 75FT RAPIDRUN RUNNER MULTI-FORMAT CMG
5 10598 <ul><li>Louvered Panel, H 19 In, L 18 In, Gray... Akro-Mils 30618230SC Louvered Panel Wall-Mount...
6 10599 Power Supply, Type Power Supply, Material Stee... SDC 634RF Power Supply,16 in. L,14 in. W G1877325

In [110]:
# Check the (rows, columns) size of the trimmed test frame.
df.shape


Out[110]:
(10593, 3)

REPLACE NaNs


In [111]:
# Placeholder document fed to the models wherever product text is missing.
text = 'This is a super fake text block that I am writing to replace the NaNs. \
I should have account for these 6 NaNs in the test set. Next time, I will account for this!'
labels = ['Product Long Description','Product Name']

# Find the rows with missing text from the data itself rather than from a
# hard-coded list of row indices, so the cell keeps working if the test file
# changes. As before, BOTH text columns of an affected row are overwritten
# with the placeholder, even if only one of them was missing.
missing_mask = df[labels].isnull().any(axis=1)
nan_indx = list(df.index[missing_mask])  # kept for inspection
df.loc[missing_mask, labels] = text

# No missing text may reach the models.
assert not df[labels].isnull().any().any()

GET PREDICTIONS


In [74]:
# Each model is scored on the single text column it is named after:
# model 1 on the product name, model 2 on the long description.
X_test1, X_test2 = df['Product Name'], df['Product Long Description']
y_pred1, y_pred2 = XGB_MODEL_PN.predict(X_test1), XGB_MODEL_PLD.predict(X_test2)

In [128]:
# Spot-check one prediction of the Product Name model (position 5).
y_pred1[5]


Out[128]:
'[581514]'

In [127]:
# Spot-check the Product Long Description model's prediction for the same position.
y_pred2[5]


Out[127]:
'[4537]'

ENSEMBLE MODEL PREDICTIONS


In [113]:
def _first_tag(label):
    """Return the first run of digits found in a predicted label such as '[4537]'.

    Raises IndexError when the label contains no digits at all.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # r'\d+' also accepts single-digit tags, which the previous pattern
    # (at least two digits) could not match.
    return re.findall(r'\d+', label)[0]

# Both prediction arrays must line up row-for-row with the test frame.
assert len(y_pred1) == len(y_pred2) == len(df)

# Combine the two models: keep the shared label when they agree, otherwise
# emit both tags, Product Name model first.
# NOTE(review): '[a, b]' and '[b, a]' are produced as distinct strings;
# confirm whether the tag order matters to whoever consumes the output.
pred_bin = []
for name_pred, desc_pred in zip(y_pred1, y_pred2):
    if name_pred == desc_pred:
        pred = name_pred
    else:
        pred = '[' + _first_tag(name_pred) + ', ' + _first_tag(desc_pred) + ']'
    pred_bin.append(pred)

SAVE TO TSV


In [117]:
# Assemble the output table (one row per item) and write it as a
# tab-separated file without the index column.
pred_df = pd.DataFrame({
    'item_id': list(df.item_id),
    'tag': pred_bin,
})
pred_df.to_csv('tags.tsv', sep='\t', index=False)
pred_df.head()


Out[117]:
item_id tag
0 10593 [581514]
1 10594 [4537]
2 10595 [4483]
3 10596 [4483]
4 10597 [4483, 4537]

In [120]:
# Count how often each final tag string occurs in the output table.
pred_df.tag.value_counts()


Out[120]:
[4537]                2427
[4483]                1844
[581514]               673
[529295]               417
[95987]                349
[1229817]              348
[106546]               314
[95987, 106546]        268
[4483, 4537]           247
[1229821]              239
[1180168]              230
[127175]               229
[4537, 4483]           183
[4538]                 181
[4536]                 146
[447913]               138
[95987, 522484]        135
[522484]               127
[650659]                86
[1229817, 1229821]      74
[1070524]               69
[4537, 581514]          58
[106546, 4483]          54
[581514, 4537]          51
[1229821, 1229817]      47
[95987, 4483]           41
[1229821, 447913]       38
[447913, 4483]          38
[95987, 447913]         37
[447913, 1229821]       33
                      ... 
[4457, 4483]             1
[581514, 1229821]        1
[95987, 3304195]         1
[529295, 4538]           1
[4538, 106546]           1
[95987, 1229825]         1
[5065, 95987]            1
[447913, 1180168]        1
[1229820, 1225174]       1
[1229818, 447913]        1
[650659, 1229817]        1
[1229821, 1085065]       1
[1225174, 4457]          1
[1225174, 529295]        1
[650659, 4538]           1
[4538, 529295]           1
[1229817, 4538]          1
[1070524, 4536]          1
[1225174, 650659]        1
[5065, 127175]           1
[5065, 1180168]          1
[5065, 1229821]          1
[648819, 4483]           1
[1071165, 581514]        1
[522484, 3304195]        1
[1229817, 1229818]       1
[650659, 1085065]        1
[1180168, 4537]          1
[1085065, 4538]          1
[581514, 95987]          1
Name: tag, dtype: int64