In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin
from xgboost import XGBClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import warnings
import pickle
import re
% matplotlib inline
In [70]:
class HtmlTransformer(TransformerMixin):
    """Strip HTML tags from each document in a text Series, replacing them with spaces."""

    # Non-greedy match of anything between '<' and '>'.
    _TAG_RE = re.compile(r'<.*?>')

    def transform(self, X, **transform_params):
        # Substitute every tag with a single space so adjacent words stay separated.
        cleaned = X.apply(lambda doc: self._TAG_RE.sub(' ', doc))
        return pd.Series(cleaned)

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveCamelCaseTransformer(TransformerMixin):
    """Insert a space at camelCase boundaries (e.g. 'redShirt' -> 'red Shirt')."""

    def transform(self, X, **transform_params):
        def split_camel(text):
            # Break after a lowercase letter that precedes an uppercase one,
            # and between an acronym and a following capitalized word.
            return re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', text)

        return pd.Series(X.apply(split_camel))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveSymsTransformer(TransformerMixin):
    """Replace every character other than letters, digits, whitespace, or '.' with a space.

    Fix: the original character class was ``[^A-za-z0-9\\s\\.]`` — the range
    ``A-z`` also spans the ASCII punctuation between 'Z' and 'a'
    (``[ \\ ] ^ _ `` ` ``), so those symbols were silently kept.
    ``A-Za-z`` matches letters only, which is the evident intent.
    """

    def transform(self, X, **transform_params):
        # Raw string avoids invalid-escape warnings; '.' needs no escape in a class.
        return pd.Series(X.apply(lambda x: re.sub(r'[^A-Za-z0-9\s.]', ' ', x)))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class TokenizeTransformer(TransformerMixin):
    """Split each document into a list of tokens using NLTK's word_tokenize."""

    def transform(self, X, **transform_params):
        # word_tokenize is already a one-argument callable; no lambda needed.
        tokenized = X.apply(word_tokenize)
        return pd.Series(tokenized)

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class LemmatizeTransformer(TransformerMixin):
    """Lowercase and WordNet-lemmatize every token in each tokenized document."""

    def transform(self, X, **transform_params):
        # One lemmatizer instance shared across the whole Series.
        lemmatizer = WordNetLemmatizer()

        def lemmatize_all(tokens):
            return [lemmatizer.lemmatize(tok.lower()) for tok in tokens]

        return pd.Series(X.apply(lemmatize_all))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
class RemoveStopWordsTransformer(TransformerMixin):
    """Drop English stopwords and join the surviving tokens back into one string."""

    def transform(self, X, **transform_params):
        # Set membership keeps the per-token lookup O(1).
        stop_words = set(stopwords.words('english'))

        def drop_stops(tokens):
            kept = [tok for tok in tokens if tok not in stop_words]
            return ' '.join(kept)

        return pd.Series(X.apply(drop_stops))

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self
In [71]:
# Load the two pre-trained XGBoost classifiers: one fit on product long
# descriptions (PLD), one on product names (PN).
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load these .plk files from a trusted source.
with open('models/MODEL_XG_Boost_Product_Long_Description.plk','rb') as f:
    XGB_MODEL_PLD = pickle.load(f)
with open('models/MODEL_XG_Boost_Product_Name.plk','rb') as f:
    XGB_MODEL_PN = pickle.load(f)
In [109]:
# Read the test set and keep only the identifier plus the two text columns
# the models consume.
df = pd.read_csv('data/test.tsv', sep='\t')
wanted_cols = {'item_id', 'Product Long Description', 'Product Name'}
unwanted = [col for col in df.columns if col not in wanted_cols]
df = df.drop(unwanted, axis=1)
df.head(7)
Out[109]:
In [110]:
# Sanity check: (rows, columns) of the trimmed test set.
df.shape
Out[110]:
In [111]:
# The text columns contain a few NaNs, but the models expect strings, so
# fill every missing value with a fixed placeholder document.
# Fix: the original hardcoded the six NaN row indices
# ([2335, 4098, 6132, 8174, 9421, 10209]), which silently writes over the
# wrong rows if the data file ever changes; fillna targets exactly the
# missing entries instead, whatever their positions.
text = 'This is a super fake text block that I am writing to replace the NaNs. \
I should have account for these 6 NaNs in the test set. Next time, I will account for this!'
labels = ['Product Long Description','Product Name']
for label in labels:
    df[label] = df[label].fillna(text)
In [74]:
# Predict a tag from each text field independently; the two prediction
# arrays are reconciled into a single tag per item further down.
# NOTE(review): the raw text Series are passed straight to .predict(), so
# the pickled models presumably embed the vectorization/preprocessing
# pipeline — confirm against the training notebook.
X_test1 = df['Product Name']
X_test2 = df['Product Long Description']
y_pred1 = XGB_MODEL_PN.predict(X_test1)
y_pred2 = XGB_MODEL_PLD.predict(X_test2)
In [128]:
# Spot-check one prediction from the Product Name model.
y_pred1[5]
Out[128]:
In [127]:
# Spot-check the same row's prediction from the Product Long Description model.
y_pred2[5]
Out[127]:
In [113]:
# Reconcile the two per-field predictions into one tag per item: when both
# models agree, keep the shared label; otherwise build a combined
# '[tag1, tag2]' label from the numeric ids extracted from each prediction.
# Fixes: use raw strings for the regex ('\d' in a plain string is an invalid
# escape in modern Python), compile it once outside the loop, and drop the
# unused item_id loop variable.
tag_re = re.compile(r'\d+\d')  # a run of two or more consecutive digits
pred_bin = []
for indx in range(len(df)):
    if y_pred1[indx] == y_pred2[indx]:
        pred = y_pred1[indx]
    else:
        # NOTE(review): assumes every predicted label contains at least two
        # consecutive digits — tag1[0]/tag2[0] raise IndexError otherwise.
        tag1 = tag_re.findall(y_pred1[indx])
        tag2 = tag_re.findall(y_pred2[indx])
        pred = '['+tag1[0]+', '+tag2[0]+']'
    pred_bin.append(pred)
In [117]:
# Assemble the submission frame (item_id -> reconciled tag) and write it
# out as tab-separated values.
pred_df = pd.DataFrame({'item_id': list(df.item_id), 'tag': pred_bin})
pred_df.to_csv('tags.tsv', sep='\t', index=False)
pred_df.head()
Out[117]:
In [120]:
# Distribution of final tags — shows how often the two models agreed
# versus produced a combined '[tag1, tag2]' label.
pred_df.tag.value_counts()
Out[120]: