In [1]:
import pandas as pd
In [2]:
# set measurement levels
col_types = {'chain': 'object',
             'offer': 'object',
             'market': 'object',
             'category': 'object',
             'company': 'object',
             'brand': 'object',
             'exact_item_bought': 'object'}
In [3]:
# read data created from SAS key
data_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_test_sas.csv', dtype=col_types)
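A quick sanity check, my addition rather than part of the key, to confirm the forced dtypes were applied on read:
# confirm the categorical columns were read as object dtype
print(data_pd.dtypes)
print(score_pd.dtypes)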
In [4]:
# confirm shape
data_pd.shape
Out[4]:
In [5]:
# confirm shape
score_pd.shape
Out[5]:
In [6]:
# data_pd['chain'].unique().shape    # 130 levels
# data_pd['market'].unique().shape   # 34 levels
# data_pd['category'].unique().shape # 13 levels
# data_pd['brand'].unique().shape    # 12 levels
# data_pd['company'].unique().shape  # 11 levels
In [7]:
# show column names
data_pd.columns
Out[7]:
In [8]:
# ensure repeater==t is modeled as 1
data_pd.loc[data_pd['repeater'] == 't', 'repeater'] = 1
data_pd.loc[data_pd['repeater'] == 'f', 'repeater'] = 0
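As an optional check, not in the original notebook, the recode can be verified with value_counts:
# repeater should now contain only 1 and 0
print(data_pd['repeater'].value_counts())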
In [9]:
# fix unknown values in train and test
chain_unknowns =\
    list(set(data_pd['chain'].unique()) - set(score_pd['chain'].unique())) +\
    list(set(score_pd['chain'].unique()) - set(data_pd['chain'].unique()))
print(chain_unknowns)
data_pd.loc[data_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
score_pd.loc[score_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
print(sorted(data_pd['chain'].unique()))
print(len(data_pd['chain'].unique()))
print(sorted(score_pd['chain'].unique()))
print(len(score_pd['chain'].unique()))
In [10]:
# fix unknown values in train and test
market_unknowns =\
    list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
    list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))
print(market_unknowns)
data_pd.loc[data_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
score_pd.loc[score_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))
In [11]:
# fix unknown values in train and test
category_unknowns =\
    list(set(data_pd['category'].unique()) - set(score_pd['category'].unique())) +\
    list(set(score_pd['category'].unique()) - set(data_pd['category'].unique()))
print(category_unknowns)
data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))
In [12]:
# fix unknown values in train and test
brand_unknowns =\
    list(set(data_pd['brand'].unique()) - set(score_pd['brand'].unique())) +\
    list(set(score_pd['brand'].unique()) - set(data_pd['brand'].unique()))
print(brand_unknowns)
data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))
In [13]:
# fix unknown values in train and test
company_unknowns =\
    list(set(data_pd['company'].unique()) - set(score_pd['company'].unique())) +\
    list(set(score_pd['company'].unique()) - set(data_pd['company'].unique()))
print(company_unknowns)
data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))
In [14]:
# fix unknown values in train and test
offer_unknowns =\
    list(set(data_pd['offer'].unique()) - set(score_pd['offer'].unique())) +\
    list(set(score_pd['offer'].unique()) - set(data_pd['offer'].unique()))
print(offer_unknowns)
data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))
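The six cells above repeat the same pattern; as a sketch, it could be factored into a single helper. The function name harmonize_levels is hypothetical, not from the assignment, and the logic simply mirrors the cells above.
def harmonize_levels(train_df, test_df, col, fill='unknown'):
    # levels that appear in only one of the two frames
    mismatched = set(train_df[col].unique()) ^ set(test_df[col].unique())
    train_df.loc[train_df[col].isin(mismatched), col] = fill
    test_df.loc[test_df[col].isin(mismatched), col] = fill
    return sorted(mismatched)

# e.g. harmonize_levels(data_pd, score_pd, 'brand') would reproduce the brand cell above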
In [15]:
# start and import h2o
# set seed
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
SEED = 12345
In [16]:
# enforce same measurement levels in h2o and pandas
col_types = {'chain': 'enum',
             'offer': 'enum',
             'market': 'enum',
             'category': 'enum',
             'company': 'enum',
             'brand': 'enum',
             'exact_item_bought': 'enum'}
data_h2o = h2o.H2OFrame(data_pd, column_types=col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=col_types)
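A small check, not in the original, to confirm h2o assigned the requested enum types:
# show the measurement levels h2o actually assigned to the forced columns
print({k: v for k, v in data_h2o.types.items() if k in col_types})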
In [17]:
# expand date into new features
data_h2o = data_h2o.concat(data_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
data_h2o = data_h2o.concat(data_h2o['offerdate'].day().rename({'offerdate': 'day'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].day().rename({'offerdate': 'day'}))
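The same four derivations are applied to both frames; as a sketch (the helper name expand_offerdate is mine, not from the key), the duplication could be removed with a small function:
def expand_offerdate(fr):
    # derive month, week, day-of-week, and day features from offerdate
    fr = fr.concat(fr['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
    fr = fr.concat(fr['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
    fr = fr.concat(fr['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
    fr = fr.concat(fr['offerdate'].day().rename({'offerdate': 'day'}))
    return fr

# e.g. data_h2o = expand_offerdate(data_h2o); score_h2o = expand_offerdate(score_h2o)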
In [18]:
# look at training data
data_h2o.describe()
In [19]:
# look at test data
score_h2o.describe()
In [20]:
# quantity has only one level (constant), so drop it
# drop other unusable variables
# set modeling roles
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'
X = [name for name in data_h2o.columns if name not in [y] + drops]
print(y)
print(X)
In [21]:
# create modeling partitions
train, valid, test = data_h2o.split_frame([0.4, 0.3], seed=SEED)
In [22]:
# check shape
train.shape
Out[22]:
In [23]:
# check shape
valid.shape
Out[23]:
In [24]:
# check shape
test.shape
Out[24]:
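A quick sanity check, not part of the key: the three partitions should account for every row of data_h2o, in roughly a 40/30/30 split.
# partition row counts should sum to the full training frame
print(train.nrow + valid.nrow + test.nrow == data_h2o.nrow)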
In [25]:
# elastic net regularized regression
# - L1 for variable selection
# - L2 for handling multicollinearity
# - IRLS for handling outliers
# - with lambda parameter tuning for variable selection
# initialize
rptr_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='rptr_glm1',
                                         solver='IRLSM',
                                         nfolds=3,
                                         standardize=True,
                                         seed=SEED,
                                         lambda_search=True)
# train
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)
In [26]:
# check for stability across folds -- looks good
rptr_glm.cross_validation_metrics_summary().as_data_frame()
Out[26]:
In [27]:
# train AUC
rptr_glm.auc(valid=False)
Out[27]:
In [28]:
# valid AUC
rptr_glm.auc(valid=True)
Out[28]:
In [29]:
# test AUC
rptr_glm.model_performance(test).auc()
Out[29]:
In [30]:
# many validation metrics
rptr_glm.model_performance(valid)
Out[30]:
In [31]:
# print coefficients
for key_ in rptr_glm.coef():
    print(key_, rptr_glm.coef()[key_])
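Since lambda_search with an L1 penalty can zero out coefficients, a quick count (my addition) shows how many inputs survived selection:
# count nonzero coefficients retained by the elastic net
coefs = rptr_glm.coef()
print(sum(1 for k in coefs if coefs[k] != 0), 'of', len(coefs), 'coefficients are nonzero')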
In [32]:
# find id == 13584134
score_h2o['id'].asfactor().head()
Out[32]:
In [33]:
# get probability for id == 13584134
rptr_glm.predict(score_h2o)
Out[33]:
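To isolate the predicted probability for id == 13584134, the predictions can be bound back to the ids and filtered; a sketch, not in the original key:
# bind predicted probabilities to ids and filter for the record of interest
preds = score_h2o['id'].cbind(rptr_glm.predict(score_h2o))
print(preds[preds['id'] == 13584134, :])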
In [34]:
# import target encoder
from h2o.targetencoder import TargetEncoder
In [35]:
# train target encoder
e_columns = ['market', 'chain']
te_ = TargetEncoder(x=e_columns, y=y)
train[y] = train[y].asfactor()
_ = te_.fit(train)
In [36]:
# leave-one-out target encoding on train, valid, test
e_train = te_.transform(frame=train, holdout_type='loo', seed=12345)
valid[y] = valid[y].asfactor()
e_valid = te_.transform(frame=valid, holdout_type='loo', seed=12345)
test[y] = test[y].asfactor()
e_test = te_.transform(frame=test, holdout_type='loo', seed=12345)
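A small check, my addition: the encoder should have appended market_te and chain_te columns to each transformed frame.
# confirm the target-encoded columns were created
print([c for c in e_train.columns if c.endswith('_te')])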
In [37]:
# check train
e_train.head(rows=2)
Out[37]:
In [38]:
# check valid
e_valid.head(rows=2)
Out[38]:
In [39]:
# check test
e_test.head(rows=2)
Out[39]:
In [40]:
# check ids in the encoded test partition
e_test['id'].asfactor().head(rows=2)
Out[40]:
In [41]:
# add the target-encoded columns to the model inputs
X = X + ['market_te', 'chain_te']
print(X)
In [42]:
# import GBM and grid search
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
In [43]:
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error
# define random grid search parameters
hyper_parameters = {'ntrees':list(range(50, 500, 50)),
                    'max_depth':list(range(2, 20, 2)),
                    'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}
# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1200,
                   'seed': 12345}
# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)
# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid,
              seed=12345)
# view detailed results at http://localhost:54321/flow/index.html
In [44]:
# show grid search results
gsearch.show()
# select best model
gbm_model = gsearch.get_grid()[0]
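If an explicit ranking is preferred over the default grid ordering, the grid can be sorted by AUC before taking the leader; a sketch using the standard H2OGridSearch.get_grid arguments:
# optionally rank grid models by AUC (highest first) and take the leader
sorted_grid = gsearch.get_grid(sort_by='auc', decreasing=True)
gbm_model = sorted_grid.models[0]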
In [45]:
# train AUC
gbm_model.auc(valid=False)
Out[45]:
In [46]:
# valid AUC
gbm_model.auc(valid=True)
Out[46]:
In [47]:
# test AUC
gbm_model.model_performance(e_test).auc()
Out[47]:
In [48]:
# examine variable importance
%matplotlib inline
gbm_model.varimp_plot()
In [49]:
# make list of most important variables
important_vars = [row[0] for row in gbm_model.varimp()]
important_vars[:10]
Out[49]:
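For a tabular view, an optional aside not in the original, variable importance is also available as a pandas frame:
# variable importance as a pandas DataFrame for easier inspection
print(gbm_model.varimp(use_pandas=True).head(10))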
In [50]:
# generate partial dependence plot for most important variable
_ = gbm_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)
In [51]:
# update X to contain only the ten most important variables
X = important_vars[:10]
print(X)
In [52]:
# import mlp
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# NN with random hyperparameter search
# train many different NN models with random hyperparameters
# and select best model based on validation error
# define random grid search parameters
hyper_parameters = {'hidden':[[170, 320], [80, 190], [320, 160, 80], [100], [50, 50, 50, 50]],
                    'l1':[s/1e4 for s in range(0, 1000, 100)],
                    'l2':[s/1e5 for s in range(0, 1000, 100)],
                    'input_dropout_ratio':[s/1e2 for s in range(0, 20, 2)]}
# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1800,
                   'seed': 12345}
# initialize grid search
gsearch = H2OGridSearch(H2ODeepLearningEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)
# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid,
              seed=12345)
# view detailed results at http://localhost:54321/flow/index.html
In [53]:
# show grid search results
gsearch.show()
# select best model
mlp_model = gsearch.get_grid()[0]
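To see which settings the winning network used, the tuned hyperparameters can be read back from the model; a sketch assuming the actual_params attribute on trained H2O estimators:
# inspect the selected hyperparameters of the best network
for p in ['hidden', 'l1', 'l2', 'input_dropout_ratio']:
    print(p, mlp_model.actual_params[p])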
In [54]:
# print train, valid, test AUC
print(mlp_model.auc(valid=False))
print(mlp_model.auc(valid=True))
print(mlp_model.model_performance(e_test).auc())
In [55]:
# generate partial dependence plot for most important variable, MLP model
_ = mlp_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)