In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# declare the nominal (categorical) columns so pandas reads the numeric-looking
# ID codes as strings instead of integers
nominal_columns = ('chain', 'offer', 'market', 'category',
                   'company', 'brand', 'exact_item_bought')
col_types = {name: 'object' for name in nominal_columns}

In [3]:
# read train/test data created from the SAS key
# NOTE(review): the absolute path to a user home directory was repeated in both
# calls; factor it into a single constant so the notebook is portable and the
# location only needs changing in one place.
KEY_DIR = Path('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key')
data_pd = pd.read_csv(KEY_DIR / 'assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv(KEY_DIR / 'assignment_1_key_test_sas.csv', dtype=col_types)

In [4]:
# confirm shape of the training data: (rows, columns)
data_pd.shape


Out[4]:
(160057, 14)

In [5]:
# confirm shape of the test data — one fewer column than train
# (it has no 'repeater' target)
score_pd.shape


Out[5]:
(151484, 13)

In [6]:
# cardinality of the candidate categorical features, recorded from an earlier
# exploratory run (kept as reference for the level-collapsing cells below):
# data['chain'].unique().shape    # 130 levels
# data['market'].unique().shape   # 34 levels
# data['category'].unique().shape # 13 levels
# data['brand'].unique().shape    # 12 levels
# data['company'].unique().shape  # 11 levels

In [7]:
# show the training frame's column names
data_pd.columns


Out[7]:
Index(['id', 'chain', 'offer', 'market', 'repeater', 'offerdate', 'category',
       'quantity', 'company', 'offervalue', 'brand', 'avg_category_quantity',
       'avg_category_amount', 'exact_item_bought'],
      dtype='object')

In [8]:
# recode the target: 't' -> 1 and 'f' -> 0, so a repeat purchase is
# modeled as the positive class
for raw_level, coded_level in (('t', 1), ('f', 0)):
    data_pd.loc[data_pd['repeater'] == raw_level, 'repeater'] = coded_level

In [9]:
# collapse chain levels that appear in only one of train/test into a shared
# 'unknown' level, so the model never sees a level it was not trained on
train_chains = set(data_pd['chain'].unique())
test_chains = set(score_pd['chain'].unique())
chain_unknowns = list(train_chains - test_chains) + list(test_chains - train_chains)

print(chain_unknowns)

data_pd.loc[data_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
score_pd.loc[score_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'

# sanity check: both frames now share the same chain levels
print(sorted(data_pd['chain'].unique()))
print(len(data_pd['chain'].unique()))
print(sorted(score_pd['chain'].unique()))
print(len(score_pd['chain'].unique()))


['230', '369', '9', '284', '421', '370', '387']
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128

In [10]:
# collapse market levels that appear in only one of train/test into 'unknown'
market_unknowns =\
list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))
    
print(market_unknowns)     

# BUG FIX: unlike the sibling cells (chain/category/brand/company/offer),
# this cell never mapped the mismatched levels to 'unknown'. It happens to be
# a no-op today because market_unknowns is empty, but add the mapping so the
# notebook stays correct if the underlying data changes.
data_pd.loc[data_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
score_pd.loc[score_pd['market'].isin(market_unknowns), 'market'] = 'unknown'

# sanity check: both frames share the same market levels
print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))


[]
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34

In [11]:
# collapse category levels that appear in only one of train/test into a
# shared 'unknown' level
train_categories = set(data_pd['category'].unique())
test_categories = set(score_pd['category'].unique())
category_unknowns = (list(train_categories - test_categories) +
                     list(test_categories - train_categories))

print(category_unknowns)

data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'

# sanity check: both frames now share the same category levels
print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))


['3509', '1703', '3203', '9909', '4401', '5122', '7205', '706', '5824', '4517', '799', '9115']
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9

In [12]:
# collapse brand levels that appear in only one of train/test into a shared
# 'unknown' level
train_brands = set(data_pd['brand'].unique())
test_brands = set(score_pd['brand'].unique())
brand_unknowns = list(train_brands - test_brands) + list(test_brands - train_brands)

print(brand_unknowns)

data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'

# sanity check: both frames now share the same brand levels
print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))


['6732', '13474', '13791', '875', '28840', '1322', '26456', '17286', '4294', '17311', '26189', '93904']
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8

In [13]:
# collapse company levels that appear in only one of train/test into a shared
# 'unknown' level
train_companies = set(data_pd['company'].unique())
test_companies = set(score_pd['company'].unique())
company_unknowns = (list(train_companies - test_companies) +
                    list(test_companies - train_companies))

print(company_unknowns)

data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'

# sanity check: both frames now share the same company levels
print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))


['107127979', '1089520383', '105100050', '103320030', '106414464', '108500080', '107106878', '105190050', '105450050', '1076211171', '103700030', '104127141']
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7

In [14]:
# collapse offer levels that appear in only one of train/test into a shared
# 'unknown' level
train_offers = set(data_pd['offer'].unique())
test_offers = set(score_pd['offer'].unique())
offer_unknowns = list(train_offers - test_offers) + list(test_offers - train_offers)

print(offer_unknowns)

data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'

# sanity check: both frames now share the same offer levels
print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))


['1197502', '1200988', '1199258', '1199256', '1203052', '1194044', '1200579', '1200578', '1220502', '1221666', '1219900', '1221665', '1219903', '1221667', '1203439', '1221658', '1220503', '1221663', '1230218', '1190530', '1213242']
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17

In [15]:
# start a local H2O cluster and import the GLM estimator
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# global random seed, reused by all downstream splits / models for
# reproducibility
SEED = 12345


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_201"; Java(TM) SE Runtime Environment (build 1.8.0_201-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.201-b09, mixed mode)
  Starting server from /home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpt6ervwvl
  JVM stdout: /tmp/tmpt6ervwvl/h2o_patrickh_started_from_python.out
  JVM stderr: /tmp/tmpt6ervwvl/h2o_patrickh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 01 secs
H2O cluster timezone: America/New_York
H2O data parsing timezone: UTC
H2O cluster version: 3.22.1.6
H2O cluster version age: 24 days
H2O cluster name: H2O_from_python_patrickh_020k5w
H2O cluster total nodes: 1
H2O cluster free memory: 3.422 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.6.4 final

In [16]:
# enforce the same measurement levels in h2o as in pandas ('object' -> 'enum')
# NOTE(review): this previously reused the name `col_types`, shadowing the
# pandas dtype dict defined at the top of the notebook — a hidden-state hazard
# on partial re-runs. Use a distinct name; nothing downstream reads the old one.
h2o_col_types = {'chain': 'enum',
                 'offer': 'enum',
                 'market': 'enum',
                 'category': 'enum',
                 'company': 'enum',
                 'brand': 'enum',
                 'exact_item_bought': 'enum'}

data_h2o = h2o.H2OFrame(data_pd, column_types=h2o_col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=h2o_col_types)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [17]:
# expand the offerdate into new features on both frames
# (the identical 4-line sequence was previously copy-pasted for each frame)
def _expand_offerdate(frame):
    """Append month/week (as factors) plus dayOfWeek/day columns derived from 'offerdate'."""
    frame = frame.concat(frame['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
    frame = frame.concat(frame['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
    frame = frame.concat(frame['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
    frame = frame.concat(frame['offerdate'].day().rename({'offerdate': 'day'}))
    return frame

data_h2o = _expand_offerdate(data_h2o)
score_h2o = _expand_offerdate(score_h2o)

In [18]:
# look at the training data: types, summary statistics, and first rows
data_h2o.describe()


Rows:160057
Cols:18


id chain offer market repeater offerdate category quantity company offervalue brand avg_category_quantity avg_category_amount exact_item_bought month week dayOfWeek day
type int enum enum enum int time enum int enum real enum real real enum enum enum enum int
mins 86246.0 0.0 1362096000000.0 1.0 0.75 -1.0 -85.14 1.0
mean 1869324131.2555754 0.271390817021436141365220653082.3323 1.0 1.25532310364433 0.6949999031818261 2.2441312757227134 19.004467158574705
maxs 4809911000.0 1.0 1367280000000.0 1.0 3.0 44.181818182 62.218 31.0
sigma 1570833451.4856217 0.4446786220260453 1303790212.0504372 0.0 0.5246445261465219 0.7609072934172032 2.5989523363669176 9.632905002480289
zeros 0 116619 0 0 0 72549 72591 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 86246.0 205 120825134 1.0 2013-04-24 00:00:002202 1.0 1044600402.0 3718 0.0 0.0 0 4 17 Wed 24.0
1 86252.0 205 unknown34 1.0 2013-03-27 00:00:00unknown 1.0 unknown 0.75 unknown1.0 2.425 1 3 13 Wed 27.0
2 12682470.0 18 unknown11 0.0 2013-03-28 00:00:00unknown 1.0 unknown 0.75 unknown1.0 2.5 1 3 13 Thu 28.0
3 12996040.0 15 unknown9 0.0 2013-03-25 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Mon 25.0
4 13089312.0 15 12048219 0.0 2013-04-01 00:00:005619 1.0 1077172721.5 102504 0.0 0.0 0 4 14 Mon 1.0
5 13179265.0 14 unknown8 0.0 2013-03-29 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Fri 29.0
6 13251776.0 15 12005819 0.0 2013-03-30 00:00:001726 1.0 1044600401.25 7668 1.8 4.563 1 3 13 Sat 30.0
7 13540129.0 14 12005818 0.0 2013-03-30 00:00:001726 1.0 1044600401.25 7668 1.3333333333 4.6533333333 1 3 13 Sat 30.0
8 13807224.0 4 12045761 0.0 2013-04-05 00:00:005616 1.0 1046100401.0 15889 1.4375 3.365 1 4 14 Fri 5.0
9 13873775.0 4 unknown1 0.0 2013-03-26 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Tue 26.0

In [19]:
# look at the test data: types, summary statistics, and first rows
score_h2o.describe()


Rows:151484
Cols:17


id chain offer market offerdate category quantity company offervalue brand avg_category_quantity avg_category_amount exact_item_bought month week dayOfWeek day
type int enum enum enum time enum int enum real enum real real enum enum enum enum int
mins 12262064.0 1367366400000.0 1.0 1.0 -1.0 -18.76 1.0
mean 2367235598.062943 1372045742758.312 1.2141942383354016 1.8039594940719834 0.9798367012155557 3.918267425267764 20.815993768318656
maxs 4853598737.0 1375228800000.0 2.0 5.0 40.0 103.87 31.0
sigma 1629644569.9344237 1768808429.4717584 0.4102635466557285 0.7684280524816689 0.6520652050938551 3.24780201201996 7.076741234531553
zeros 0 0 0 0 27950 27970 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 12262064.0 95 unknown39 2013-06-27 00:00:00unknown 1.0 unknown 1.5 unknown0.0 0.0 0 6 26 Thu 27.0
1 12277270.0 95 unknown39 2013-06-23 00:00:00unknown 2.0 unknown 3.0 unknown1.2 3.882 1 6 25 Sun 23.0
2 12332190.0 95 unknown39 2013-06-15 00:00:00unknown 1.0 unknown 2.0 unknown2.0 6.98 1 6 24 Sat 15.0
3 12524696.0 4 unknown1 2013-06-20 00:00:00unknown 1.0 unknown 1.5 unknown1.0 4.2425 1 6 25 Thu 20.0
4 13074629.0 14 unknown8 2013-06-21 00:00:00unknown 2.0 unknown 3.0 unknown1.0 5.6185714286 1 6 25 Fri 21.0
5 13387341.0 14 unknown8 2013-06-22 00:00:00unknown 2.0 unknown 3.0 unknown1.0 4.49 1 6 25 Sat 22.0
6 13501141.0 4 unknown1 2013-05-13 00:00:00unknown 1.0 unknown 1.5 unknown1.0 6.74 1 5 20 Mon 13.0
7 13558712.0 15 unknown9 2013-05-12 00:00:00unknown 1.0 unknown 1.5 unknown1.0 10.323333333 1 5 19 Sun 12.0
8 13563017.0 14 unknown8 2013-06-24 00:00:00unknown 1.0 unknown 1.5 unknown1.0 5.3066666667 1 6 26 Mon 24.0
9 13584134.0 14 unknown8 2013-06-27 00:00:00unknown 1.0 unknown 1.5 unknown1.0 2.84 1 6 26 Thu 27.0

In [20]:
# quantity is unary in the training data, so drop it along with the other
# unusable variables, then assign the modeling roles (y = target, X = inputs)
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'
excluded = set(drops) | {y}
X = [name for name in data_h2o.columns if name not in excluded]
print(y)
print(X)


repeater
['offer', 'category', 'company', 'offervalue', 'brand', 'avg_category_quantity', 'avg_category_amount', 'exact_item_bought', 'month', 'week', 'dayOfWeek', 'day']

In [21]:
# create reproducible 40/30/30 train/validation/test partitions
train, valid, test = data_h2o.split_frame(ratios=[0.4, 0.3], seed=SEED)

In [22]:
# check the training partition shape (~40% of rows)
train.shape


Out[22]:
(64122, 18)

In [23]:
# check the validation partition shape (~30% of rows)
valid.shape


Out[23]:
(47980, 18)

In [24]:
# check the test partition shape (~30% of rows)
test.shape


Out[24]:
(47955, 18)

In [25]:
# elastic net regularized regression 
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection

# initialize
rptr_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='rptr_glm1',
                                         solver='IRLSM',
                                         nfolds=3,
                                         standardize=True,
                                         seed=SEED,
                                         lambda_search=True)

# train 
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)


glm Model Build progress: |███████████████████████████████████████████████| 100%

In [26]:
# check stability across the 3 CV folds — metrics agree closely, looks good
rptr_glm.cross_validation_metrics_summary().as_data_frame()


Out[26]:
mean sd cv_1_valid cv_2_valid cv_3_valid
0 accuracy 0.5633181 0.0034985114 0.5576324 0.5626291 0.56969273
1 auc 0.66687614 0.0013872668 0.6641128 0.6684734 0.6680421
2 err 0.43668193 0.0034985114 0.4423676 0.4373709 0.4303073
3 err_count 9334.0 99.13627 9514.0 9316.0 9172.0
4 f0point5 0.39562488 8.234713E-4 0.39596537 0.3968501 0.39405915
5 f1 0.48101103 0.0012617736 0.48254105 0.481984 0.47850809
6 f2 0.613405 0.0024687524 0.61756927 0.6136203 0.60902536
7 lift_top_group 2.0142968 0.050682575 2.027856 2.0945127 1.920521
8 logloss 0.5482143 0.0020346912 0.5511301 0.5492147 0.54429805
9 max_per_class_error 0.50603294 0.0070085404 0.5175562 0.5071829 0.4933597
10 mcc 0.21990056 0.0016726145 0.21801764 0.21844749 0.22323658
11 mean_per_class_accuracy 0.6226167 0.0014614518 0.6208214 0.62151676 0.6255119
12 mean_per_class_error 0.37738332 0.0014614518 0.37917858 0.37848327 0.37448812
13 mse 0.18361354 9.3339913E-4 0.18492535 0.18410787 0.18180738
14 null_deviance 24907.797 143.848 25161.13 24899.21 24663.049
15 precision 0.3537607 7.134559E-4 0.35366338 0.3550422 0.35257646
16 r2 0.0670119 9.0821454E-4 0.06541767 0.068562925 0.067055106
17 recall 0.7512663 0.004308973 0.759199 0.75021636 0.7443835
18 residual_deviance 23435.428 146.4661 23706.31 23396.547 23203.426
19 rmse 0.42849872 0.0010901511 0.43002948 0.42907792 0.42638877
20 specificity 0.49396706 0.0070085404 0.4824438 0.4928171 0.50664026

In [27]:
# AUC on the training partition
rptr_glm.auc(valid=False)


Out[27]:
0.6682898515349129

In [28]:
# AUC on the validation partition
rptr_glm.auc(valid=True)


Out[28]:
0.6642797159900581

In [29]:
# AUC on the held-out test partition
rptr_glm.model_performance(test).auc()


Out[29]:
0.66930388703468

In [30]:
# full validation metrics: confusion matrix, per-threshold maxima,
# and gains/lift table
rptr_glm.model_performance(valid)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.18433338418488365
RMSE: 0.4293406388695154
LogLoss: 0.5500303205580226
Null degrees of freedom: 47979
Residual degrees of freedom: 47926
Null deviance: 56034.26930760575
Residual deviance: 52780.909560747896
AIC: 52888.909560747896
AUC: 0.6642797159900581
pr_auc: 0.4082413348072177
Gini: 0.32855943198011617
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.23680137514237834: 
0 1 Error Rate
0 17240.0 17753.0 0.5073 (17753.0/34993.0)
1 3305.0 9682.0 0.2545 (3305.0/12987.0)
Total 20545.0 27435.0 0.4389 (21058.0/47980.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.2368014 0.4790461 274.0
max f2 0.1223229 0.6605821 358.0
max f0point5 0.3284445 0.4251892 202.0
max accuracy 0.4805090 0.7320342 76.0
max precision 0.7387865 0.6923077 8.0
max recall 0.0559133 1.0 399.0
max specificity 0.9378370 0.9999143 0.0
max absolute_mcc 0.2928417 0.2189525 229.0
max min_per_class_accuracy 0.2913873 0.6202098 231.0
max mean_per_class_accuracy 0.2928417 0.6222223 229.0
Gains/Lift Table: Avg response rate: 27.07 %, avg score: 26.98 %

group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain
1 0.0100459 0.5366650 2.0005291 2.0005291 0.5414938 0.5850936 0.5414938 0.5850936 0.0200970 0.0200970 100.0529096 100.0529096
2 0.0200917 0.5128554 1.9698696 1.9851994 0.5331950 0.5228512 0.5373444 0.5539724 0.0197890 0.0398860 96.9869646 98.5199371
3 0.0300125 0.4961243 1.9093237 1.9601182 0.5168067 0.5044425 0.5305556 0.5376000 0.0189420 0.0588281 90.9323674 96.0118238
4 0.0400167 0.4826037 1.9472902 1.9569112 0.5270833 0.4896555 0.5296875 0.5256139 0.0194810 0.0783091 94.7290239 95.6911238
5 0.05 0.4676137 1.7353953 1.9126819 0.4697286 0.4742862 0.5177157 0.5153654 0.0173250 0.0956341 73.5395264 91.2681913
6 0.1 0.4247933 1.7386617 1.8256718 0.4706128 0.4440226 0.4941642 0.4796940 0.0869331 0.1825672 73.8661739 82.5671826
7 0.15 0.3940127 1.4676215 1.7063217 0.3972489 0.4091581 0.4618591 0.4561820 0.0733811 0.2559483 46.7621468 70.6321706
8 0.2 0.3653559 1.3043813 1.6058366 0.3530638 0.3791261 0.4346603 0.4369180 0.0652191 0.3211673 30.4381304 60.5836606
9 0.3152980 0.3233892 1.3456877 1.5107055 0.3642444 0.3402462 0.4089106 0.4015671 0.1551552 0.4763225 34.5687698 51.0705474
10 0.4043560 0.3039213 1.1542497 1.4321974 0.3124269 0.3130243 0.3876604 0.3820659 0.1027951 0.5791176 15.4249715 43.2197384
11 0.5002293 0.2727063 0.9581511 1.3413423 0.2593478 0.2891334 0.3630682 0.3642546 0.0918611 0.6709787 -4.1848872 34.1342304
12 0.6016048 0.2177024 0.9608340 1.2772234 0.2600740 0.2480176 0.3457128 0.3446676 0.0974051 0.7683838 -3.9166001 27.7223392
13 0.7013964 0.1838723 0.6767011 1.1917837 0.1831662 0.1940466 0.3225864 0.3232379 0.0675291 0.8359128 -32.3298943 19.1783730
14 0.8050855 0.1746898 0.6928512 1.1275249 0.1875377 0.1783366 0.3051931 0.3045757 0.0718411 0.9077539 -30.7148819 12.7524917
15 0.9000208 0.1270697 0.6277750 1.0748106 0.1699232 0.1564416 0.2909247 0.2889503 0.0595981 0.9673520 -37.2225050 7.4810629
16 1.0 0.0519661 0.3265484 1.0 0.0883886 0.0972660 0.2706753 0.2697859 0.0326480 1.0 -67.3451614 0.0

Out[30]:


In [31]:
# print coefficients
# FIX: coef() was previously called once per key inside the loop; fetch the
# coefficient dict once and iterate its items.
for name, value in rptr_glm.coef().items():
    print(name, value)


Intercept 0.9142362343476567
offer.1198271 0.03463083385893831
offer.1198272 0.14052728909397258
offer.1198273 -0.028110797822582217
offer.1198274 -0.13117283777912048
offer.1198275 -0.07475893905600814
offer.1200581 -0.2663110424214166
offer.1200582 0.0
offer.1200584 0.0
offer.1204576 -0.13863085089106314
offer.1204821 -0.16653161796482882
offer.1204822 0.0
offer.1208251 0.0
offer.1208252 0.90290635746174
offer.1208329 0.03988082619178288
offer.1208501 0.4323275425518459
offer.1208503 0.0
offer.unknown -0.24095314385202235
week.9 0.3882866662473721
week.10 0.34503199800881224
week.11 -0.3374664786715979
week.12 0.0
week.13 -0.3946215550940864
week.14 -0.22160719365547848
week.15 0.0
week.16 -0.09617188647612415
week.17 0.053510178998351796
week.18 0.15565259807093573
category.1726 -0.16996755533119035
category.2119 0.03988082619178368
category.2202 0.6589080081306594
category.3504 0.0
category.5558 -0.04144651740120281
category.5616 -0.13863085089106691
category.5619 -0.09783190861526495
category.6202 0.12060177946446535
category.unknown -0.2409531438519952
brand.102504 -0.09783190861526239
brand.15889 -0.13863085089106675
brand.3718 0.6589080081306663
brand.5072 -0.04144651740120234
brand.64486 0.12060177946446629
brand.6926 0.03988082619178345
brand.7668 -0.9124249815547996
brand.unknown 0.0
company.104460040 0.0
company.104610040 -0.1386308508910667
company.107120272 -0.041446517401202365
company.107717272 -0.09783190861526334
company.108079383 0.03988082619178309
company.1087744888 0.1206017794644651
company.unknown 0.0
dayOfWeek.Mon -0.06788130486673358
dayOfWeek.Tue -0.001410333705654479
dayOfWeek.Wed 0.007728177083505018
dayOfWeek.Thu 0.0
dayOfWeek.Fri 0.0
dayOfWeek.Sat 0.14530381687259547
dayOfWeek.Sun 0.1032571462037334
exact_item_bought.0 -0.1507968420390149
exact_item_bought.1 0.1263809799828914
month.3 0.19741069489701107
month.4 -0.1549780733058436
offervalue -0.9912946388035481
avg_category_quantity -0.03619275098428959
avg_category_amount 0.08680975561720691
day -0.03191424771152614

In [32]:
# locate id == 13584134 in the scoring frame — it is the 10th row shown;
# cast to factor so the ids display as labels rather than floats
score_h2o['id'].asfactor().head()


id
12262064
12277270
12332190
12524696
13074629
13387341
13501141
13558712
13563017
13584134
Out[32]:


In [33]:
# score the test frame; the 10th row of the output corresponds to
# id == 13584134 (p1 is the repeat-purchase probability)
rptr_glm.predict(score_h2o)


glm prediction progress: |████████████████████████████████████████████████| 100%
/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'week' has levels not trained on: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
  warnings.warn(w)
/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'month' has levels not trained on: [5, 6, 7]
  warnings.warn(w)
predict p0 p1
00.9319280.0680718
00.9644970.0355034
00.8548730.145127
00.8561920.143808
00.9602050.0397945
00.9596120.0403879
00.8040330.195967
10.71045 0.28955
00.8684330.131567
00.8937060.106294
Out[33]:


In [34]:
# import target encoder
from h2o.targetencoder import TargetEncoder

In [35]:
# train a target encoder for the high-cardinality columns dropped from X
e_columns = ['market', 'chain']
te_ = TargetEncoder(x=e_columns, y=y)
# the encoder requires a categorical target
train[y] = train[y].asfactor()
_ = te_.fit(train)

In [36]:
# leave-one-out target encoding on train, valid, test
# (the asfactor + transform pair was previously triplicated; the literal
# seed 12345 is replaced by the global SEED, which has the same value)
def _loo_encode(frame):
    """Ensure the target column is categorical, then apply leave-one-out target encoding."""
    frame[y] = frame[y].asfactor()  # no-op if already a factor
    return te_.transform(frame=frame, holdout_type='loo', seed=SEED)

e_train = _loo_encode(train)
e_valid = _loo_encode(valid)
e_test = _loo_encode(test)

In [37]:
# check the encoded training frame — note the new market_te/chain_te columns
e_train.head(rows=2)


chain market idoffer repeaterofferdate category quantitycompany offervaluebrand avg_category_quantity avg_category_amount exact_item_bought month weekdayOfWeek day market_te chain_te
10 52.55092e+08unknown 02013-03-14 00:00:00unknown 1unknown 2unknown 1 9.99 1 3 11Thu 14 0.162703 0.160728
10 52.66075e+081204576 12013-04-04 00:00:005616 1104610040 115889 1 2.79 1 4 14Thu 4 0.125308 0.124052
Out[37]:


In [38]:
# check the encoded validation frame — note the new market_te/chain_te columns
e_valid.head(rows=2)


chain market idoffer repeaterofferdate category quantitycompany offervaluebrand avg_category_quantity avg_category_amount exact_item_bought month weekdayOfWeek day market_te chain_te
10 52.54652e+081198275 02013-03-27 00:00:005558 1107120272 1.5 5072 1.4 1.8 1 3 13Wed 27 0.158439 0.160728
10 52.54734e+08unknown 02013-04-24 00:00:00unknown 1unknown 0.75unknown 0 0 0 4 17Wed 24 0.150508 0.165719
Out[38]:


In [39]:
# check the encoded test frame — note the new market_te/chain_te columns
e_test.head(rows=2)


chain market idoffer repeaterofferdate category quantitycompany offervaluebrand avg_category_quantity avg_category_amount exact_item_bought month weekdayOfWeek day market_te chain_te
10 52.58623e+081198271 02013-03-25 00:00:005558 1107120272 1.5 5072 3 3.66667 1 3 13Mon 25 0.163404 0.160728
10 52.58693e+08unknown 12013-03-29 00:00:00unknown 1unknown 0.75unknown 0 0 0 3 13Fri 29 0.121452 0.124052
Out[39]:


In [40]:
# show the first encoded-test ids as labels (asfactor avoids float display)
e_test['id'].asfactor().head(rows=2)


id
258623302
258692579
Out[40]:


In [41]:
# add the target-encoded features to the model inputs
# FIX: guard against duplicates so re-running this cell does not append
# market_te/chain_te to X a second time (notebook re-run safety)
X = X + [col for col in ('market_te', 'chain_te') if col not in X]
print(X)


['offer', 'category', 'company', 'offervalue', 'brand', 'avg_category_quantity', 'avg_category_amount', 'exact_item_bought', 'month', 'week', 'dayOfWeek', 'day', 'market_te', 'chain_te']

In [42]:
# import GBM and grid search
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.grid.grid_search import H2OGridSearch

In [43]:
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
# (Python 3 division is true division, so the old s/float(10) wrapper is
# unnecessary; s / 10 yields the same 0.1..1.0 float grid)
hyper_parameters = {'ntrees': list(range(50, 500, 50)),
                    'max_depth': list(range(2, 20, 2)),
                    'sample_rate': [s / 10 for s in range(1, 11)],
                    'col_sample_rate': [s / 10 for s in range(1, 11)]}

# define search strategy: sample at most 50 random combinations or stop
# after 20 minutes; the seed makes the sampled grid reproducible
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1200, 
                   'seed': 12345}

# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid, 
              seed=12345)

# view detailed results at http://localhost:54321/flow/index.html


gbm Grid Build progress: |████████████████████████████████████████████████| 100%

In [44]:
# show grid search results
gsearch.show()

# select best model: the grid listing above is sorted by validation
# logloss (ascending), so index 0 is the lowest-logloss model
gbm_model = gsearch.get_grid()[0]


     col_sample_rate max_depth ntrees sample_rate  \
0                0.9         6    250         1.0   
1                0.4         6    150         0.9   
2                0.4         4    300         0.9   
3                1.0         4    300         0.5   
4                0.4         6    100         1.0   
5                1.0         2    400         0.9   
6                0.1        10    350         0.8   
7                1.0         4    250         0.3   
8                0.1         6    150         0.9   
9                0.1        10    350         0.6   
10               0.2         8    150         0.5   
11               0.1        10    250         0.7   
12               0.6         4     50         0.6   
13               0.8         8    100         0.6   
14               1.0         4     50         0.3   
15               0.8         4     50         0.3   
16               0.1        14    350         0.5   
17               1.0         8     50         0.3   
18               0.2         8    300         0.4   
19               0.1        12    100         0.1   
20               0.4        10    350         0.7   
21               0.7        12    150         1.0   
22               0.9         6    350         0.4   
23               1.0        12     50         1.0   
24               0.2         6    350         0.1   
25               0.6         6    300         0.2   
26               0.4        14    250         1.0   
27               0.4        14    200         1.0   
28               0.9         8    150         0.2   
29               1.0        12    300         0.7   
30               0.9         8    250         0.3   
31               0.3        16     50         0.4   
32               1.0        14     50         0.3   
33               0.9        14    400         0.7   
34               0.2        16    250         0.5   
35               0.6        16    400         0.9   
36               0.8        16     50         0.9   
37               0.5         8    400         0.2   
38               0.9        16    400         0.7   
39               0.7        18     50         0.6   
40               0.9        12    150         0.4   
41               0.7         8    350         0.1   
42               0.5        12    200         0.3   
43               0.9        16    100         0.1   
44               0.8        18    250         0.6   
45               0.8        10    450         0.3   
46               0.6        12    350         0.1   
47               0.7        18    200         0.1   
48               0.5        18    400         0.1   
49               0.9        16    400         0.2   

                                                         model_ids  \
0   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_11   
1   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_32   
2   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_10   
3   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_27   
4   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_43   
5    Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_1   
6    Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_2   
7   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_33   
8   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_29   
9   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_31   
10  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_16   
11  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_12   
12   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_3   
13  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_50   
14  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_28   
15  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_14   
16  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_25   
17  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_22   
18  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_39   
19  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_42   
20   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_8   
21  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_47   
22  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_34   
23  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_41   
24  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_15   
25  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_30   
26  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_24   
27   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_4   
28  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_45   
29  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_21   
30  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_38   
31   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_5   
32  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_35   
33   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_7   
34  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_36   
35  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_20   
36  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_23   
37  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_13   
38  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_26   
39  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_17   
40  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_49   
41  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_19   
42  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_48   
43  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_40   
44  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_37   
45  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_46   
46   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_6   
47  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_44   
48  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_18   
49   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_9   

               logloss  
0   0.5348834983989057  
1   0.5350291550847558  
2   0.5350894789165441  
3    0.535248031739906  
4   0.5354232866015268  
5    0.536646342786623  
6    0.537452131939599  
7   0.5376479557679856  
8   0.5378304173039002  
9   0.5378854566021242  
10  0.5380053569351866  
11  0.5380773747073453  
12  0.5386174767148093  
13  0.5388992002840919  
14  0.5391120670305152  
15  0.5391530686859511  
16  0.5393710636142622  
17   0.540390311161104  
18  0.5416671715429446  
19  0.5419625334634491  
20  0.5426712431838964  
21  0.5432625381356367  
22  0.5432646516697345  
23  0.5451345163682967  
24  0.5458733104268865  
25   0.545909460632734  
26  0.5498716819062104  
27  0.5498716819062104  
28  0.5499028994766184  
29  0.5502053659777679  
30  0.5558261784124103  
31   0.557359195507266  
32  0.5581103738596365  
33  0.5584909832655498  
34  0.5588497425482545  
35  0.5617982925457293  
36  0.5625896941243087  
37  0.5654485751617531  
38  0.5713129537881524  
39  0.5713801530621511  
40   0.574567832157054  
41  0.5748519555317287  
42  0.5825993745923173  
43  0.5871166938602922  
44  0.5994404448481553  
45  0.6146780634348513  
46  0.6200616332955663  
47  0.6294138163966044  
48  0.6860690534310121  
49  0.7170542574246234  

In [45]:
# train AUC (valid=False selects metrics computed on the training frame)
gbm_model.auc(valid=False)


Out[45]:
0.7372164154988929

In [46]:
# valid AUC (valid=True selects metrics computed on the validation frame)
gbm_model.auc(valid=True)


Out[46]:
0.6955609131044218

In [47]:
# test AUC: score the held-out encoded test frame explicitly
gbm_model.model_performance(e_test).auc()


Out[47]:
0.7033280640065208

In [48]:
# examine variable importance
# (inline backend so the plot renders in the notebook output)
%matplotlib inline
gbm_model.varimp_plot()



In [49]:
# collect just the variable names (first field of each varimp row),
# ordered from most to least important, then preview the top ten
important_vars = [name for name, *_ in gbm_model.varimp()]
important_vars[:10]


Out[49]:
['chain_te',
 'week',
 'avg_category_amount',
 'avg_category_quantity',
 'market_te',
 'offer',
 'offervalue',
 'dayOfWeek',
 'brand',
 'day']

In [50]:
# generate partial dependence plot for the most important variable;
# the result is assigned to _ to suppress the returned table's display
_ = gbm_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)


PartialDependencePlot progress: |█████████████████████████████████████████| 100%

In [51]:
# update X to contain only the ten most important variables
# (reduces the input set for the neural network trained next)
X = important_vars[:10]
print(X)


['chain_te', 'week', 'avg_category_amount', 'avg_category_quantity', 'market_te', 'offer', 'offervalue', 'dayOfWeek', 'brand', 'day']

In [52]:
# import the multilayer-perceptron estimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# NN with random hyperparameter search:
# train many candidate networks with randomly sampled hyperparameters
# and keep the one with the lowest validation error

# candidate architectures and regularization grids, named for readability
hidden_layouts = [[170, 320], [80, 190], [320, 160, 80], [100], [50, 50, 50, 50]]
l1_penalties = [step/1e4 for step in range(0, 1000, 100)]
l2_penalties = [step/1e5 for step in range(0, 1000, 100)]
dropout_ratios = [step/1e2 for step in range(0, 20, 2)]

hyper_parameters = {'hidden': hidden_layouts,
                    'l1': l1_penalties,
                    'l2': l2_penalties,
                    'input_dropout_ratio': dropout_ratios}

# search strategy: at most 50 random models or 30 minutes, reproducible seed
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1800, 
                   'seed': 12345}

# initialize grid search
gsearch = H2OGridSearch(H2ODeepLearningEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid, 
              seed=12345)

# view detailed results at http://localhost:54321/flow/index.html


deeplearning Grid Build progress: |███████████████████████████████████████| 100%

In [53]:
# show grid search results
gsearch.show()

# select best model: the grid listing above is sorted by validation
# logloss (ascending), so index 0 is the lowest-logloss model
mlp_model = gsearch.get_grid()[0]


                hidden input_dropout_ratio    l1     l2  \
0     [50, 50, 50, 50]                 0.1   0.0  0.002   
1           [170, 320]                0.16   0.0  0.008   
2                [100]                0.14  0.01  0.006   
3                [100]                0.14  0.01  0.001   
4                [100]                0.06  0.02  0.006   
5                [100]                0.06  0.05  0.008   
6                [100]                 0.0  0.08  0.003   
7                [100]                0.18  0.03  0.009   
8                [100]                0.14  0.04  0.009   
9                [100]                0.04  0.07  0.003   
10               [100]                0.16  0.07  0.002   
11               [100]                0.04  0.07  0.009   
12               [100]                0.02  0.05  0.008   
13               [100]                0.16  0.06  0.006   
14               [100]                0.12  0.06  0.001   
15               [100]                0.14  0.08    0.0   
16               [100]                0.12  0.09  0.009   
17    [50, 50, 50, 50]                0.04  0.09  0.004   
18    [50, 50, 50, 50]                0.08  0.09  0.001   
19           [80, 190]                0.02  0.09    0.0   
20    [50, 50, 50, 50]                 0.1  0.08  0.009   
21          [170, 320]                 0.1  0.08  0.007   
22          [170, 320]                0.14  0.08  0.006   
23          [170, 320]                0.06  0.07  0.005   
24           [80, 190]                0.02  0.07  0.004   
25          [170, 320]                0.08  0.07    0.0   
26          [170, 320]                0.12  0.06  0.004   
27      [320, 160, 80]                0.14  0.06  0.002   
28    [50, 50, 50, 50]                0.02  0.06  0.002   
29          [170, 320]                0.18  0.06    0.0   
30           [80, 190]                0.02  0.05  0.009   
31      [320, 160, 80]                0.18  0.05  0.002   
32           [80, 190]                 0.1  0.05    0.0   
33    [50, 50, 50, 50]                0.02  0.04  0.008   
34      [320, 160, 80]                0.18  0.04  0.008   
35      [320, 160, 80]                0.06  0.04  0.005   
36           [80, 190]                 0.0  0.04  0.004   
37           [80, 190]                0.12  0.04  0.003   
38    [50, 50, 50, 50]                0.06  0.04  0.002   
39          [170, 320]                0.12  0.04  0.001   
40          [170, 320]                0.16  0.03  0.008   
41    [50, 50, 50, 50]                0.16  0.03  0.006   
42      [320, 160, 80]                0.14  0.02  0.005   
43          [170, 320]                0.06  0.02  0.005   
44           [80, 190]                 0.0  0.02  0.005   
45      [320, 160, 80]                 0.0  0.01  0.003   
46    [50, 50, 50, 50]                0.16  0.01  0.002   
47    [50, 50, 50, 50]                0.02  0.01  0.002   
48    [50, 50, 50, 50]                0.08  0.01    0.0   
49           [80, 190]                0.18  0.01    0.0   

                                                                model_ids  \
0   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
1   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
2   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
3   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
4   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
5   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
6   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
7   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
8   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
9   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
10  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
11  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
12  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
13  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
14  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
15  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
16  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
17  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
18  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
19  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
20  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
21  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
22  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
23  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
24  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
25  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
26  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
27  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
28  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
29  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
30  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
31  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
32  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
33  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
34  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
35  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
36  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
37  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
38  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
39  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
40  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
41  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
42  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
43  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
44  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
45  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
46  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
47  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
48  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
49  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   

               logloss  
0   0.5441277425619976  
1   0.5468463763679692  
2   0.5506008410750938  
3   0.5506723146192487  
4   0.5606036290296568  
5   0.5839340260871398  
6   0.5839911584694784  
7   0.5844394972593425  
8   0.5846837953814987  
9   0.5849682481160456  
10  0.5850026388614178  
11  0.5851131278900356  
12  0.5857919095875189  
13  0.5858466411918215  
14  0.5858754362158225  
15  0.5906781682601115  
16  0.5940594978259867  
17  0.6431177451391431  
18  0.6450529608012587  
19  0.6457162013454759  
20  0.6486068335324051  
21  0.6499785663239965  
22  0.6506787737522219  
23  0.6611527490256719  
24  0.6619788531748063  
25  0.6654116986223156  
26  0.6728097497599246  
27  0.6747520073743039  
28  0.6747520073743055  
29  0.6767584866583781  
30  0.6794279016802731  
31  0.6869352546368972  
32  0.6892490551997769  
33  0.6928809978038432  
34  0.6928809978038432  
35   0.696514948013381  
36  0.6977683709229899  
37  0.6990455169581326  
38  0.7003460472005603  
39  0.7016671407154101  
40  0.7065238616979619  
41  0.7092814797747413  
42  0.7263483169261058  
43  0.7263483169261058  
44  0.7263483169261058  
45  0.7475621179617055  
46  0.7495904295327387  
47  0.7495904295327387  
48  0.7537832750142842  
49  0.7537832750142842  

In [54]:
# print train, valid, test AUC
print(mlp_model.auc(valid=False))  # training metrics
print(mlp_model.auc(valid=True))   # validation metrics
print(mlp_model.model_performance(e_test).auc())  # scored on held-out test frame


0.6898022376723639
0.6827321992354999
0.6906367158970282

In [55]:
# plot (not print) partial dependence of the NN on the most important
# variable; assigned to _ to suppress the returned table's display
_ = mlp_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)


PartialDependencePlot progress: |█████████████████████████████████████████| 100%