notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [2]:

    
# set measurement levels
col_types = {'chain': 'object',
             'offer': 'object',
             'market': 'object',
             'category': 'object',
             'company': 'object',
             'brand': 'object',
             'exact_item_bought': 'object'}



In [3]:

    
# read data created from SAS key
data_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_test_sas.csv', dtype=col_types)



In [4]:

    
# confirm shape
data_pd.shape









    Out[4]:





(160057, 14)



In [5]:

    
# confirm shape
score_pd.shape









    Out[5]:





(151484, 13)



In [6]:

    
# data['chain'].unique().shape    # 130 levels
# data['market'].unique().shape   # 34 levels
# data['category'].unique().shape # 13 levels
# data['brand'].unique().shape    # 12 levels
# data['company'].unique().shape  # 11 levels



In [7]:

    
# show column names
data_pd.columns









    Out[7]:





Index(['id', 'chain', 'offer', 'market', 'repeater', 'offerdate', 'category',
       'quantity', 'company', 'offervalue', 'brand', 'avg_category_quantity',
       'avg_category_amount', 'exact_item_bought'],
      dtype='object')



In [8]:

    
# ensure repeater==t is modeled as 1
data_pd.loc[data_pd['repeater'] == 't', 'repeater'] = 1
data_pd.loc[data_pd['repeater'] == 'f', 'repeater'] = 0



In [9]:

    
# fix unknown values in train and test
chain_unknowns =\
list(set(data_pd['chain'].unique()) - set(score_pd['chain'].unique())) +\
list(set(score_pd['chain'].unique()) - set(data_pd['chain'].unique()))
    
print(chain_unknowns)     

data_pd.loc[data_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
score_pd.loc[score_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
     
print(sorted(data_pd['chain'].unique()))
print(len(data_pd['chain'].unique()))
print(sorted(score_pd['chain'].unique()))
print(len(score_pd['chain'].unique()))









    



['230', '369', '9', '284', '421', '370', '387']
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128



In [10]:

    
# fix unknown values in train and test
market_unknowns =\
list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))
    
print(market_unknowns)     
     
print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))









    



[]
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34



In [11]:

    
# fix unknown values in train and test
category_unknowns =\
list(set(data_pd['category'].unique()) - set(score_pd['category'].unique())) +\
list(set(score_pd['category'].unique()) - set(data_pd['category'].unique()))
    
print(category_unknowns)     

data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
     
print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))









    



['3509', '1703', '3203', '9909', '4401', '5122', '7205', '706', '5824', '4517', '799', '9115']
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9



In [12]:

    
# fix unknown values in train and test
brand_unknowns =\
list(set(data_pd['brand'].unique()) - set(score_pd['brand'].unique())) +\
list(set(score_pd['brand'].unique()) - set(data_pd['brand'].unique()))
    
print(brand_unknowns)     

data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
     
print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))









    



['6732', '13474', '13791', '875', '28840', '1322', '26456', '17286', '4294', '17311', '26189', '93904']
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8



In [13]:

    
# fix unknown values in train and test
company_unknowns =\
list(set(data_pd['company'].unique()) - set(score_pd['company'].unique())) +\
list(set(score_pd['company'].unique()) - set(data_pd['company'].unique()))
    
print(company_unknowns)     
     
data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'    
    
print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))









    



['107127979', '1089520383', '105100050', '103320030', '106414464', '108500080', '107106878', '105190050', '105450050', '1076211171', '103700030', '104127141']
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7



In [14]:

    
# fix unknown values in train and test
offer_unknowns =\
list(set(data_pd['offer'].unique()) - set(score_pd['offer'].unique())) +\
list(set(score_pd['offer'].unique()) - set(data_pd['offer'].unique()))
    
print(offer_unknowns)     

data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'    
    
print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))









    



['1197502', '1200988', '1199258', '1199256', '1203052', '1194044', '1200579', '1200578', '1220502', '1221666', '1219900', '1221665', '1219903', '1221667', '1203439', '1221658', '1220503', '1221663', '1230218', '1190530', '1213242']
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17



In [15]:

    
# start and import h2o
# set seed
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

SEED = 12345









    



Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_201"; Java(TM) SE Runtime Environment (build 1.8.0_201-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.201-b09, mixed mode)
  Starting server from /home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpt6ervwvl
  JVM stdout: /tmp/tmpt6ervwvl/h2o_patrickh_started_from_python.out
  JVM stderr: /tmp/tmpt6ervwvl/h2o_patrickh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.






    




H2O cluster uptime:
01 secs
H2O cluster timezone:
America/New_York
H2O data parsing timezone:
UTC
H2O cluster version:
3.22.1.6
H2O cluster version age:
24 days 
H2O cluster name:
H2O_from_python_patrickh_020k5w
H2O cluster total nodes:
1
H2O cluster free memory:
3.422 Gb
H2O cluster total cores:
8
H2O cluster allowed cores:
8
H2O cluster status:
accepting new members, healthy
H2O connection url:
http://127.0.0.1:54321
H2O connection proxy:
None
H2O internal security:
False
H2O API Extensions:
Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version:
3.6.4 final



In [16]:

    
# enforce same measurement levels in h2o and pandas
col_types = {'chain': 'enum',
             'offer': 'enum',
             'market': 'enum',
             'category': 'enum',
             'company': 'enum',
             'brand': 'enum',
             'exact_item_bought': 'enum'}

data_h2o = h2o.H2OFrame(data_pd, column_types=col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=col_types)









    



Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%



In [17]:

    
# expand date into new features
data_h2o = data_h2o.concat(data_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
data_h2o = data_h2o.concat(data_h2o['offerdate'].day().rename({'offerdate': 'day'}))

score_h2o = score_h2o.concat(score_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].day().rename({'offerdate': 'day'}))



In [18]:

    
# look at training data
data_h2o.describe()









    



Rows:160057
Cols:18








    






       id                chain  offer  market  repeater           offerdate          category  quantity  company  offervalue        brand  avg_category_quantity  avg_category_amount  exact_item_bought  month  week  dayOfWeek  day               


type   int               enum   enum   enum    int                time               enum      int       enum     real              enum   real                   real                 enum               enum   enum  enum       int               
mins   86246.0                                 0.0                1362096000000.0              1.0                0.75                     -1.0                   -85.14                                                          1.0               
mean   1869324131.2555754                       0.27139081702143614 1365220653082.3323           1.0                1.25532310364433         0.6949999031818261     2.2441312757227134                                              19.004467158574705
maxs   4809911000.0                            1.0                1367280000000.0              1.0                3.0                      44.181818182           62.218                                                          31.0              
sigma  1570833451.4856217                       0.4446786220260453 1303790212.0504372           0.0                0.5246445261465219        0.7609072934172032     2.5989523363669176                                              9.632905002480289 
zeros  0                                       116619             0                            0                  0                        72549                  72591                                                           0                 
missing 0                 0      0      0       0                  0                  0         0         0        0                 0      0                      0                    0                  0      0     0          0                 
0      86246.0           205    1208251 34      1.0                2013-04-24 00:00:00 2202      1.0       104460040 2.0               3718   0.0                    0.0                  0                  4      17    Wed        24.0              
1      86252.0           205    unknown 34      1.0                2013-03-27 00:00:00 unknown   1.0       unknown  0.75              unknown 1.0                    2.425                1                  3      13    Wed        27.0              
2      12682470.0        18     unknown 11      0.0                2013-03-28 00:00:00 unknown   1.0       unknown  0.75              unknown 1.0                    2.5                  1                  3      13    Thu        28.0              
3      12996040.0        15     unknown 9       0.0                2013-03-25 00:00:00 unknown   1.0       unknown  0.75              unknown 0.0                    0.0                  0                  3      13    Mon        25.0              
4      13089312.0        15     1204821 9       0.0                2013-04-01 00:00:00 5619      1.0       107717272 1.5               102504 0.0                    0.0                  0                  4      14    Mon        1.0               
5      13179265.0        14     unknown 8       0.0                2013-03-29 00:00:00 unknown   1.0       unknown  0.75              unknown 0.0                    0.0                  0                  3      13    Fri        29.0              
6      13251776.0        15     1200581 9       0.0                2013-03-30 00:00:00 1726      1.0       104460040 1.25              7668   1.8                    4.563                1                  3      13    Sat        30.0              
7      13540129.0        14     1200581 8       0.0                2013-03-30 00:00:00 1726      1.0       104460040 1.25              7668   1.3333333333           4.6533333333         1                  3      13    Sat        30.0              
8      13807224.0        4      1204576 1       0.0                2013-04-05 00:00:00 5616      1.0       104610040 1.0               15889  1.4375                 3.365                1                  4      14    Fri        5.0               
9      13873775.0        4      unknown 1       0.0                2013-03-26 00:00:00 unknown   1.0       unknown  0.75              unknown 0.0                    0.0                  0                  3      13    Tue        26.0



In [19]:

    
# look at test data
score_h2o.describe()









    



Rows:151484
Cols:17








    






       id                chain  offer  market  offerdate          category  quantity          company  offervalue        brand  avg_category_quantity  avg_category_amount  exact_item_bought  month  week  dayOfWeek  day               


type   int               enum   enum   enum    time               enum      int               enum     real              enum   real                   real                 enum               enum   enum  enum       int               
mins   12262064.0                              1367366400000.0              1.0                        1.0                      -1.0                   -18.76                                                          1.0               
mean   2367235598.062943                       1372045742758.312            1.2141942383354016          1.8039594940719834        0.9798367012155557     3.918267425267764                                               20.815993768318656
maxs   4853598737.0                            1375228800000.0              2.0                        5.0                      40.0                   103.87                                                          31.0              
sigma  1629644569.9344237                       1768808429.4717584           0.4102635466557285          0.7684280524816689        0.6520652050938551     3.24780201201996                                                7.076741234531553 
zeros  0                                       0                            0                          0                        27950                  27970                                                           0                 
missing 0                 0      0      0       0                  0         0                 0        0                 0      0                      0                    0                  0      0     0          0                 
0      12262064.0        95     unknown 39      2013-06-27 00:00:00 unknown   1.0               unknown  1.5               unknown 0.0                    0.0                  0                  6      26    Thu        27.0              
1      12277270.0        95     unknown 39      2013-06-23 00:00:00 unknown   2.0               unknown  3.0               unknown 1.2                    3.882                1                  6      25    Sun        23.0              
2      12332190.0        95     unknown 39      2013-06-15 00:00:00 unknown   1.0               unknown  2.0               unknown 2.0                    6.98                 1                  6      24    Sat        15.0              
3      12524696.0        4      unknown 1       2013-06-20 00:00:00 unknown   1.0               unknown  1.5               unknown 1.0                    4.2425               1                  6      25    Thu        20.0              
4      13074629.0        14     unknown 8       2013-06-21 00:00:00 unknown   2.0               unknown  3.0               unknown 1.0                    5.6185714286         1                  6      25    Fri        21.0              
5      13387341.0        14     unknown 8       2013-06-22 00:00:00 unknown   2.0               unknown  3.0               unknown 1.0                    4.49                 1                  6      25    Sat        22.0              
6      13501141.0        4      unknown 1       2013-05-13 00:00:00 unknown   1.0               unknown  1.5               unknown 1.0                    6.74                 1                  5      20    Mon        13.0              
7      13558712.0        15     unknown 9       2013-05-12 00:00:00 unknown   1.0               unknown  1.5               unknown 1.0                    10.323333333         1                  5      19    Sun        12.0              
8      13563017.0        14     unknown 8       2013-06-24 00:00:00 unknown   1.0               unknown  1.5               unknown 1.0                    5.3066666667         1                  6      26    Mon        24.0              
9      13584134.0        14     unknown 8       2013-06-27 00:00:00 unknown   1.0               unknown  1.5               unknown 1.0                    2.84                 1                  6      26    Thu        27.0



In [20]:

    
# quantity unary, drop it
# drop other unusable variables
# set modeling roles
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'
X = [name for name in data_h2o.columns if name not in [y] + drops]
print(y)
print(X)









    



repeater
['offer', 'category', 'company', 'offervalue', 'brand', 'avg_category_quantity', 'avg_category_amount', 'exact_item_bought', 'month', 'week', 'dayOfWeek', 'day']



In [21]:

    
# create modeling partitions
train, valid, test = data_h2o.split_frame([0.4, 0.3], seed=SEED)



In [22]:

    
# check shape 
train.shape









    Out[22]:





(64122, 18)



In [23]:

    
# check shape 
valid.shape









    Out[23]:





(47980, 18)



In [24]:

    
# check shape 
test.shape









    Out[24]:





(47955, 18)



In [25]:

    
# elastic net regularized regression 
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection

# initialize
rptr_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='rptr_glm1',
                                         solver='IRLSM',
                                         nfolds=3,
                                         standardize=True,
                                         seed=SEED,
                                         lambda_search=True)

# train 
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)









    



glm Model Build progress: |███████████████████████████████████████████████| 100%



In [26]:

    
# check for stability across folds -- looks good
rptr_glm.cross_validation_metrics_summary().as_data_frame()









    Out[26]:







  
    
      
      
      mean
      sd
      cv_1_valid
      cv_2_valid
      cv_3_valid
    
  
  
    
      0
      accuracy
      0.5633181
      0.0034985114
      0.5576324
      0.5626291
      0.56969273
    
    
      1
      auc
      0.66687614
      0.0013872668
      0.6641128
      0.6684734
      0.6680421
    
    
      2
      err
      0.43668193
      0.0034985114
      0.4423676
      0.4373709
      0.4303073
    
    
      3
      err_count
      9334.0
      99.13627
      9514.0
      9316.0
      9172.0
    
    
      4
      f0point5
      0.39562488
      8.234713E-4
      0.39596537
      0.3968501
      0.39405915
    
    
      5
      f1
      0.48101103
      0.0012617736
      0.48254105
      0.481984
      0.47850809
    
    
      6
      f2
      0.613405
      0.0024687524
      0.61756927
      0.6136203
      0.60902536
    
    
      7
      lift_top_group
      2.0142968
      0.050682575
      2.027856
      2.0945127
      1.920521
    
    
      8
      logloss
      0.5482143
      0.0020346912
      0.5511301
      0.5492147
      0.54429805
    
    
      9
      max_per_class_error
      0.50603294
      0.0070085404
      0.5175562
      0.5071829
      0.4933597
    
    
      10
      mcc
      0.21990056
      0.0016726145
      0.21801764
      0.21844749
      0.22323658
    
    
      11
      mean_per_class_accuracy
      0.6226167
      0.0014614518
      0.6208214
      0.62151676
      0.6255119
    
    
      12
      mean_per_class_error
      0.37738332
      0.0014614518
      0.37917858
      0.37848327
      0.37448812
    
    
      13
      mse
      0.18361354
      9.3339913E-4
      0.18492535
      0.18410787
      0.18180738
    
    
      14
      null_deviance
      24907.797
      143.848
      25161.13
      24899.21
      24663.049
    
    
      15
      precision
      0.3537607
      7.134559E-4
      0.35366338
      0.3550422
      0.35257646
    
    
      16
      r2
      0.0670119
      9.0821454E-4
      0.06541767
      0.068562925
      0.067055106
    
    
      17
      recall
      0.7512663
      0.004308973
      0.759199
      0.75021636
      0.7443835
    
    
      18
      residual_deviance
      23435.428
      146.4661
      23706.31
      23396.547
      23203.426
    
    
      19
      rmse
      0.42849872
      0.0010901511
      0.43002948
      0.42907792
      0.42638877
    
    
      20
      specificity
      0.49396706
      0.0070085404
      0.4824438
      0.4928171
      0.50664026



In [27]:

    
# train AUC
rptr_glm.auc(valid=False)









    Out[27]:





0.6682898515349129



In [28]:

    
# valid AUC
rptr_glm.auc(valid=True)









    Out[28]:





0.6642797159900581



In [29]:

    
# test AUC
rptr_glm.model_performance(test).auc()









    Out[29]:





0.66930388703468



In [30]:

    
# many validation metrics
rptr_glm.model_performance(valid)









    



ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.18433338418488365
RMSE: 0.4293406388695154
LogLoss: 0.5500303205580226
Null degrees of freedom: 47979
Residual degrees of freedom: 47926
Null deviance: 56034.26930760575
Residual deviance: 52780.909560747896
AIC: 52888.909560747896
AUC: 0.6642797159900581
pr_auc: 0.4082413348072177
Gini: 0.32855943198011617
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.23680137514237834: 






    





0
1
Error
Rate
0
17240.0
17753.0
0.5073
 (17753.0/34993.0)
1
3305.0
9682.0
0.2545
 (3305.0/12987.0)
Total
20545.0
27435.0
0.4389
 (21058.0/47980.0)






    



Maximum Metrics: Maximum metrics at their respective thresholds







    




metric
threshold
value
idx
max f1
0.2368014
0.4790461
274.0
max f2
0.1223229
0.6605821
358.0
max f0point5
0.3284445
0.4251892
202.0
max accuracy
0.4805090
0.7320342
76.0
max precision
0.7387865
0.6923077
8.0
max recall
0.0559133
1.0
399.0
max specificity
0.9378370
0.9999143
0.0
max absolute_mcc
0.2928417
0.2189525
229.0
max min_per_class_accuracy
0.2913873
0.6202098
231.0
max mean_per_class_accuracy
0.2928417
0.6222223
229.0






    



Gains/Lift Table: Avg response rate: 27.07 %, avg score: 26.98 %







    





group
cumulative_data_fraction
lower_threshold
lift
cumulative_lift
response_rate
score
cumulative_response_rate
cumulative_score
capture_rate
cumulative_capture_rate
gain
cumulative_gain

1
0.0100459
0.5366650
2.0005291
2.0005291
0.5414938
0.5850936
0.5414938
0.5850936
0.0200970
0.0200970
100.0529096
100.0529096

2
0.0200917
0.5128554
1.9698696
1.9851994
0.5331950
0.5228512
0.5373444
0.5539724
0.0197890
0.0398860
96.9869646
98.5199371

3
0.0300125
0.4961243
1.9093237
1.9601182
0.5168067
0.5044425
0.5305556
0.5376000
0.0189420
0.0588281
90.9323674
96.0118238

4
0.0400167
0.4826037
1.9472902
1.9569112
0.5270833
0.4896555
0.5296875
0.5256139
0.0194810
0.0783091
94.7290239
95.6911238

5
0.05
0.4676137
1.7353953
1.9126819
0.4697286
0.4742862
0.5177157
0.5153654
0.0173250
0.0956341
73.5395264
91.2681913

6
0.1
0.4247933
1.7386617
1.8256718
0.4706128
0.4440226
0.4941642
0.4796940
0.0869331
0.1825672
73.8661739
82.5671826

7
0.15
0.3940127
1.4676215
1.7063217
0.3972489
0.4091581
0.4618591
0.4561820
0.0733811
0.2559483
46.7621468
70.6321706

8
0.2
0.3653559
1.3043813
1.6058366
0.3530638
0.3791261
0.4346603
0.4369180
0.0652191
0.3211673
30.4381304
60.5836606

9
0.3152980
0.3233892
1.3456877
1.5107055
0.3642444
0.3402462
0.4089106
0.4015671
0.1551552
0.4763225
34.5687698
51.0705474

10
0.4043560
0.3039213
1.1542497
1.4321974
0.3124269
0.3130243
0.3876604
0.3820659
0.1027951
0.5791176
15.4249715
43.2197384

11
0.5002293
0.2727063
0.9581511
1.3413423
0.2593478
0.2891334
0.3630682
0.3642546
0.0918611
0.6709787
-4.1848872
34.1342304

12
0.6016048
0.2177024
0.9608340
1.2772234
0.2600740
0.2480176
0.3457128
0.3446676
0.0974051
0.7683838
-3.9166001
27.7223392

13
0.7013964
0.1838723
0.6767011
1.1917837
0.1831662
0.1940466
0.3225864
0.3232379
0.0675291
0.8359128
-32.3298943
19.1783730

14
0.8050855
0.1746898
0.6928512
1.1275249
0.1875377
0.1783366
0.3051931
0.3045757
0.0718411
0.9077539
-30.7148819
12.7524917

15
0.9000208
0.1270697
0.6277750
1.0748106
0.1699232
0.1564416
0.2909247
0.2889503
0.0595981
0.9673520
-37.2225050
7.4810629

16
1.0
0.0519661
0.3265484
1.0
0.0883886
0.0972660
0.2706753
0.2697859
0.0326480
1.0
-67.3451614
0.0






    









    Out[30]:



In [31]:

    
# print coefficients
for key_ in rptr_glm.coef():
    print(key_, rptr_glm.coef()[key_])









    



Intercept 0.9142362343476567
offer.1198271 0.03463083385893831
offer.1198272 0.14052728909397258
offer.1198273 -0.028110797822582217
offer.1198274 -0.13117283777912048
offer.1198275 -0.07475893905600814
offer.1200581 -0.2663110424214166
offer.1200582 0.0
offer.1200584 0.0
offer.1204576 -0.13863085089106314
offer.1204821 -0.16653161796482882
offer.1204822 0.0
offer.1208251 0.0
offer.1208252 0.90290635746174
offer.1208329 0.03988082619178288
offer.1208501 0.4323275425518459
offer.1208503 0.0
offer.unknown -0.24095314385202235
week.9 0.3882866662473721
week.10 0.34503199800881224
week.11 -0.3374664786715979
week.12 0.0
week.13 -0.3946215550940864
week.14 -0.22160719365547848
week.15 0.0
week.16 -0.09617188647612415
week.17 0.053510178998351796
week.18 0.15565259807093573
category.1726 -0.16996755533119035
category.2119 0.03988082619178368
category.2202 0.6589080081306594
category.3504 0.0
category.5558 -0.04144651740120281
category.5616 -0.13863085089106691
category.5619 -0.09783190861526495
category.6202 0.12060177946446535
category.unknown -0.2409531438519952
brand.102504 -0.09783190861526239
brand.15889 -0.13863085089106675
brand.3718 0.6589080081306663
brand.5072 -0.04144651740120234
brand.64486 0.12060177946446629
brand.6926 0.03988082619178345
brand.7668 -0.9124249815547996
brand.unknown 0.0
company.104460040 0.0
company.104610040 -0.1386308508910667
company.107120272 -0.041446517401202365
company.107717272 -0.09783190861526334
company.108079383 0.03988082619178309
company.1087744888 0.1206017794644651
company.unknown 0.0
dayOfWeek.Mon -0.06788130486673358
dayOfWeek.Tue -0.001410333705654479
dayOfWeek.Wed 0.007728177083505018
dayOfWeek.Thu 0.0
dayOfWeek.Fri 0.0
dayOfWeek.Sat 0.14530381687259547
dayOfWeek.Sun 0.1032571462037334
exact_item_bought.0 -0.1507968420390149
exact_item_bought.1 0.1263809799828914
month.3 0.19741069489701107
month.4 -0.1549780733058436
offervalue -0.9912946388035481
avg_category_quantity -0.03619275098428959
avg_category_amount 0.08680975561720691
day -0.03191424771152614



In [32]:

    
# find id == 13584134
score_h2o['id'].asfactor().head()



In [33]:

    
# get probability for id == 13584134
rptr_glm.predict(score_h2o)









    



glm prediction progress: |████████████████████████████████████████████████| 100%






    



/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'week' has levels not trained on: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
  warnings.warn(w)
/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'month' has levels not trained on: [5, 6, 7]
  warnings.warn(w)






    






  predict       p0        p1


        0 0.931928 0.0680718
        0 0.964497 0.0355034
        0 0.854873 0.145127 
        0 0.856192 0.143808 
        0 0.960205 0.0397945
        0 0.959612 0.0403879
        0 0.804033 0.195967 
        1 0.71045 0.28955  
        0 0.868433 0.131567 
        0 0.893706 0.106294 








    Out[33]:



In [34]:

    
# import target encoder
from h2o.targetencoder import TargetEncoder



In [35]:

    
# train target encoder
e_columns = ['market', 'chain']
te_ = TargetEncoder(x=e_columns, y=y)
train[y] = train[y].asfactor()
_ = te_.fit(train)



In [36]:

    
# leave-one-out target encoding on train, valid, test
e_train = te_.transform(frame=train, holdout_type='loo', seed=12345)
valid[y] = valid[y].asfactor()
e_valid = te_.transform(frame=valid, holdout_type='loo', seed=12345)
test[y] = test[y].asfactor()
e_test = te_.transform(frame=test, holdout_type='loo', seed=12345)



In [37]:

    
# check train
e_train.head(rows=2)









    






  chain   market          id offer    repeater offerdate          category    quantity company    offervalue brand    avg_category_quantity   avg_category_amount   exact_item_bought   month   week dayOfWeek    day   market_te   chain_te


     10        5 2.55092e+08 unknown          0 2013-03-14 00:00:00 unknown            1 unknown             2 unknown                       1                  9.99                   1       3     11 Thu           14    0.162703   0.160728
     10        5 2.66075e+08 1204576          1 2013-04-04 00:00:00 5616               1 104610040            1 15889                        1                  2.79                   1       4     14 Thu            4    0.125308   0.124052








    Out[37]:



In [38]:

    
# check valid
e_valid.head(rows=2)









    






  chain   market          id offer    repeater offerdate          category    quantity company    offervalue brand    avg_category_quantity   avg_category_amount   exact_item_bought   month   week dayOfWeek    day   market_te   chain_te


     10        5 2.54652e+08 1198275          0 2013-03-27 00:00:00 5558               1 107120272         1.5 5072                       1.4                   1.8                   1       3     13 Wed           27    0.158439   0.160728
     10        5 2.54734e+08 unknown          0 2013-04-24 00:00:00 unknown            1 unknown          0.75 unknown                     0                    0                    0       4     17 Wed           24    0.150508   0.165719








    Out[38]:



In [39]:

    
# check test 
e_test.head(rows=2)









    






  chain   market          id offer    repeater offerdate          category    quantity company    offervalue brand    avg_category_quantity   avg_category_amount   exact_item_bought   month   week dayOfWeek    day   market_te   chain_te


     10        5 2.58623e+08 1198271          0 2013-03-25 00:00:00 5558               1 107120272         1.5 5072                         3               3.66667                   1       3     13 Mon           25    0.163404   0.160728
     10        5 2.58693e+08 unknown          1 2013-03-29 00:00:00 unknown            1 unknown          0.75 unknown                       0               0                        0       3     13 Fri           29    0.121452   0.124052








    Out[39]:



In [40]:

    
e_test['id'].asfactor().head(rows=2)



In [41]:

    
X = X + ['market_te', 'chain_te']
print(X)









    



['offer', 'category', 'company', 'offervalue', 'brand', 'avg_category_quantity', 'avg_category_amount', 'exact_item_bought', 'month', 'week', 'dayOfWeek', 'day', 'market_te', 'chain_te']



In [42]:

    
# import GBM and grid search
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.grid.grid_search import H2OGridSearch



In [43]:

    
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
hyper_parameters = {'ntrees':list(range(50, 500, 50)),
                    'max_depth':list(range(2, 20, 2)),
                    'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1200, 
                   'seed': 12345}

# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid, 
              seed=12345)

# view detailed results at http://localhost:54321/flow/index.html









    



gbm Grid Build progress: |████████████████████████████████████████████████| 100%



In [44]:

    
# show grid search results
gsearch.show()

# select best model
gbm_model = gsearch.get_grid()[0]









    



     col_sample_rate max_depth ntrees sample_rate  \
0                0.9         6    250         1.0   
1                0.4         6    150         0.9   
2                0.4         4    300         0.9   
3                1.0         4    300         0.5   
4                0.4         6    100         1.0   
5                1.0         2    400         0.9   
6                0.1        10    350         0.8   
7                1.0         4    250         0.3   
8                0.1         6    150         0.9   
9                0.1        10    350         0.6   
10               0.2         8    150         0.5   
11               0.1        10    250         0.7   
12               0.6         4     50         0.6   
13               0.8         8    100         0.6   
14               1.0         4     50         0.3   
15               0.8         4     50         0.3   
16               0.1        14    350         0.5   
17               1.0         8     50         0.3   
18               0.2         8    300         0.4   
19               0.1        12    100         0.1   
20               0.4        10    350         0.7   
21               0.7        12    150         1.0   
22               0.9         6    350         0.4   
23               1.0        12     50         1.0   
24               0.2         6    350         0.1   
25               0.6         6    300         0.2   
26               0.4        14    250         1.0   
27               0.4        14    200         1.0   
28               0.9         8    150         0.2   
29               1.0        12    300         0.7   
30               0.9         8    250         0.3   
31               0.3        16     50         0.4   
32               1.0        14     50         0.3   
33               0.9        14    400         0.7   
34               0.2        16    250         0.5   
35               0.6        16    400         0.9   
36               0.8        16     50         0.9   
37               0.5         8    400         0.2   
38               0.9        16    400         0.7   
39               0.7        18     50         0.6   
40               0.9        12    150         0.4   
41               0.7         8    350         0.1   
42               0.5        12    200         0.3   
43               0.9        16    100         0.1   
44               0.8        18    250         0.6   
45               0.8        10    450         0.3   
46               0.6        12    350         0.1   
47               0.7        18    200         0.1   
48               0.5        18    400         0.1   
49               0.9        16    400         0.2   

                                                         model_ids  \
0   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_11   
1   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_32   
2   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_10   
3   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_27   
4   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_43   
5    Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_1   
6    Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_2   
7   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_33   
8   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_29   
9   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_31   
10  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_16   
11  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_12   
12   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_3   
13  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_50   
14  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_28   
15  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_14   
16  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_25   
17  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_22   
18  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_39   
19  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_42   
20   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_8   
21  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_47   
22  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_34   
23  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_41   
24  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_15   
25  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_30   
26  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_24   
27   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_4   
28  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_45   
29  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_21   
30  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_38   
31   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_5   
32  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_35   
33   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_7   
34  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_36   
35  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_20   
36  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_23   
37  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_13   
38  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_26   
39  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_17   
40  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_49   
41  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_19   
42  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_48   
43  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_40   
44  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_37   
45  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_46   
46   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_6   
47  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_44   
48  Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_18   
49   Grid_GBM_py_29_sid_8290_model_python_1554643783238_15_model_9   

               logloss  
0   0.5348834983989057  
1   0.5350291550847558  
2   0.5350894789165441  
3    0.535248031739906  
4   0.5354232866015268  
5    0.536646342786623  
6    0.537452131939599  
7   0.5376479557679856  
8   0.5378304173039002  
9   0.5378854566021242  
10  0.5380053569351866  
11  0.5380773747073453  
12  0.5386174767148093  
13  0.5388992002840919  
14  0.5391120670305152  
15  0.5391530686859511  
16  0.5393710636142622  
17   0.540390311161104  
18  0.5416671715429446  
19  0.5419625334634491  
20  0.5426712431838964  
21  0.5432625381356367  
22  0.5432646516697345  
23  0.5451345163682967  
24  0.5458733104268865  
25   0.545909460632734  
26  0.5498716819062104  
27  0.5498716819062104  
28  0.5499028994766184  
29  0.5502053659777679  
30  0.5558261784124103  
31   0.557359195507266  
32  0.5581103738596365  
33  0.5584909832655498  
34  0.5588497425482545  
35  0.5617982925457293  
36  0.5625896941243087  
37  0.5654485751617531  
38  0.5713129537881524  
39  0.5713801530621511  
40   0.574567832157054  
41  0.5748519555317287  
42  0.5825993745923173  
43  0.5871166938602922  
44  0.5994404448481553  
45  0.6146780634348513  
46  0.6200616332955663  
47  0.6294138163966044  
48  0.6860690534310121  
49  0.7170542574246234



In [45]:

    
# train AUC
gbm_model.auc(valid=False)









    Out[45]:





0.7372164154988929



In [46]:

    
# valid AUC
gbm_model.auc(valid=True)









    Out[46]:





0.6955609131044218



In [47]:

    
# test AUC
gbm_model.model_performance(e_test).auc()









    Out[47]:





0.7033280640065208



In [48]:

    
# examine variable importance
%matplotlib inline
gbm_model.varimp_plot()



In [49]:

    
# make list of most important variables
important_vars = [row[0] for row in gbm_model.varimp()]
important_vars[:10]









    Out[49]:





['chain_te',
 'week',
 'avg_category_amount',
 'avg_category_quantity',
 'market_te',
 'offer',
 'offervalue',
 'dayOfWeek',
 'brand',
 'day']



In [50]:

    
# generate partial dependence plot for most important variable
_ = gbm_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)









    



PartialDependencePlot progress: |█████████████████████████████████████████| 100%



In [51]:

    
# update X to contain only ten most important variables
X = important_vars[:10]
print(X)









    



['chain_te', 'week', 'avg_category_amount', 'avg_category_quantity', 'market_te', 'offer', 'offervalue', 'dayOfWeek', 'brand', 'day']



In [52]:

    
# import mlp
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# NN with random hyperparameter search
# train many different NN models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
hyper_parameters = {'hidden':[[170, 320], [80, 190], [320, 160, 80], [100], [50, 50, 50, 50]],
                    'l1':[s/1e4 for s in range(0, 1000, 100)],
                    'l2':[s/1e5 for s in range(0, 1000, 100)],
                    'input_dropout_ratio':[s/1e2 for s in range(0, 20, 2)]}

# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':50,
                   'max_runtime_secs':1800, 
                   'seed': 12345}

# initialize grid search
gsearch = H2OGridSearch(H2ODeepLearningEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=e_train,
              validation_frame=e_valid, 
              seed=12345)

# view detailed results at http://localhost:54321/flow/index.html









    



deeplearning Grid Build progress: |███████████████████████████████████████| 100%



In [53]:

    
# show grid search results
gsearch.show()

# select best model
mlp_model = gsearch.get_grid()[0]









    



                hidden input_dropout_ratio    l1     l2  \
0     [50, 50, 50, 50]                 0.1   0.0  0.002   
1           [170, 320]                0.16   0.0  0.008   
2                [100]                0.14  0.01  0.006   
3                [100]                0.14  0.01  0.001   
4                [100]                0.06  0.02  0.006   
5                [100]                0.06  0.05  0.008   
6                [100]                 0.0  0.08  0.003   
7                [100]                0.18  0.03  0.009   
8                [100]                0.14  0.04  0.009   
9                [100]                0.04  0.07  0.003   
10               [100]                0.16  0.07  0.002   
11               [100]                0.04  0.07  0.009   
12               [100]                0.02  0.05  0.008   
13               [100]                0.16  0.06  0.006   
14               [100]                0.12  0.06  0.001   
15               [100]                0.14  0.08    0.0   
16               [100]                0.12  0.09  0.009   
17    [50, 50, 50, 50]                0.04  0.09  0.004   
18    [50, 50, 50, 50]                0.08  0.09  0.001   
19           [80, 190]                0.02  0.09    0.0   
20    [50, 50, 50, 50]                 0.1  0.08  0.009   
21          [170, 320]                 0.1  0.08  0.007   
22          [170, 320]                0.14  0.08  0.006   
23          [170, 320]                0.06  0.07  0.005   
24           [80, 190]                0.02  0.07  0.004   
25          [170, 320]                0.08  0.07    0.0   
26          [170, 320]                0.12  0.06  0.004   
27      [320, 160, 80]                0.14  0.06  0.002   
28    [50, 50, 50, 50]                0.02  0.06  0.002   
29          [170, 320]                0.18  0.06    0.0   
30           [80, 190]                0.02  0.05  0.009   
31      [320, 160, 80]                0.18  0.05  0.002   
32           [80, 190]                 0.1  0.05    0.0   
33    [50, 50, 50, 50]                0.02  0.04  0.008   
34      [320, 160, 80]                0.18  0.04  0.008   
35      [320, 160, 80]                0.06  0.04  0.005   
36           [80, 190]                 0.0  0.04  0.004   
37           [80, 190]                0.12  0.04  0.003   
38    [50, 50, 50, 50]                0.06  0.04  0.002   
39          [170, 320]                0.12  0.04  0.001   
40          [170, 320]                0.16  0.03  0.008   
41    [50, 50, 50, 50]                0.16  0.03  0.006   
42      [320, 160, 80]                0.14  0.02  0.005   
43          [170, 320]                0.06  0.02  0.005   
44           [80, 190]                 0.0  0.02  0.005   
45      [320, 160, 80]                 0.0  0.01  0.003   
46    [50, 50, 50, 50]                0.16  0.01  0.002   
47    [50, 50, 50, 50]                0.02  0.01  0.002   
48    [50, 50, 50, 50]                0.08  0.01    0.0   
49           [80, 190]                0.18  0.01    0.0   

                                                                model_ids  \
0   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
1   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
2   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
3   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
4   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
5   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
6   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
7   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
8   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
9   Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
10  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
11  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
12  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
13  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
14  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
15  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
16  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
17  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
18  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
19  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
20  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
21  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
22  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
23  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
24  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
25  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
26  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
27  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
28  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
29  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
30  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
31  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
32  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
33  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
34  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
35  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
36  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
37  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
38  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
39  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
40  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
41  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
42  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
43  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
44  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
45  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
46  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
47  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
48  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   
49  Grid_DeepLearning_py_29_sid_8290_model_python_1554643783238_3765_m...   

               logloss  
0   0.5441277425619976  
1   0.5468463763679692  
2   0.5506008410750938  
3   0.5506723146192487  
4   0.5606036290296568  
5   0.5839340260871398  
6   0.5839911584694784  
7   0.5844394972593425  
8   0.5846837953814987  
9   0.5849682481160456  
10  0.5850026388614178  
11  0.5851131278900356  
12  0.5857919095875189  
13  0.5858466411918215  
14  0.5858754362158225  
15  0.5906781682601115  
16  0.5940594978259867  
17  0.6431177451391431  
18  0.6450529608012587  
19  0.6457162013454759  
20  0.6486068335324051  
21  0.6499785663239965  
22  0.6506787737522219  
23  0.6611527490256719  
24  0.6619788531748063  
25  0.6654116986223156  
26  0.6728097497599246  
27  0.6747520073743039  
28  0.6747520073743055  
29  0.6767584866583781  
30  0.6794279016802731  
31  0.6869352546368972  
32  0.6892490551997769  
33  0.6928809978038432  
34  0.6928809978038432  
35   0.696514948013381  
36  0.6977683709229899  
37  0.6990455169581326  
38  0.7003460472005603  
39  0.7016671407154101  
40  0.7065238616979619  
41  0.7092814797747413  
42  0.7263483169261058  
43  0.7263483169261058  
44  0.7263483169261058  
45  0.7475621179617055  
46  0.7495904295327387  
47  0.7495904295327387  
48  0.7537832750142842  
49  0.7537832750142842



In [54]:

    
# print train, valid, test AUC
print(mlp_model.auc(valid=False))
print(mlp_model.auc(valid=True))
print(mlp_model.model_performance(e_test).auc())









    



0.6898022376723639
0.6827321992354999
0.6906367158970282



In [55]:

    
# print partial dependence
_ = mlp_model.partial_plot(data=e_test, cols=[important_vars[0]], server=True, plot=True)









    



PartialDependencePlot progress: |█████████████████████████████████████████| 100%

H2O cluster uptime:	01 secs
H2O cluster timezone:	America/New_York
H2O data parsing timezone:	UTC
H2O cluster version:	3.22.1.6
H2O cluster version age:	24 days
H2O cluster name:	H2O_from_python_patrickh_020k5w
H2O cluster total nodes:	1
H2O cluster free memory:	3.422 Gb
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster status:	accepting new members, healthy
H2O connection url:	http://127.0.0.1:54321
H2O connection proxy:	None
H2O internal security:	False
H2O API Extensions:	Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version:	3.6.4 final

	id	chain	offer	market	repeater	offerdate	category	quantity	company	offervalue	brand	avg_category_quantity	avg_category_amount	exact_item_bought	month	week	dayOfWeek	day
type	int	enum	enum	enum	int	time	enum	int	enum	real	enum	real	real	enum	enum	enum	enum	int
mins	86246.0				0.0	1362096000000.0		1.0		0.75		-1.0	-85.14					1.0
mean	1869324131.2555754				0.27139081702143614	1365220653082.3323		1.0		1.25532310364433		0.6949999031818261	2.2441312757227134					19.004467158574705
maxs	4809911000.0				1.0	1367280000000.0		1.0		3.0		44.181818182	62.218					31.0
sigma	1570833451.4856217				0.4446786220260453	1303790212.0504372		0.0		0.5246445261465219		0.7609072934172032	2.5989523363669176					9.632905002480289
zeros	0				116619	0		0		0		72549	72591					0
missing	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
0	86246.0	205	1208251	34	1.0	2013-04-24 00:00:00	2202	1.0	104460040	2.0	3718	0.0	0.0	0	4	17	Wed	24.0
1	86252.0	205	unknown	34	1.0	2013-03-27 00:00:00	unknown	1.0	unknown	0.75	unknown	1.0	2.425	1	3	13	Wed	27.0
2	12682470.0	18	unknown	11	0.0	2013-03-28 00:00:00	unknown	1.0	unknown	0.75	unknown	1.0	2.5	1	3	13	Thu	28.0
3	12996040.0	15	unknown	9	0.0	2013-03-25 00:00:00	unknown	1.0	unknown	0.75	unknown	0.0	0.0	0	3	13	Mon	25.0
4	13089312.0	15	1204821	9	0.0	2013-04-01 00:00:00	5619	1.0	107717272	1.5	102504	0.0	0.0	0	4	14	Mon	1.0
5	13179265.0	14	unknown	8	0.0	2013-03-29 00:00:00	unknown	1.0	unknown	0.75	unknown	0.0	0.0	0	3	13	Fri	29.0
6	13251776.0	15	1200581	9	0.0	2013-03-30 00:00:00	1726	1.0	104460040	1.25	7668	1.8	4.563	1	3	13	Sat	30.0
7	13540129.0	14	1200581	8	0.0	2013-03-30 00:00:00	1726	1.0	104460040	1.25	7668	1.3333333333	4.6533333333	1	3	13	Sat	30.0
8	13807224.0	4	1204576	1	0.0	2013-04-05 00:00:00	5616	1.0	104610040	1.0	15889	1.4375	3.365	1	4	14	Fri	5.0
9	13873775.0	4	unknown	1	0.0	2013-03-26 00:00:00	unknown	1.0	unknown	0.75	unknown	0.0	0.0	0	3	13	Tue	26.0

		mean	sd	cv_1_valid	cv_2_valid	cv_3_valid
0	accuracy	0.5633181	0.0034985114	0.5576324	0.5626291	0.56969273
1	auc	0.66687614	0.0013872668	0.6641128	0.6684734	0.6680421
2	err	0.43668193	0.0034985114	0.4423676	0.4373709	0.4303073
3	err_count	9334.0	99.13627	9514.0	9316.0	9172.0
4	f0point5	0.39562488	8.234713E-4	0.39596537	0.3968501	0.39405915
5	f1	0.48101103	0.0012617736	0.48254105	0.481984	0.47850809
6	f2	0.613405	0.0024687524	0.61756927	0.6136203	0.60902536
7	lift_top_group	2.0142968	0.050682575	2.027856	2.0945127	1.920521
8	logloss	0.5482143	0.0020346912	0.5511301	0.5492147	0.54429805
9	max_per_class_error	0.50603294	0.0070085404	0.5175562	0.5071829	0.4933597
10	mcc	0.21990056	0.0016726145	0.21801764	0.21844749	0.22323658
11	mean_per_class_accuracy	0.6226167	0.0014614518	0.6208214	0.62151676	0.6255119
12	mean_per_class_error	0.37738332	0.0014614518	0.37917858	0.37848327	0.37448812
13	mse	0.18361354	9.3339913E-4	0.18492535	0.18410787	0.18180738
14	null_deviance	24907.797	143.848	25161.13	24899.21	24663.049
15	precision	0.3537607	7.134559E-4	0.35366338	0.3550422	0.35257646
16	r2	0.0670119	9.0821454E-4	0.06541767	0.068562925	0.067055106
17	recall	0.7512663	0.004308973	0.759199	0.75021636	0.7443835
18	residual_deviance	23435.428	146.4661	23706.31	23396.547	23203.426
19	rmse	0.42849872	0.0010901511	0.43002948	0.42907792	0.42638877
20	specificity	0.49396706	0.0070085404	0.4824438	0.4928171	0.50664026

	0	1	Error	Rate
0	17240.0	17753.0	0.5073	(17753.0/34993.0)
1	3305.0	9682.0	0.2545	(3305.0/12987.0)
Total	20545.0	27435.0	0.4389	(21058.0/47980.0)

metric	threshold	value	idx
max f1	0.2368014	0.4790461	274.0
max f2	0.1223229	0.6605821	358.0
max f0point5	0.3284445	0.4251892	202.0
max accuracy	0.4805090	0.7320342	76.0
max precision	0.7387865	0.6923077	8.0
max recall	0.0559133	1.0	399.0
max specificity	0.9378370	0.9999143	0.0
max absolute_mcc	0.2928417	0.2189525	229.0
max min_per_class_accuracy	0.2913873	0.6202098	231.0
max mean_per_class_accuracy	0.2928417	0.6222223	229.0

group	cumulative_data_fraction	lower_threshold	lift	cumulative_lift	response_rate	score	cumulative_response_rate	cumulative_score	capture_rate	cumulative_capture_rate	gain	cumulative_gain
1	0.0100459	0.5366650	2.0005291	2.0005291	0.5414938	0.5850936	0.5414938	0.5850936	0.0200970	0.0200970	100.0529096	100.0529096
2	0.0200917	0.5128554	1.9698696	1.9851994	0.5331950	0.5228512	0.5373444	0.5539724	0.0197890	0.0398860	96.9869646	98.5199371
3	0.0300125	0.4961243	1.9093237	1.9601182	0.5168067	0.5044425	0.5305556	0.5376000	0.0189420	0.0588281	90.9323674	96.0118238
4	0.0400167	0.4826037	1.9472902	1.9569112	0.5270833	0.4896555	0.5296875	0.5256139	0.0194810	0.0783091	94.7290239	95.6911238
5	0.05	0.4676137	1.7353953	1.9126819	0.4697286	0.4742862	0.5177157	0.5153654	0.0173250	0.0956341	73.5395264	91.2681913
6	0.1	0.4247933	1.7386617	1.8256718	0.4706128	0.4440226	0.4941642	0.4796940	0.0869331	0.1825672	73.8661739	82.5671826
7	0.15	0.3940127	1.4676215	1.7063217	0.3972489	0.4091581	0.4618591	0.4561820	0.0733811	0.2559483	46.7621468	70.6321706
8	0.2	0.3653559	1.3043813	1.6058366	0.3530638	0.3791261	0.4346603	0.4369180	0.0652191	0.3211673	30.4381304	60.5836606
9	0.3152980	0.3233892	1.3456877	1.5107055	0.3642444	0.3402462	0.4089106	0.4015671	0.1551552	0.4763225	34.5687698	51.0705474
10	0.4043560	0.3039213	1.1542497	1.4321974	0.3124269	0.3130243	0.3876604	0.3820659	0.1027951	0.5791176	15.4249715	43.2197384
11	0.5002293	0.2727063	0.9581511	1.3413423	0.2593478	0.2891334	0.3630682	0.3642546	0.0918611	0.6709787	-4.1848872	34.1342304
12	0.6016048	0.2177024	0.9608340	1.2772234	0.2600740	0.2480176	0.3457128	0.3446676	0.0974051	0.7683838	-3.9166001	27.7223392
13	0.7013964	0.1838723	0.6767011	1.1917837	0.1831662	0.1940466	0.3225864	0.3232379	0.0675291	0.8359128	-32.3298943	19.1783730
14	0.8050855	0.1746898	0.6928512	1.1275249	0.1875377	0.1783366	0.3051931	0.3045757	0.0718411	0.9077539	-30.7148819	12.7524917
15	0.9000208	0.1270697	0.6277750	1.0748106	0.1699232	0.1564416	0.2909247	0.2889503	0.0595981	0.9673520	-37.2225050	7.4810629
16	1.0	0.0519661	0.3265484	1.0	0.0883886	0.0972660	0.2706753	0.2697859	0.0326480	1.0	-67.3451614	0.0

predict	p0	p1
0	0.931928	0.0680718
0	0.964497	0.0355034
0	0.854873	0.145127
0	0.856192	0.143808
0	0.960205	0.0397945
0	0.959612	0.0403879
0	0.804033	0.195967
1	0.71045	0.28955
0	0.868433	0.131567
0	0.893706	0.106294

chain	market	id	offer	repeater	offerdate	category	quantity	company	offervalue	brand	avg_category_quantity	avg_category_amount	exact_item_bought	month	week	dayOfWeek	day	market_te	chain_te
10	5	2.55092e+08	unknown	0	2013-03-14 00:00:00	unknown	1	unknown	2	unknown	1	9.99	1	3	11	Thu	14	0.162703	0.160728
10	5	2.66075e+08	1204576	1	2013-04-04 00:00:00	5616	1	104610040	1	15889	1	2.79	1	4	14	Thu	4	0.125308	0.124052

chain	market	id	offer	repeater	offerdate	category	quantity	company	offervalue	brand	avg_category_quantity	avg_category_amount	exact_item_bought	month	week	dayOfWeek	day	market_te	chain_te
10	5	2.54652e+08	1198275	0	2013-03-27 00:00:00	5558	1	107120272	1.5	5072	1.4	1.8	1	3	13	Wed	27	0.158439	0.160728
10	5	2.54734e+08	unknown	0	2013-04-24 00:00:00	unknown	1	unknown	0.75	unknown	0	0	0	4	17	Wed	24	0.150508	0.165719

chain	market	id	offer	repeater	offerdate	category	quantity	company	offervalue	brand	avg_category_quantity	avg_category_amount	exact_item_bought	month	week	dayOfWeek	day	market_te	chain_te
10	5	2.58623e+08	1198271	0	2013-03-25 00:00:00	5558	1	107120272	1.5	5072	3	3.66667	1	3	13	Mon	25	0.163404	0.160728
10	5	2.58693e+08	unknown	1	2013-03-29 00:00:00	unknown	1	unknown	0.75	unknown	0	0	0	3	13	Fri	29	0.121452	0.124052