In [1]:
import pandas as pd
In [2]:
# set measurement levels
col_types = {'chain': 'object',
             'offer': 'object',
             'market': 'object',
             'category': 'object',
             'company': 'object',
             'brand': 'object',
             'exact_item_bought': 'object'}
In [3]:
# read data created from SAS key
data_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_test_sas.csv', dtype=col_types)
In [4]:
# confirm shape
data_pd.shape
Out[4]:
In [5]:
# confirm shape
score_pd.shape
Out[5]:
In [6]:
# data['chain'].unique().shape # 130 levels
# data['market'].unique().shape # 34 levels
# data['category'].unique().shape # 13 levels
# data['brand'].unique().shape # 12 levels
# data['company'].unique().shape # 11 levels
In [7]:
# show column names
data_pd.columns
Out[7]:
In [8]:
# recode target: ensure repeater == 't' is modeled as 1 (positive class)
data_pd.loc[data_pd['repeater'] == 't', 'repeater'] = 1
data_pd.loc[data_pd['repeater'] == 'f', 'repeater'] = 0
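An equivalent one-line recode (a sketch; it assumes the column holds only 't' and 'f' values):

# hypothetical alternative: map() recodes both levels in one pass
data_pd['repeater'] = data_pd['repeater'].map({'t': 1, 'f': 0})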
In [9]:
# fix unknown values in train and test
chain_unknowns =\
list(set(data_pd['chain'].unique()) - set(score_pd['chain'].unique())) +\
list(set(score_pd['chain'].unique()) - set(data_pd['chain'].unique()))
print(chain_unknowns)
data_pd.loc[data_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
score_pd.loc[score_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
print(sorted(data_pd['chain'].unique()))
print(len(data_pd['chain'].unique()))
print(sorted(score_pd['chain'].unique()))
print(len(score_pd['chain'].unique()))
In [10]:
# fix unknown values in train and test
market_unknowns =\
list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))
print(market_unknowns)
data_pd.loc[data_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
score_pd.loc[score_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))
In [11]:
# fix unknown values in train and test
category_unknowns =\
list(set(data_pd['category'].unique()) - set(score_pd['category'].unique())) +\
list(set(score_pd['category'].unique()) - set(data_pd['category'].unique()))
print(category_unknowns)
data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))
In [12]:
# fix unknown values in train and test
brand_unknowns =\
list(set(data_pd['brand'].unique()) - set(score_pd['brand'].unique())) +\
list(set(score_pd['brand'].unique()) - set(data_pd['brand'].unique()))
print(brand_unknowns)
data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))
In [13]:
# fix unknown values in train and test
company_unknowns =\
list(set(data_pd['company'].unique()) - set(score_pd['company'].unique())) +\
list(set(score_pd['company'].unique()) - set(data_pd['company'].unique()))
print(company_unknowns)
data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))
In [14]:
# fix unknown values in train and test
offer_unknowns =\
list(set(data_pd['offer'].unique()) - set(score_pd['offer'].unique())) +\
list(set(score_pd['offer'].unique()) - set(data_pd['offer'].unique()))
print(offer_unknowns)
data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))
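The six cells above repeat the same pattern once per column. A minimal sketch of a helper that consolidates them (the function name is hypothetical; the symmetric difference ^ collects levels present in only one of the two frames):

# sketch: consolidate the repeated unknown-level handling above
def mark_unknowns(train_df, test_df, col):
    # levels that appear in exactly one of the two frames
    unknowns = set(train_df[col].unique()) ^ set(test_df[col].unique())
    train_df.loc[train_df[col].isin(unknowns), col] = 'unknown'
    test_df.loc[test_df[col].isin(unknowns), col] = 'unknown'
    return sorted(unknowns)

for col in ['chain', 'market', 'category', 'brand', 'company', 'offer']:
    print(col, mark_unknowns(data_pd, score_pd, col))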
In [15]:
# import and start h2o
# set a seed for reproducibility
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
SEED = 12345
In [16]:
# enforce same measurement levels in h2o and pandas
col_types = {'chain': 'enum',
             'offer': 'enum',
             'market': 'enum',
             'category': 'enum',
             'company': 'enum',
             'brand': 'enum',
             'exact_item_bought': 'enum'}
data_h2o = h2o.H2OFrame(data_pd, column_types=col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=col_types)
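A quick check (a sketch using the H2OFrame.types property) that the enum types took effect:

# sketch: list the columns h2o parsed as enum
print({k: v for k, v in data_h2o.types.items() if v == 'enum'})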
In [17]:
# expand date into new features
data_h2o = data_h2o.concat(data_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
data_h2o = data_h2o.concat(data_h2o['offerdate'].day().rename({'offerdate': 'day'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].day().rename({'offerdate': 'day'}))
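A light sanity check (a sketch) that the derived date columns landed in both frames:

# sketch: all four derived columns should now be present
for fr in (data_h2o, score_h2o):
    assert all(c in fr.columns for c in ['month', 'week', 'dayOfWeek', 'day'])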
In [18]:
# look at training data
data_h2o.describe()
In [19]:
# look at test data
score_h2o.describe()
In [20]:
# quantity is unary (a single constant value), so drop it
# drop other unusable variables
# set modeling roles
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'
X = [name for name in data_h2o.columns if name not in [y] + drops]
print(y)
print(X)
In [21]:
# create modeling partitions
train, valid, test = data_h2o.split_frame([0.4, 0.3], seed=SEED)
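A quick way (a sketch) to confirm the requested 40/30/30 proportions roughly held:

# sketch: fraction of rows in each partition
n = data_h2o.nrow
print([round(fr.nrow / n, 2) for fr in (train, valid, test)])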
In [22]:
# check shape
train.shape
Out[22]:
In [23]:
# check shape
valid.shape
Out[23]:
In [24]:
# check shape
test.shape
Out[24]:
In [25]:
# elastic net regularized regression
# - L1 penalty for variable selection
# - L2 penalty for handling multicollinearity
# - IRLSM solver for model fitting
# - lambda search to tune the regularization strength
# initialize
rptr_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='rptr_glm1',
                                         solver='IRLSM',
                                         nfolds=3,
                                         standardize=True,
                                         seed=SEED,
                                         lambda_search=True)
# train
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)
In [26]:
# check for stability across folds -- looks good
rptr_glm.cross_validation_metrics_summary().as_data_frame()
Out[26]:
In [27]:
# train AUC
rptr_glm.auc(valid=False)
Out[27]:
In [28]:
# valid AUC
rptr_glm.auc(valid=True)
Out[28]:
In [29]:
# test AUC
rptr_glm.model_performance(test).auc()
Out[29]:
In [30]:
# many validation metrics
rptr_glm.model_performance(valid)
Out[30]:
In [31]:
# print coefficients
for key_, coef_ in rptr_glm.coef().items():
    print(key_, coef_)
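Since lambda search applies an L1 penalty, many coefficients are zeroed out; a sketch to count the survivors:

# sketch: coefficients that survived L1 selection
nonzero = {k: v for k, v in rptr_glm.coef().items() if v != 0.0}
print(len(nonzero), 'nonzero coefficients')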
In [32]:
# preview ids in the score set (looking for id == 13584134)
score_h2o['id'].asfactor().head()
Out[32]:
In [33]:
# predict probabilities for the score set (the row for id == 13584134 is among them)
rptr_glm.predict(score_h2o)
Out[33]:
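To pull the probability for the single customer of interest (a sketch; it assumes id was parsed as numeric, so the literal below is an int):

# sketch: attach predictions and filter to the row for id == 13584134
preds = rptr_glm.predict(score_h2o)
scored = score_h2o.cbind(preds)
scored[scored['id'] == 13584134, :]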
In [ ]: