In [1]:
import pandas as pd

In [2]:
# set measurement levels: force the ID-like categorical columns to be
# read as strings so pandas does not coerce them to numerics
categorical_cols = ['chain', 'offer', 'market', 'category',
                    'company', 'brand', 'exact_item_bought']
col_types = dict.fromkeys(categorical_cols, 'object')

In [3]:
# read data created from SAS key
# NOTE(review): hardcoded absolute local path -- consider a configurable
# DATA_DIR so the notebook runs on other machines
key_dir = '/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/'
data_pd = pd.read_csv(key_dir + 'assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv(key_dir + 'assignment_1_key_test_sas.csv', dtype=col_types)

In [4]:
# confirm shape of the training frame: (rows, columns)
data_pd.shape


Out[4]:
(160057, 14)

In [5]:
# confirm shape of the scoring frame -- one fewer column than train
# (no 'repeater' target in the test set)
score_pd.shape


Out[5]:
(151484, 13)

In [6]:
# cardinality of the categorical features (computed on data_pd):
# data_pd['chain'].unique().shape    # 130 levels
# data_pd['market'].unique().shape   # 34 levels
# data_pd['category'].unique().shape # 13 levels
# data_pd['brand'].unique().shape    # 12 levels
# data_pd['company'].unique().shape  # 11 levels

In [7]:
# show column names of the training frame
data_pd.columns


Out[7]:
Index(['id', 'chain', 'offer', 'market', 'repeater', 'offerdate', 'category',
       'quantity', 'company', 'offervalue', 'brand', 'avg_category_quantity',
       'avg_category_amount', 'exact_item_bought'],
      dtype='object')

In [8]:
# encode the target: repeater 't' -> 1, 'f' -> 0, so a repeat
# purchase is modeled as the positive class; other values untouched
data_pd['repeater'] = data_pd['repeater'].replace({'t': 1, 'f': 0})

In [9]:
# fix unknown values in train and test
def harmonize_levels(train_df, test_df, col):
    """Recode levels of `col` that appear in only one of the two frames to 'unknown'.

    Mutates both frames in place, prints the recoded levels and the
    resulting shared level sets, and returns the list of recoded levels.
    (Reusable for the other categorical columns handled below -- the same
    logic was previously copy-pasted once per column.)
    """
    train_levels = set(train_df[col].unique())
    test_levels = set(test_df[col].unique())
    # levels only in train, then levels only in test (symmetric difference)
    unknowns = list(train_levels - test_levels) + list(test_levels - train_levels)
    print(unknowns)

    train_df.loc[train_df[col].isin(unknowns), col] = 'unknown'
    test_df.loc[test_df[col].isin(unknowns), col] = 'unknown'

    print(sorted(train_df[col].unique()))
    print(len(train_df[col].unique()))
    print(sorted(test_df[col].unique()))
    print(len(test_df[col].unique()))
    return unknowns

chain_unknowns = harmonize_levels(data_pd, score_pd, 'chain')


['9', '230', '369', '284', '387', '370', '421']
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128
['10', '100', '101', '104', '106', '108', '109', '115', '116', '12', '122', '126', '133', '134', '14', '140', '143', '15', '151', '152', '153', '16', '161', '163', '165', '166', '169', '17', '18', '180', '184', '187', '191', '192', '2', '20', '205', '206', '21', '211', '214', '215', '217', '224', '23', '233', '24', '240', '241', '246', '26', '278', '285', '293', '3', '304', '306', '307', '31', '313', '356', '360', '362', '368', '373', '376', '377', '384', '386', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '42', '422', '424', '431', '46', '48', '507', '508', '509', '510', '520', '521', '522', '523', '524', '525', '526', '58', '6', '62', '63', '64', '65', '68', '69', '70', '71', '73', '77', '8', '81', '83', '85', '88', '89', '95', '96', '98', 'unknown']
128

In [10]:
# fix unknown values in train and test
market_unknowns =\
list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))

print(market_unknowns)

# consistency fix: apply the same 'unknown' recode as every other
# categorical column (a no-op while market_unknowns is empty -- see the
# printed [] below -- but safe if the underlying data ever changes)
data_pd.loc[data_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
score_pd.loc[score_pd['market'].isin(market_unknowns), 'market'] = 'unknown'

print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))


[]
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34
['1', '10', '11', '12', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '26', '27', '28', '33', '34', '35', '37', '39', '4', '43', '45', '47', '5', '6', '7', '8', '9', '93', '96']
34

In [11]:
# fix unknown values in train and test: any category level not shared
# by both frames is collapsed into a single 'unknown' level
train_levels = set(data_pd['category'].unique())
test_levels = set(score_pd['category'].unique())
category_unknowns = list(train_levels - test_levels) + list(test_levels - train_levels)

print(category_unknowns)

data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'

print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))


['3509', '9909', '3203', '1703', '4401', '7205', '5824', '799', '4517', '9115', '706', '5122']
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9
['1726', '2119', '2202', '3504', '5558', '5616', '5619', '6202', 'unknown']
9

In [12]:
# fix unknown values in train and test: any brand level not shared
# by both frames is collapsed into a single 'unknown' level
train_levels = set(data_pd['brand'].unique())
test_levels = set(score_pd['brand'].unique())
brand_unknowns = list(train_levels - test_levels) + list(test_levels - train_levels)

print(brand_unknowns)

data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'

print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))


['13791', '6732', '875', '28840', '13474', '26456', '4294', '93904', '17286', '1322', '17311', '26189']
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8
['102504', '15889', '3718', '5072', '64486', '6926', '7668', 'unknown']
8

In [13]:
# fix unknown values in train and test: any company level not shared
# by both frames is collapsed into a single 'unknown' level
train_levels = set(data_pd['company'].unique())
test_levels = set(score_pd['company'].unique())
company_unknowns = list(train_levels - test_levels) + list(test_levels - train_levels)

print(company_unknowns)

data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'

print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))


['106414464', '105100050', '103320030', '1089520383', '107127979', '103700030', '105190050', '104127141', '107106878', '1076211171', '108500080', '105450050']
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7
['104460040', '104610040', '107120272', '107717272', '108079383', '1087744888', 'unknown']
7

In [14]:
# fix unknown values in train and test: any offer level not shared
# by both frames is collapsed into a single 'unknown' level
train_levels = set(data_pd['offer'].unique())
test_levels = set(score_pd['offer'].unique())
offer_unknowns = list(train_levels - test_levels) + list(test_levels - train_levels)

print(offer_unknowns)

data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'

print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))


['1199258', '1200578', '1197502', '1199256', '1200988', '1194044', '1200579', '1203052', '1190530', '1219903', '1221665', '1220502', '1219900', '1221663', '1221658', '1221667', '1230218', '1203439', '1221666', '1220503', '1213242']
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17
['1198271', '1198272', '1198273', '1198274', '1198275', '1200581', '1200582', '1200584', '1204576', '1204821', '1204822', '1208251', '1208252', '1208329', '1208501', '1208503', 'unknown']
17

In [15]:
# start and import h2o
# (imported mid-notebook because h2o.init() launches a local JVM-backed
# server -- keep this cell before any H2OFrame usage)
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# seed shared by the frame split and the GLM below for reproducibility
SEED = 12345


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_201"; Java(TM) SE Runtime Environment (build 1.8.0_201-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.201-b09, mixed mode)
  Starting server from /home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpdauqerlp
  JVM stdout: /tmp/tmpdauqerlp/h2o_patrickh_started_from_python.out
  JVM stderr: /tmp/tmpdauqerlp/h2o_patrickh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 00 secs
H2O cluster timezone: America/New_York
H2O data parsing timezone: UTC
H2O cluster version: 3.22.1.4
H2O cluster version age: 1 month and 1 day
H2O cluster name: H2O_from_python_patrickh_w6n0dr
H2O cluster total nodes: 1
H2O cluster free memory: 3.422 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.6.4 final

In [16]:
# enforce same measurement levels in h2o and pandas
# (renamed from 'col_types' to avoid rebinding the pandas dtype dict
# defined earlier to a different meaning -- a hidden-state hazard on re-run)
h2o_col_types = {'chain': 'enum',
                 'offer': 'enum',
                 'market': 'enum',
                 'category': 'enum',
                 'company': 'enum',
                 'brand': 'enum',
                 'exact_item_bought': 'enum'}

data_h2o = h2o.H2OFrame(data_pd, column_types=h2o_col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=h2o_col_types)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

In [17]:
# expand date into new features
def expand_offerdate(frame):
    """Append month and week (as factors), day-of-week and day columns
    derived from 'offerdate'; returns the widened frame.

    (The same four concat calls were previously duplicated for the
    train and score frames.)
    """
    frame = frame.concat(frame['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
    frame = frame.concat(frame['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
    frame = frame.concat(frame['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
    frame = frame.concat(frame['offerdate'].day().rename({'offerdate': 'day'}))
    return frame

data_h2o = expand_offerdate(data_h2o)
score_h2o = expand_offerdate(score_h2o)

In [18]:
# look at training data
# (types, zeros/missing counts and the first 10 rows)
data_h2o.describe()


Rows:160057
Cols:18


id chain offer market repeater offerdate category quantity company offervalue brand avg_category_quantity avg_category_amount exact_item_bought month week dayOfWeek day
type int enum enum enum int time enum int enum real enum real real enum enum enum enum int
mins 86246.0 0.0 1362096000000.0 1.0 0.75 -1.0 -85.14 1.0
mean 1869324131.2555754 0.271390817021436141365220653082.3323 1.0 1.25532310364433 0.6949999031818261 2.2441312757227134 19.004467158574705
maxs 4809911000.0 1.0 1367280000000.0 1.0 3.0 44.181818182 62.218 31.0
sigma 1570833451.4856217 0.4446786220260453 1303790212.0504372 0.0 0.5246445261465219 0.7609072934172032 2.5989523363669176 9.632905002480289
zeros 0 116619 0 0 0 72549 72591 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 86246.0 205 120825134 1.0 2013-04-24 00:00:002202 1.0 1044600402.0 3718 0.0 0.0 0 4 17 Wed 24.0
1 86252.0 205 unknown34 1.0 2013-03-27 00:00:00unknown 1.0 unknown 0.75 unknown1.0 2.425 1 3 13 Wed 27.0
2 12682470.0 18 unknown11 0.0 2013-03-28 00:00:00unknown 1.0 unknown 0.75 unknown1.0 2.5 1 3 13 Thu 28.0
3 12996040.0 15 unknown9 0.0 2013-03-25 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Mon 25.0
4 13089312.0 15 12048219 0.0 2013-04-01 00:00:005619 1.0 1077172721.5 102504 0.0 0.0 0 4 14 Mon 1.0
5 13179265.0 14 unknown8 0.0 2013-03-29 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Fri 29.0
6 13251776.0 15 12005819 0.0 2013-03-30 00:00:001726 1.0 1044600401.25 7668 1.8 4.563 1 3 13 Sat 30.0
7 13540129.0 14 12005818 0.0 2013-03-30 00:00:001726 1.0 1044600401.25 7668 1.3333333333 4.6533333333 1 3 13 Sat 30.0
8 13807224.0 4 12045761 0.0 2013-04-05 00:00:005616 1.0 1046100401.0 15889 1.4375 3.365 1 4 14 Fri 5.0
9 13873775.0 4 unknown1 0.0 2013-03-26 00:00:00unknown 1.0 unknown 0.75 unknown0.0 0.0 0 3 13 Tue 26.0

In [19]:
# look at test data
# (types, zeros/missing counts and the first 10 rows)
score_h2o.describe()


Rows:151484
Cols:17


id chain offer market offerdate category quantity company offervalue brand avg_category_quantity avg_category_amount exact_item_bought month week dayOfWeek day
type int enum enum enum time enum int enum real enum real real enum enum enum enum int
mins 12262064.0 1367366400000.0 1.0 1.0 -1.0 -18.76 1.0
mean 2367235598.062943 1372045742758.312 1.2141942383354016 1.8039594940719834 0.9798367012155557 3.918267425267764 20.815993768318656
maxs 4853598737.0 1375228800000.0 2.0 5.0 40.0 103.87 31.0
sigma 1629644569.9344237 1768808429.4717584 0.4102635466557285 0.7684280524816689 0.6520652050938551 3.24780201201996 7.076741234531553
zeros 0 0 0 0 27950 27970 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 12262064.0 95 unknown39 2013-06-27 00:00:00unknown 1.0 unknown 1.5 unknown0.0 0.0 0 6 26 Thu 27.0
1 12277270.0 95 unknown39 2013-06-23 00:00:00unknown 2.0 unknown 3.0 unknown1.2 3.882 1 6 25 Sun 23.0
2 12332190.0 95 unknown39 2013-06-15 00:00:00unknown 1.0 unknown 2.0 unknown2.0 6.98 1 6 24 Sat 15.0
3 12524696.0 4 unknown1 2013-06-20 00:00:00unknown 1.0 unknown 1.5 unknown1.0 4.2425 1 6 25 Thu 20.0
4 13074629.0 14 unknown8 2013-06-21 00:00:00unknown 2.0 unknown 3.0 unknown1.0 5.6185714286 1 6 25 Fri 21.0
5 13387341.0 14 unknown8 2013-06-22 00:00:00unknown 2.0 unknown 3.0 unknown1.0 4.49 1 6 25 Sat 22.0
6 13501141.0 4 unknown1 2013-05-13 00:00:00unknown 1.0 unknown 1.5 unknown1.0 6.74 1 5 20 Mon 13.0
7 13558712.0 15 unknown9 2013-05-12 00:00:00unknown 1.0 unknown 1.5 unknown1.0 10.323333333 1 5 19 Sun 12.0
8 13563017.0 14 unknown8 2013-06-24 00:00:00unknown 1.0 unknown 1.5 unknown1.0 5.3066666667 1 6 26 Mon 24.0
9 13584134.0 14 unknown8 2013-06-27 00:00:00unknown 1.0 unknown 1.5 unknown1.0 2.84 1 6 26 Thu 27.0

In [20]:
# quantity unary, drop it
# drop other unusable variables
# set modeling roles
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'

# everything that is neither dropped nor the target is a predictor
excluded = set(drops) | {y}
X = [name for name in data_h2o.columns if name not in excluded]
print(y)
print(X)


repeater
['offer', 'category', 'company', 'offervalue', 'brand', 'avg_category_quantity', 'avg_category_amount', 'exact_item_bought', 'month', 'week', 'dayOfWeek', 'day']

In [21]:
# create modeling partitions: ~40% train / ~30% valid / ~30% test, seeded
train, valid, test = data_h2o.split_frame([0.4, 0.3], seed=SEED)

In [22]:
# check shape of the train partition (~40% of the 160057 rows)
train.shape


Out[22]:
(64122, 18)

In [23]:
# check shape of the validation partition (~30% of the 160057 rows)
valid.shape


Out[23]:
(47980, 18)

In [24]:
# check shape of the held-out test partition (~30% of the 160057 rows)
test.shape


Out[24]:
(47955, 18)

In [25]:
# elastic net regularized regression 
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection

# initialize the binomial elastic-net GLM: 3-fold CV, standardized
# inputs, and a lambda search over the regularization path
glm_params = dict(family='binomial',
                  model_id='rptr_glm1',
                  solver='IRLSM',
                  nfolds=3,
                  standardize=True,
                  seed=SEED,
                  lambda_search=True)
rptr_glm = H2OGeneralizedLinearEstimator(**glm_params)

# train on the 40% split; track metrics on the 30% validation split
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)


glm Model Build progress: |███████████████████████████████████████████████| 100%

In [26]:
# check for stability across folds -- looks good
# (per-fold AUC/logloss sit tightly around their means; see the sd column)
rptr_glm.cross_validation_metrics_summary().as_data_frame()


Out[26]:
mean sd cv_1_valid cv_2_valid cv_3_valid
0 accuracy 0.5633181 0.0034985114 0.5576324 0.5626291 0.56969273
1 auc 0.66687614 0.0013872668 0.6641128 0.6684734 0.6680421
2 err 0.43668193 0.0034985114 0.4423676 0.4373709 0.4303073
3 err_count 9334.0 99.13627 9514.0 9316.0 9172.0
4 f0point5 0.39562488 8.234713E-4 0.39596537 0.3968501 0.39405915
5 f1 0.48101103 0.0012617736 0.48254105 0.481984 0.47850809
6 f2 0.613405 0.0024687524 0.61756927 0.6136203 0.60902536
7 lift_top_group 2.0142968 0.050682575 2.027856 2.0945127 1.920521
8 logloss 0.5482143 0.0020346912 0.5511301 0.5492147 0.54429805
9 max_per_class_error 0.50603294 0.0070085404 0.5175562 0.5071829 0.4933597
10 mcc 0.21990056 0.0016726145 0.21801764 0.21844749 0.22323658
11 mean_per_class_accuracy 0.6226167 0.0014614518 0.6208214 0.62151676 0.6255119
12 mean_per_class_error 0.37738332 0.0014614518 0.37917858 0.37848327 0.37448812
13 mse 0.18361354 9.3339913E-4 0.18492535 0.18410787 0.18180738
14 null_deviance 24907.797 143.848 25161.13 24899.21 24663.049
15 precision 0.3537607 7.134559E-4 0.35366338 0.3550422 0.35257646
16 r2 0.0670119 9.0821454E-4 0.06541767 0.068562925 0.067055106
17 recall 0.7512663 0.004308973 0.759199 0.75021636 0.7443835
18 residual_deviance 23435.428 146.4661 23706.31 23396.547 23203.426
19 rmse 0.42849872 0.0010901511 0.43002948 0.42907792 0.42638877
20 specificity 0.49396706 0.0070085404 0.4824438 0.4928171 0.50664026

In [27]:
# train AUC (valid=False reports the metric on the training frame)
rptr_glm.auc(valid=False)


Out[27]:
0.6682898515349129

In [28]:
# valid AUC (metric on the validation frame passed to train())
rptr_glm.auc(valid=True)


Out[28]:
0.6642797159900581

In [29]:
# test AUC -- scored on the held-out partition never seen during training
rptr_glm.model_performance(test).auc()


Out[29]:
0.66930388703468

In [30]:
# many validation metrics
# (confusion matrix, per-threshold maxima, and the gains/lift table)
rptr_glm.model_performance(valid)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.18433338418488365
RMSE: 0.4293406388695154
LogLoss: 0.5500303205580226
Null degrees of freedom: 47979
Residual degrees of freedom: 47926
Null deviance: 56034.26930760575
Residual deviance: 52780.909560747896
AIC: 52888.909560747896
AUC: 0.6642797159900581
pr_auc: 0.4082413348072177
Gini: 0.32855943198011617
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.23680137514237834: 
0 1 Error Rate
0 17240.0 17753.0 0.5073 (17753.0/34993.0)
1 3305.0 9682.0 0.2545 (3305.0/12987.0)
Total 20545.0 27435.0 0.4389 (21058.0/47980.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.2368014 0.4790461 274.0
max f2 0.1223229 0.6605821 358.0
max f0point5 0.3284445 0.4251892 202.0
max accuracy 0.4805090 0.7320342 76.0
max precision 0.7387865 0.6923077 8.0
max recall 0.0559133 1.0 399.0
max specificity 0.9378370 0.9999143 0.0
max absolute_mcc 0.2928417 0.2189525 229.0
max min_per_class_accuracy 0.2913873 0.6202098 231.0
max mean_per_class_accuracy 0.2928417 0.6222223 229.0
Gains/Lift Table: Avg response rate: 27.07 %, avg score: 26.98 %

group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain
1 0.0100459 0.5366650 2.0005291 2.0005291 0.5414938 0.5850936 0.5414938 0.5850936 0.0200970 0.0200970 100.0529096 100.0529096
2 0.0200917 0.5128554 1.9698696 1.9851994 0.5331950 0.5228512 0.5373444 0.5539724 0.0197890 0.0398860 96.9869646 98.5199371
3 0.0300125 0.4961243 1.9093237 1.9601182 0.5168067 0.5044425 0.5305556 0.5376000 0.0189420 0.0588281 90.9323674 96.0118238
4 0.0400167 0.4826037 1.9472902 1.9569112 0.5270833 0.4896555 0.5296875 0.5256139 0.0194810 0.0783091 94.7290239 95.6911238
5 0.05 0.4676137 1.7353953 1.9126819 0.4697286 0.4742862 0.5177157 0.5153654 0.0173250 0.0956341 73.5395264 91.2681913
6 0.1 0.4247933 1.7386617 1.8256718 0.4706128 0.4440226 0.4941642 0.4796940 0.0869331 0.1825672 73.8661739 82.5671826
7 0.15 0.3940127 1.4676215 1.7063217 0.3972489 0.4091581 0.4618591 0.4561820 0.0733811 0.2559483 46.7621468 70.6321706
8 0.2 0.3653559 1.3043813 1.6058366 0.3530638 0.3791261 0.4346603 0.4369180 0.0652191 0.3211673 30.4381304 60.5836606
9 0.3152980 0.3233892 1.3456877 1.5107055 0.3642444 0.3402462 0.4089106 0.4015671 0.1551552 0.4763225 34.5687698 51.0705474
10 0.4043560 0.3039213 1.1542497 1.4321974 0.3124269 0.3130243 0.3876604 0.3820659 0.1027951 0.5791176 15.4249715 43.2197384
11 0.5002293 0.2727063 0.9581511 1.3413423 0.2593478 0.2891334 0.3630682 0.3642546 0.0918611 0.6709787 -4.1848872 34.1342304
12 0.6016048 0.2177024 0.9608340 1.2772234 0.2600740 0.2480176 0.3457128 0.3446676 0.0974051 0.7683838 -3.9166001 27.7223392
13 0.7013964 0.1838723 0.6767011 1.1917837 0.1831662 0.1940466 0.3225864 0.3232379 0.0675291 0.8359128 -32.3298943 19.1783730
14 0.8050855 0.1746898 0.6928512 1.1275249 0.1875377 0.1783366 0.3051931 0.3045757 0.0718411 0.9077539 -30.7148819 12.7524917
15 0.9000208 0.1270697 0.6277750 1.0748106 0.1699232 0.1564416 0.2909247 0.2889503 0.0595981 0.9673520 -37.2225050 7.4810629
16 1.0 0.0519661 0.3265484 1.0 0.0883886 0.0972660 0.2706753 0.2697859 0.0326480 1.0 -67.3451614 0.0

Out[30]:


In [31]:
# print coefficients
# hoist the coef() call out of the loop -- the original re-fetched the
# entire coefficient dict from the model on every iteration
coefs = rptr_glm.coef()
for name, value in coefs.items():
    print(name, value)


Intercept 0.9142362343476567
offer.1198271 0.03463083385893831
offer.1198272 0.14052728909397258
offer.1198273 -0.028110797822582217
offer.1198274 -0.13117283777912048
offer.1198275 -0.07475893905600814
offer.1200581 -0.2663110424214166
offer.1200582 0.0
offer.1200584 0.0
offer.1204576 -0.13863085089106314
offer.1204821 -0.16653161796482882
offer.1204822 0.0
offer.1208251 0.0
offer.1208252 0.90290635746174
offer.1208329 0.03988082619178288
offer.1208501 0.4323275425518459
offer.1208503 0.0
offer.unknown -0.24095314385202235
week.9 0.3882866662473721
week.10 0.34503199800881224
week.11 -0.3374664786715979
week.12 0.0
week.13 -0.3946215550940864
week.14 -0.22160719365547848
week.15 0.0
week.16 -0.09617188647612415
week.17 0.053510178998351796
week.18 0.15565259807093573
category.1726 -0.16996755533119035
category.2119 0.03988082619178368
category.2202 0.6589080081306594
category.3504 0.0
category.5558 -0.04144651740120281
category.5616 -0.13863085089106691
category.5619 -0.09783190861526495
category.6202 0.12060177946446535
category.unknown -0.2409531438519952
brand.102504 -0.09783190861526239
brand.15889 -0.13863085089106675
brand.3718 0.6589080081306663
brand.5072 -0.04144651740120234
brand.64486 0.12060177946446629
brand.6926 0.03988082619178345
brand.7668 -0.9124249815547996
brand.unknown 0.0
company.104460040 0.0
company.104610040 -0.1386308508910667
company.107120272 -0.041446517401202365
company.107717272 -0.09783190861526334
company.108079383 0.03988082619178309
company.1087744888 0.1206017794644651
company.unknown 0.0
dayOfWeek.Mon -0.06788130486673358
dayOfWeek.Tue -0.001410333705654479
dayOfWeek.Wed 0.007728177083505018
dayOfWeek.Thu 0.0
dayOfWeek.Fri 0.0
dayOfWeek.Sat 0.14530381687259547
dayOfWeek.Sun 0.1032571462037334
exact_item_bought.0 -0.1507968420390149
exact_item_bought.1 0.1263809799828914
month.3 0.19741069489701107
month.4 -0.1549780733058436
offervalue -0.9912946388035481
avg_category_quantity -0.03619275098428959
avg_category_amount 0.08680975561720691
day -0.03191424771152614

In [32]:
# find id == 13584134
# cast id to factor for display; id 13584134 appears as the 10th row below
score_h2o['id'].asfactor().head()


id
12262064
12277270
12332190
12524696
13074629
13387341
13501141
13558712
13563017
13584134
Out[32]:


In [33]:
# get probability for id == 13584134 (10th row of the predictions below)
# NOTE(review): the warnings show 'week' and 'month' levels in the test
# data that were never seen in training -- the train/test offerdate ranges
# do not overlap, so these date factors may not generalize to scoring
rptr_glm.predict(score_h2o)


glm prediction progress: |████████████████████████████████████████████████| 100%
/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'week' has levels not trained on: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
  warnings.warn(w)
/home/patrickh/anaconda3/lib/python3.6/site-packages/h2o/job.py:69: UserWarning: Test/Validation dataset column 'month' has levels not trained on: [5, 6, 7]
  warnings.warn(w)
predict p0 p1
00.9319280.0680718
00.9644970.0355034
00.8548730.145127
00.8561920.143808
00.9602050.0397945
00.9596120.0403879
00.8040330.195967
10.71045 0.28955
00.8684330.131567
00.8937060.106294
Out[33]:


In [ ]: