In [1]:
import pandas as pd
In [2]:
# set measurement levels
col_types = {'chain': 'object',
             'offer': 'object',
             'market': 'object',
             'category': 'object',
             'company': 'object',
             'brand': 'object',
             'exact_item_bought': 'object'}
In [3]:
# read data created from SAS key
data_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_train_sas.csv', dtype=col_types)
score_pd = pd.read_csv('/home/patrickh/workspace/GWU_data_mining/01_basic_data_prep/assignment/key/assignment_1_key_test_sas.csv', dtype=col_types)
In [4]:
# confirm shape
data_pd.shape
Out[4]:
In [5]:
# confirm shape
score_pd.shape
Out[5]:
In [6]:
# data['chain'].unique().shape # 130 levels
# data['market'].unique().shape # 34 levels
# data['category'].unique().shape # 13 levels
# data['brand'].unique().shape # 12 levels
# data['company'].unique().shape # 11 levels
In [7]:
# show column names
data_pd.columns
Out[7]:
In [8]:
# recode target: ensure repeater == 't' is modeled as 1 (positive class)
data_pd.loc[data_pd['repeater'] == 't', 'repeater'] = 1
data_pd.loc[data_pd['repeater'] == 'f', 'repeater'] = 0
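An equivalent one-line recode (a sketch; it assumes the column holds only 't' and 'f' values):

# hypothetical alternative: map() recodes both levels in one pass
data_pd['repeater'] = data_pd['repeater'].map({'t': 1, 'f': 0})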
In [9]:
# fix unknown values in train and test
chain_unknowns =\
list(set(data_pd['chain'].unique()) - set(score_pd['chain'].unique())) +\
list(set(score_pd['chain'].unique()) - set(data_pd['chain'].unique()))
print(chain_unknowns)
data_pd.loc[data_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
score_pd.loc[score_pd['chain'].isin(chain_unknowns), 'chain'] = 'unknown'
print(sorted(data_pd['chain'].unique()))
print(len(data_pd['chain'].unique()))
print(sorted(score_pd['chain'].unique()))
print(len(score_pd['chain'].unique()))
In [10]:
# fix unknown values in train and test
market_unknowns =\
list(set(data_pd['market'].unique()) - set(score_pd['market'].unique())) +\
list(set(score_pd['market'].unique()) - set(data_pd['market'].unique()))
print(market_unknowns)
data_pd.loc[data_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
score_pd.loc[score_pd['market'].isin(market_unknowns), 'market'] = 'unknown'
print(sorted(data_pd['market'].unique()))
print(len(data_pd['market'].unique()))
print(sorted(score_pd['market'].unique()))
print(len(score_pd['market'].unique()))
In [11]:
# fix unknown values in train and test
category_unknowns =\
list(set(data_pd['category'].unique()) - set(score_pd['category'].unique())) +\
list(set(score_pd['category'].unique()) - set(data_pd['category'].unique()))
print(category_unknowns)
data_pd.loc[data_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
score_pd.loc[score_pd['category'].isin(category_unknowns), 'category'] = 'unknown'
print(sorted(data_pd['category'].unique()))
print(len(data_pd['category'].unique()))
print(sorted(score_pd['category'].unique()))
print(len(score_pd['category'].unique()))
In [12]:
# fix unknown values in train and test
brand_unknowns =\
list(set(data_pd['brand'].unique()) - set(score_pd['brand'].unique())) +\
list(set(score_pd['brand'].unique()) - set(data_pd['brand'].unique()))
print(brand_unknowns)
data_pd.loc[data_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
score_pd.loc[score_pd['brand'].isin(brand_unknowns), 'brand'] = 'unknown'
print(sorted(data_pd['brand'].unique()))
print(len(data_pd['brand'].unique()))
print(sorted(score_pd['brand'].unique()))
print(len(score_pd['brand'].unique()))
In [13]:
# fix unknown values in train and test
company_unknowns =\
list(set(data_pd['company'].unique()) - set(score_pd['company'].unique())) +\
list(set(score_pd['company'].unique()) - set(data_pd['company'].unique()))
print(company_unknowns)
data_pd.loc[data_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
score_pd.loc[score_pd['company'].isin(company_unknowns), 'company'] = 'unknown'
print(sorted(data_pd['company'].unique()))
print(len(data_pd['company'].unique()))
print(sorted(score_pd['company'].unique()))
print(len(score_pd['company'].unique()))
In [14]:
# fix unknown values in train and test
offer_unknowns =\
list(set(data_pd['offer'].unique()) - set(score_pd['offer'].unique())) +\
list(set(score_pd['offer'].unique()) - set(data_pd['offer'].unique()))
print(offer_unknowns)
data_pd.loc[data_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
score_pd.loc[score_pd['offer'].isin(offer_unknowns), 'offer'] = 'unknown'
print(sorted(data_pd['offer'].unique()))
print(len(data_pd['offer'].unique()))
print(sorted(score_pd['offer'].unique()))
print(len(score_pd['offer'].unique()))
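The six cells above repeat the same pattern once per column. A minimal sketch of a helper that consolidates them (the function name is hypothetical; the symmetric difference ^ collects levels present in only one of the two frames):

# sketch: consolidate the repeated unknown-level handling above
def mark_unknowns(train_df, test_df, col):
    # levels that appear in exactly one of the two frames
    unknowns = set(train_df[col].unique()) ^ set(test_df[col].unique())
    train_df.loc[train_df[col].isin(unknowns), col] = 'unknown'
    test_df.loc[test_df[col].isin(unknowns), col] = 'unknown'
    return sorted(unknowns)

for col in ['chain', 'market', 'category', 'brand', 'company', 'offer']:
    print(col, mark_unknowns(data_pd, score_pd, col))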
In [15]:
# import and start h2o
# set a seed for reproducibility
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
SEED = 12345
In [16]:
# enforce same measurement levels in h2o and pandas
col_types = {'chain': 'enum',
             'offer': 'enum',
             'market': 'enum',
             'category': 'enum',
             'company': 'enum',
             'brand': 'enum',
             'exact_item_bought': 'enum'}
data_h2o = h2o.H2OFrame(data_pd, column_types=col_types)
score_h2o = h2o.H2OFrame(score_pd, column_types=col_types)
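A quick check (a sketch using the H2OFrame.types property) that the enum types took effect:

# sketch: list the columns h2o parsed as enum
print({k: v for k, v in data_h2o.types.items() if v == 'enum'})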
In [17]:
# expand date into new features
data_h2o = data_h2o.concat(data_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
data_h2o = data_h2o.concat(data_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
data_h2o = data_h2o.concat(data_h2o['offerdate'].day().rename({'offerdate': 'day'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].month().rename({'offerdate': 'month'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].week().rename({'offerdate': 'week'}).asfactor())
score_h2o = score_h2o.concat(score_h2o['offerdate'].dayOfWeek().rename({'offerdate': 'dayOfWeek'}))
score_h2o = score_h2o.concat(score_h2o['offerdate'].day().rename({'offerdate': 'day'}))
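A light sanity check (a sketch) that the derived date columns landed in both frames:

# sketch: all four derived columns should now be present
for fr in (data_h2o, score_h2o):
    assert all(c in fr.columns for c in ['month', 'week', 'dayOfWeek', 'day'])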
In [18]:
# look at training data
data_h2o.describe()
In [19]:
# look at test data
score_h2o.describe()
In [20]:
# quantity is unary (a single constant value), so drop it
# drop other unusable variables
# set modeling roles
drops = ['id', 'chain', 'market', 'offerdate', 'quantity']
y = 'repeater'
X = [name for name in data_h2o.columns if name not in [y] + drops]
print(y)
print(X)
In [21]:
# create modeling partitions
train, valid, test = data_h2o.split_frame([0.4, 0.3], seed=SEED)
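A quick way (a sketch) to confirm the requested 40/30/30 proportions roughly held:

# sketch: fraction of rows in each partition
n = data_h2o.nrow
print([round(fr.nrow / n, 2) for fr in (train, valid, test)])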
In [22]:
# check shape
train.shape
Out[22]:
In [23]:
# check shape
valid.shape
Out[23]:
In [24]:
# check shape
test.shape
Out[24]:
In [25]:
# elastic net regularized regression
# - L1 penalty for variable selection
# - L2 penalty for handling multicollinearity
# - IRLSM solver for model fitting
# - lambda search to tune the regularization strength
# initialize
rptr_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='rptr_glm1',
                                         solver='IRLSM',
                                         nfolds=3,
                                         standardize=True,
                                         seed=SEED,
                                         lambda_search=True)
# train
rptr_glm.train(X, y, training_frame=train, validation_frame=valid)
In [26]:
# check for stability across folds -- looks good
rptr_glm.cross_validation_metrics_summary().as_data_frame()
Out[26]:
In [27]:
# train AUC
rptr_glm.auc(valid=False)
Out[27]:
In [28]:
# valid AUC
rptr_glm.auc(valid=True)
Out[28]:
In [29]:
# test AUC
rptr_glm.model_performance(test).auc()
Out[29]:
In [30]:
# many validation metrics
rptr_glm.model_performance(valid)
Out[30]:
In [31]:
# print coefficients
for key_, coef_ in rptr_glm.coef().items():
    print(key_, coef_)
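Since lambda search applies an L1 penalty, many coefficients are zeroed out; a sketch to count the survivors:

# sketch: coefficients that survived L1 selection
nonzero = {k: v for k, v in rptr_glm.coef().items() if v != 0.0}
print(len(nonzero), 'nonzero coefficients')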
In [32]:
# preview ids in the score set (looking for id == 13584134)
score_h2o['id'].asfactor().head()
Out[32]:
In [33]:
# predict probabilities for the score set (the row for id == 13584134 is among them)
rptr_glm.predict(score_h2o)
Out[33]:
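To pull the probability for the single customer of interest (a sketch; it assumes id was parsed as numeric, so the literal below is an int):

# sketch: attach predictions and filter to the row for id == 13584134
preds = rptr_glm.predict(score_h2o)
scored = score_h2o.cbind(preds)
scored[scored['id'] == 13584134, :]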
In [ ]: