License


Copyright (C) 2017 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


In [1]:
import os

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
# connect to an H2O instance on localhost, starting a local server if none is running
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpqzjvh3lj
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpqzjvh3lj/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpqzjvh3lj/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 02 secs
H2O cluster version: 3.11.0.3873
H2O cluster version age: 11 days
H2O cluster name: H2O_from_python_phall_ju4dpx
H2O cluster total nodes: 1
H2O cluster free memory: 3.556 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
Python version: 3.5.2 final

In [3]:
# location of clean file
# The default is the original author's machine-specific absolute path; set the
# LOAN_CLEAN_CSV environment variable to point at loan_clean.csv elsewhere so
# the notebook can run on another machine without editing this cell.
path = os.environ.get(
    'LOAN_CLEAN_CSV',
    '/Users/phall/workspace/GWU_data_mining/03_regression/data/loan_clean.csv')

In [4]:
# measurement-level overrides applied when the file is parsed
#   - string columns are parsed as enums (nominal) automatically
#   - number columns are parsed as numeric automatically
#   - the target is coded as numbers, so it is forced to enum here
# NOTE(review): the integer-coded GRP_* columns are left to parse as int
# (numeric) - confirm that is the intended measurement level.
col_types = dict(bad_loan='enum')

In [5]:
# multi-threaded import of the clean file, applying the column type overrides
frame = h2o.import_file(
    path=path,
    col_types=col_types,
)


Parse progress: |█████████████████████████████████████████████████████████| 100%

In [6]:
# summarize the imported frame: row/column counts, per-column type,
# min/mean/max/sigma, zero and missing counts, then the first ten rows
frame.describe()


Rows:163987
Cols:18


id bad_loan GRP_REP_home_ownership GRP_addr_state GRP_home_ownership GRP_purpose GRP_verification_status _WARN_ STD_IMP_REP_annual_inc STD_IMP_REP_delinq_2yrs STD_IMP_REP_dti STD_IMP_REP_emp_length STD_IMP_REP_int_rate STD_IMP_REP_loan_amnt STD_IMP_REP_longest_credit_lengt STD_IMP_REP_revol_util STD_IMP_REP_term_length STD_IMP_REP_total_acc
type int enum int int int int int int real real real real real real real real real real
mins 10001.0 1.0 1.0 1.0 1.0 1.0 NaN -1.767455639 -0.39219617 -2.119639396 -1.6213902740000001 -1.907046215 -1.587129405 -2.22445124 -2.164541326 -0.516495577 -2.058861889
mean 91994.0 2.5740028172964924 11.409337325519703 2.5740028172964924 3.2449401476946345 2.340356247751345 0.0 2.38744452882879e-11 2.2959296297769782e-12 6.807013811211564e-11 -3.566867876239133e-11 -8.948753565861857e-12 8.311927579716105e-11 5.0612534090153816e-11 -1.4734128080190765e-11 -1.5009542966560638e-10 8.060924856225354e-13
maxs 173987.0 5.0 37.0 5.0 14.0 3.0 NaN 4.6180619798 4.1566950661 3.0371487270000004 1.2288169612 2.8376799992 2.7671323946 3.1431598296 3.0363495275 1.9718787627 3.0684672884
sigma 47339.11363414683 0.6675260435449262 9.971926133461404 0.6675260435449262 2.2672892075259754 0.5040864341768772 -0.0 0.9999999999982868 0.9999999999212518 1.0000000000037712 1.0000000000339833 1.0000000000199503 0.999999999985285 0.9999999999850594 1.000000000017688 1.0000000000642086 1.0000000000331841
zeros 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing 0 0 0 0 0 0 0 163987 0 0 0 0 0 0 0 0 0 0
0 10001.0 0 3.0 14.0 3.0 3.0 2.0 nan -1.1992995020000001 -0.39219617 1.5712460425 1.2288169612 -0.7047730510000001 -1.019182214 1.6839024850000002 1.1858716502 -0.516495577 -1.359278248
1 10002.0 1 3.0 10.0 3.0 8.0 2.0 nan -1.04507688 -0.39219617 -1.9861534850000002 -1.6213902740000001 0.3572732234 -1.3347084310000001 -0.42059567400000003 -1.7882703350000002 1.9718787627 -1.7965180230000002
2 10003.0 0 3.0 7.0 3.0 7.0 3.0 nan -1.501267394 -0.39219617 -0.9556422520000001 1.2288169612 0.5158905241 -1.34732948 -0.7212382690000001 1.7782983174 -0.516495577 -1.271830292
3 10004.0 0 3.0 2.0 3.0 4.0 2.0 nan -0.303921333 -0.39219617 0.5500788236 1.2288169612 -0.051913437 -0.388129779 0.0303682169 0.0325652593 -0.516495577 1.089264497
4 10005.0 0 3.0 14.0 3.0 10.0 2.0 nan -0.890854259 -0.39219617 -0.624597193 -0.7663281030000001 -1.3369434530000002 -1.019182214 -0.8220262690000001 -1.0317254690000002 -0.516495577 -1.0969343820000002
5 10006.0 0 3.0 2.0 3.0 8.0 2.0 nan -0.5824090160000001 -0.39219617 -1.4054897720000001 0.9437962377 1.1319693155000001 -1.271603188 -1.623166051 1.3379811999 -0.516495577 -1.7965180230000002
6 10007.0 1 4.0 2.0 4.0 7.0 2.0 nan -0.788039178 -0.39219617 -1.37879259 -0.48130738 1.7388529011 -0.9434559220000001 -1.17220216 -0.8596015050000001 1.9718787627 -1.0094864270000001
7 10008.0 1 3.0 4.0 3.0 4.0 2.0 nan -1.430633434 -0.39219617 0.2937858745 -1.6213902740000001 -0.235817553 -0.971853281 -1.17220216 -0.703489072 1.9718787627 -1.883965979
8 10009.0 0 4.0 14.0 4.0 2.0 3.0 nan 0.0344814697 -0.39219617 0.032153489 -0.196286656 0.2147475328 -0.8298664840000001 -0.270274377 -1.339947451 1.9718787627 -0.135006875
9 10010.0 0 4.0 2.0 4.0 2.0 2.0 nan 0.1115927805 -0.39219617 -0.680661276 1.2288169612 -0.235817553 -0.13570880500000002 1.0826172966 0.5213930910000001 -0.516495577 0.8269206315000001

In [7]:
# split into training and test sets (roughly 70% / 30%; no validation set is made)
# A fixed seed makes the random split, and therefore the AUC values measured
# on these partitions, reproducible when the notebook is re-run from scratch.
SPLIT_SEED = 12345
train, test = frame.split_frame([0.7], seed=SPLIT_SEED)

In [8]:
# assign target and inputs for logistic regression
# inputs are every column except the row id, the all-missing _WARN_ column
# and the target itself
# NOTE(review): GRP_REP_home_ownership and GRP_home_ownership show identical
# summary statistics in frame.describe() - they look like duplicates; confirm
# whether both should be kept as inputs.
y = 'bad_loan'
non_inputs = ['id', '_WARN_', y]
X = [col for col in frame.columns if col not in non_inputs]

In [9]:
# show the target name and the list of input names, one per line
print(y, X, sep='\n')


bad_loan
['GRP_REP_home_ownership', 'GRP_addr_state', 'GRP_home_ownership', 'GRP_purpose', 'GRP_verification_status', 'STD_IMP_REP_annual_inc', 'STD_IMP_REP_delinq_2yrs', 'STD_IMP_REP_dti', 'STD_IMP_REP_emp_length', 'STD_IMP_REP_int_rate', 'STD_IMP_REP_loan_amnt', 'STD_IMP_REP_longest_credit_lengt', 'STD_IMP_REP_revol_util', 'STD_IMP_REP_term_length', 'STD_IMP_REP_total_acc']

In [10]:
# set target to factor - for logistic regression
# the same conversion is applied to both partitions
for partition in (train, test):
    partition[y] = partition[y].asfactor()

In [11]:
# elastic net regularized regression
#   - binomial family for logistic regression
#   - L1 penalty for variable selection
#   - L2 penalty for handling multicollinearity
#   - IRLSM solver (the original note credits IRLS with handling outliers;
#     NOTE(review): confirm against the H2O GLM solver documentation)
#   - lambda search to tune the lambda parameter for variable selection

# initialize - settings collected in one place, then passed to the estimator
glm_settings = dict(
    family='binomial',
    model_id='loan_glm2',
    solver='IRLSM',
    standardize=True,
    lambda_search=True,
)
loan_glm = H2OGeneralizedLinearEstimator(**glm_settings)

# train on the training partition only
loan_glm.train(x=X, y=y, training_frame=train)

# view detailed results in H2O Flow at http://host:port/flow/index.html


glm Model Build progress: |███████████████████████████████████████████████| 100%

In [12]:
# measure train and test AUC
# training AUC comes from the metrics stored on the fitted model
train_auc = loan_glm.auc(train=True)
print(train_auc)

# test AUC is computed by scoring the held-out test partition
test_performance = loan_glm.model_performance(test_data=test)
print(test_performance.auc())


0.6810696481515511
0.6736120208845109

In [13]:
# print non-zero model parameters
# terms whose coefficient is exactly zero are skipped
for term, coefficient in loan_glm.coef().items():
    if coefficient == 0.0:
        continue
    print(term, ': ', coefficient)


GRP_REP_home_ownership :  0.020513214688462438
Intercept :  -1.7030791806036532
STD_IMP_REP_int_rate :  0.42135857374495084
GRP_verification_status :  -0.003938827536051814
STD_IMP_REP_term_length :  0.12487952945144505
STD_IMP_REP_dti :  0.14938938856970724
STD_IMP_REP_annual_inc :  -0.22640948089354618
STD_IMP_REP_delinq_2yrs :  0.007632494816369783
GRP_addr_state :  -0.0014574882705564829
GRP_purpose :  0.024782009588586243
STD_IMP_REP_revol_util :  0.07328727513849898
STD_IMP_REP_total_acc :  -0.10700058307918586
STD_IMP_REP_loan_amnt :  0.08314512862872152
STD_IMP_REP_longest_credit_lengt :  0.005262595727552371
GRP_home_ownership :  0.020513214688462626

In [14]:
# shut down the H2O cluster; prompt=False skips the interactive confirmation
h2o.cluster().shutdown(prompt=False)


H2O session _sid_96dd closed.