Copyright (C) 2017 J. Patrick Hall, jphall@gwu.edu
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
In [1]:
# imports
import pandas as pd # import pandas for easy data manipulation using data frames
import numpy as np # import numpy for numeric calculations on matrices
import time # for timers
# import h2o to check calculations
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
In [2]:
# data-related constants
IN_FILE_PATH = '../data/loan_clean.csv'
Y = 'STD_IMP_REP_loan_amnt'
DROPS = ['id', 'GRP_REP_home_ownership', 'GRP_addr_state', 'GRP_home_ownership',
'GRP_purpose', 'GRP_verification_status', '_WARN_']
# model-related constants
LEARN_RATE = 0.05 # how much each gradient descent step impacts parameters
CONV = 1e-10 # convergence threshold: stop when the change in loss between iterations falls below this value
MAX_ITERS = 10000 # maximum number of gradient descent steps to allow
In [3]:
# import data using Pandas
raw = pd.read_csv(IN_FILE_PATH)
raw.describe()
Out[3]:
In [4]:
# select target column
y = raw[Y].values
print(y)
In [5]:
# create input matrix
# add an additional column of 1's for intercept
# by overlaying inputs onto matrix of 1's
numeric = raw.drop(DROPS + [Y], axis=1).values
N, p = numeric.shape
X = np.ones(shape=(N, p + 1))
X[:,1:] = numeric
The normal equation estimates coefficients that globally minimize the squared error between a rectangular input data matrix $X$ and a column vector of target values $y$. I.e., it gives the least-squares solution to an overdetermined system of linear equations.
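In this notation, the closed-form least-squares estimate, which the next cell computes step by step, is $\hat{\beta} = (X^{T}X)^{-1}X^{T}y$.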
In [6]:
# solve the normal equation: beta_hat = (X^T X)^{-1} X^T y
X_transpose = np.transpose(X)
beta_hat = np.linalg.inv(X_transpose.dot(X)) # (X^T X)^{-1}
beta_hat = beta_hat.dot(X_transpose)         # (X^T X)^{-1} X^T
beta_hat = beta_hat.dot(y)                   # (X^T X)^{-1} X^T y
print('Model parameters:\n', beta_hat)
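Explicitly inverting $X^{T}X$ can be numerically unstable when input columns are highly correlated. As a quick cross-check (a minimal sketch reusing the X, y, and beta_hat defined above), np.linalg.lstsq recovers the same estimate without forming the inverse explicitly:

# least-squares solve without an explicit inverse; should match beta_hat closely
beta_hat_lstsq, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
print(np.allclose(beta_hat, beta_hat_lstsq))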
In [7]:
def squared_loss(n, x, y, betas, lambda_=0):
""" Squared loss function for multiple linear regression.
:param n: Number of rows in x.
:param x: Matrix of numeric inputs.
:param y: Vector of known target values.
    :param betas: Vector of current model parameters.
    :param lambda_: Scale factor for L2 regularization, default 0.
    :return: Scalar value of the penalized, scaled MSE loss.
"""
yhat = x.dot(betas)
return (1 / (2 * n)) * (((y - yhat)**2).sum() + lambda_ * (betas**2).sum())
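As a quick sanity check on squared_loss (a minimal sketch, assuming X, y, and beta_hat from the earlier cells are still in memory), the loss at the normal-equation solution should be far smaller than the loss at an all-zero parameter vector:

n_rows = X.shape[0]
print(squared_loss(n_rows, X, y, beta_hat))               # loss at the least-squares solution
print(squared_loss(n_rows, X, y, np.zeros(X.shape[1])))   # loss at the zero starting point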
In [8]:
def grad(n, y, yhat, x, beta_j, lambda_=0):
    """ Analytical gradient of the scaled MSE loss function with L2 regularization,
    taken with respect to a single model parameter.
    :param n: Number of rows in x.
    :param y: Vector of known target values.
    :param yhat: Vector of predicted target values.
    :param x: Vector of input values for a single column.
    :param beta_j: Model parameter for which to calculate the gradient.
    :param lambda_: Scale factor for L2 regularization, default 0.
    :return: Scalar gradient value for beta_j.
    """
    return (1 / n) * ((x * (yhat - y)).sum() + lambda_ * beta_j)
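One way to gain confidence in the analytical gradient is to compare it to a centered finite-difference approximation of squared_loss. A minimal sketch, assuming X, y, squared_loss, and grad are defined as above; the column index j and the lambda_ value are arbitrary choices for illustration:

rng = np.random.RandomState(0)
betas_check = rng.normal(size=X.shape[1])  # arbitrary point at which to check the gradient
n_rows, j, eps = X.shape[0], 1, 1e-6
betas_plus, betas_minus = betas_check.copy(), betas_check.copy()
betas_plus[j] += eps
betas_minus[j] -= eps
numeric_grad = (squared_loss(n_rows, X, y, betas_plus, 0.01) -
                squared_loss(n_rows, X, y, betas_minus, 0.01)) / (2 * eps)
analytic_grad = grad(n_rows, y, X.dot(betas_check), X[:, j], betas_check[j], 0.01)
print(numeric_grad, analytic_grad)  # the two values should agree closely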
In [9]:
def grad_descent(X, y, learn_rate, max_iters, sgd_mini_batch_n=0, lambda_=0):
""" Routine for executing simple gradient descent with stochastic gradient descent option.
:param X: Matrix of numeric data.
:param y: Vector of known target values.
:param learn_rate: Learning rate.
:param max_iters: Maximum number of gradient descent steps to perform.
:param sgd_mini_batch_n: Minibatch size for sgd optimization.
If > 0 minibatch stochastic gradient descent is performed.
:param lambda_: Scale factor for L2 regularization, default 0.
"""
tic = time.time() # start timer
n_betas = X.shape[1] # number of model parameters including bias
betas = np.zeros(shape=n_betas) # parameters start with value of 0
n = y.shape[0] # number of rows in X
# Pandas dataframe for iteration history
    iteration_frame = pd.DataFrame(columns=['Iteration', 'Loss'], dtype=float)
print('Iteration history:')
# loop for gradient descent steps
for i in range(max_iters):
# stochastic gradient descent
if sgd_mini_batch_n > 0:
samp_idx = np.random.randint(n, size=sgd_mini_batch_n)
X_samp = X[samp_idx, :]
y_samp = y[samp_idx]
n_samp = X_samp.shape[0]
yhat_samp = X_samp.dot(betas) # model predictions for iteration
# loop for column-wise parameter updates
for j in range(n_betas):
# select column
# calculate column-wise gradient
# update corresponding parameter based on negative gradient
# calculate loss
xj_samp = X_samp[:, j]
beta_j = betas[j]
xj_grad_samp = grad(n_samp, y_samp, yhat_samp, xj_samp, beta_j, lambda_)
                betas[j] = betas[j] - learn_rate * xj_grad_samp
iter_loss = squared_loss(n_samp, X_samp, y_samp, betas, lambda_)
# standard gradient descent
else:
yhat = X.dot(betas) # model predictions for iteration
# loop for column-wise parameter updates
for j in range(n_betas):
xj = X[:, j]
beta_j = betas[j]
xj_grad = grad(n, y, yhat, xj, beta_j, lambda_)
                betas[j] = betas[j] - learn_rate * xj_grad
iter_loss = squared_loss(n, X, y, betas, lambda_)
# update loss history
        iteration_frame.loc[i] = [i, iter_loss]
# progress indicator
if i % 1000 == 0:
print('iter=%d loss=%.6f' % (i, iter_loss))
# convergence check
if i > 0:
if np.abs(iteration_frame.iat[i-1, 1] - iteration_frame.iat[i, 1]) < CONV:
break
# output
%matplotlib inline
iteration_frame.plot.line(title='Iteration Plot', x='Iteration', y='Loss')
print()
print('Model parameters at iteration ' + str(i) + ':')
print(betas)
print()
print('Model trained in %.2f s.' % (time.time()-tic))
In [10]:
grad_descent(X, y, LEARN_RATE, MAX_ITERS)
In [11]:
grad_descent(X, y, LEARN_RATE, MAX_ITERS, lambda_=0.01)
In [12]:
grad_descent(X, y, LEARN_RATE, MAX_ITERS, sgd_mini_batch_n=1000)
In [13]:
grad_descent(X, y, LEARN_RATE, MAX_ITERS, lambda_=0.01, sgd_mini_batch_n=1000)
In [14]:
# start h2o
h2o.init()
DROPS = ['id', 'GRP_REP_home_ownership', 'GRP_addr_state', 'GRP_home_ownership',
'GRP_purpose', 'GRP_verification_status', '_WARN_']
# numeric columns
train = h2o.import_file(IN_FILE_PATH)
train = train.drop(DROPS)
X = [name for name in train.col_names if name != Y] # predictor columns, excluding the target
# initialize non-penalized GLM model
loan_glm = H2OGeneralizedLinearEstimator(family='gaussian', # uses squared error
solver='IRLSM', # necessary for non-penalized GLM
standardize=False, # data is already standardized
compute_p_values=True, # necessary for non-penalized GLM
lambda_=0) # necessary for non-penalized GLM
# train
loan_glm.train(X, Y, training_frame=train)
# print trained model info
print()
print('Model parameters:')
for name, val in loan_glm.coef().items():
print(name, val)
print()
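# optional check (a sketch, assuming raw and beta_hat from the earlier cells are
# still in memory): line up the h2o coefficients with the normal-equation
# estimate by column name
h2o_coefs = loan_glm.coef()
names = ['Intercept'] + list(raw.drop(DROPS + [Y], axis=1).columns)
for name, b in zip(names, beta_hat):
    print(name, b, h2o_coefs.get(name))
print()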
# shutdown h2o
h2o.cluster().shutdown()