In [174]:
import numpy as np
import matplotlib.pyplot as plt

In [112]:
pima = np.genfromtxt('pima-indians-diabetes.data', delimiter=',')

def sghmc(Y, X, stogradU, M, eps, m, theta, C, V):
    n = X.shape[0]
    p = X.shape[1]

    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(M.shape[0]), M)[:, np.newaxis]

    # Precompute
    B = 0.5 * V * eps
    D = 2*(C - B)*eps
    Minv = np.linalg.inv(M)

    # Hamiltonian dynamics (note: nbatch is taken from the enclosing scope)
    for i in range(m):
        theta = theta + (eps*Minv @ r).ravel()
        r = r - eps*stogradU(theta, Y, X, nbatch) - eps*C @ Minv @ r \
            + np.random.multivariate_normal(np.zeros(M.shape[0]), D)[:, np.newaxis]

    return theta

# Note: this first version is redefined below; the version actually used returns grad/n.
def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    grad = -n/nbatch * X[batch_id,:].T @ (Y[batch_id][:,np.newaxis] -
        1/(1+np.exp(-X[batch_id,:] @ theta[:,np.newaxis]))) - theta[:,np.newaxis]/alpha
    return grad

def logistic(x):
    return 1/(1+np.exp(-x))

def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)

    Y_pred = logistic(X[batch_id,:] @ theta[:,np.newaxis])
    epsilon = Y[batch_id][:,np.newaxis] - Y_pred
    grad = -n/nbatch * X[batch_id,:].T @ epsilon - theta[:,np.newaxis]/alpha

    return grad/n

In [119]:
# Load data
X = np.concatenate((np.ones((pima.shape[0],1)),pima[:,0:8]), axis=1)
Y = pima[:,8]

# Standardize; note X[:,0] is constant, so the first column of Xs becomes all zeros
Xs = (X - np.mean(X, axis=0))/np.concatenate((np.ones(1),np.std(X[:,1:], axis=0)))
n, p = X.shape

nsample = 1
nbatch = 768
M = np.identity(p)
C = 0 * np.identity(p)
eps = 0.1
m = 10
V = 0 * np.identity(p)
theta = np.zeros(p)
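
The sghmc sampler defined above is never invoked elsewhere in this notebook; the cell below is a hypothetical driver that mirrors the HMC loops further down. With C = V = 0 the friction and injected-noise terms are zero, so this reduces to plain stochastic-gradient HMC with no MH correction (increase nsample for a real run). The names sghmc_samples and theta_s are illustrative only.


In [ ]:
# Hypothetical driver for sghmc (not part of the original notebook)
np.random.seed(2)
sghmc_samples = np.zeros((nsample, p))
theta_s = np.zeros(p)
for i in range(nsample):
    # each call runs m steps of the discretized dynamics with minibatch gradients
    theta_s = sghmc(Y, X, stogradU, M, eps, m, theta_s, C, V)
    sghmc_samples[i] = theta_s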

In [765]:
### HMC version
def logistic(x):
    return 1/(1+np.exp(-x))

def U(theta, Y, X):
    return - (Y.T @ X @ theta - np.sum(np.log(1+np.exp(X @ theta))) - 0.5 * phi * np.sum(theta**2))

def gradU(theta, Y, X, nbatch):
    '''A function that returns the full-data gradient of U, scaled by 1/n.
    Adapted from Eq. 5. Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, unused here (kept for interface compatibility with stogradU)
    '''
    n = X.shape[0]
    
    Y_pred = logistic(X @ theta)
    epsilon = (Y[:,np.newaxis] - Y_pred[:,np.newaxis])
    grad = X.T @ epsilon - phi * theta[:, np.newaxis]

    return -grad/n
    #temp = -grad/n
    #return temp / np.linalg.norm(temp)


def hmc(Y, X, gradU, M, eps, m, theta, C, V):
    theta0 = theta.copy()
    
    # This is just HMC for testing
    n = X.shape[0]
    p = X.shape[1]
    
    # Precompute
    Minv = np.linalg.inv(M)
    
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(p),M)[:,np.newaxis]
    
    # Initial energy
    H0 = U(theta, Y, X) + 0.5 * np.asscalar(r.T @ Minv @ r)
    
    # Hamiltonian dynamics
    r = r - (eps/2)*gradU(theta, Y, X, nbatch)
    for i in range(m):
        theta = theta + (eps*Minv@r).ravel()
        r = r - eps*gradU(theta, Y, X, nbatch)
    theta = theta + (eps*Minv@r).ravel()
    r = r - (eps/2)*gradU(theta, Y, X, nbatch)  
    
    # Final energy
    H1 = U(theta, Y, X) + np.asscalar(0.5 * r.T @ Minv @ r)
    
    # MH step
    u = np.random.uniform()
    #rho = np.exp(H1 - H0)
    rho = np.exp(H0 - H1)
    #print('(H0, H1, rho): %s,%s,%s' % (H0, H1, rho))
    
    if u < np.min((1, rho)):
        return theta.copy()   # accept
    else:
        return theta0.copy()  # reject

def my_gd(Y, X, gradU, M, eps, m, theta, C, V):
    # gradient descent
    n = X.shape[0]
    p = X.shape[1]
    
    for i in range(m):
        theta = theta - eps*gradU(theta, Y, X, nbatch).ravel()
        
    return theta
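
For reference, U and gradU above implement the negative log-posterior of the logistic model with a Gaussian prior of precision phi (read off directly from the code):

$$U(\theta) = -\Big(y^T X\theta - \sum_{i=1}^n \log\big(1 + e^{x_i^T\theta}\big) - \tfrac{\phi}{2}\,\|\theta\|^2\Big), \qquad \nabla U(\theta) = -X^T\big(y - \sigma(X\theta)\big) + \phi\,\theta,$$

with $\sigma(z) = 1/(1+e^{-z})$ the logistic function; gradU returns $\nabla U(\theta)/n$.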

Correct coefficients


In [731]:
from sklearn.linear_model import LogisticRegression

In [732]:
# Unscaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(X,Y)
beta_true_unscale = mod_logis.coef_.ravel()
beta_true_unscale


Out[732]:
array([ -8.31498612e+00,   1.22560027e-01,   3.49183220e-02,
        -1.34118967e-02,   6.28219471e-04,  -1.17179659e-03,
         8.86606033e-02,   9.30419443e-01,   1.46781178e-02])

In [733]:
# Scaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(Xs,Y)
beta_true_scale = mod_logis.coef_.ravel()
beta_true_scale


Out[733]:
array([ 0.        ,  0.39024907,  1.08791914, -0.24544979,  0.02250608,
       -0.1621995 ,  0.59035938,  0.32483104,  0.12120845])

In [760]:
U(np.ones(p)*.1,Y,X)


Out[760]:
16642.096817683432

In [771]:
phi = 1
gradU(np.ones(p)*.1, Y, X, 1)*n


Out[771]:
array([[   500.09988232],
       [  1649.09976291],
       [ 54990.09137731],
       [ 34092.0995767 ],
       [  9832.09982455],
       [ 34396.09999878],
       [ 15152.19962158],
       [   214.96697893],
       [ 15595.09738417]])
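
A hypothetical finite-difference check (not in the original notebook): gradU returns the gradient of U scaled by 1/n, so n * gradU should reproduce a central difference of U at the same point, matching the values printed above. The names theta_chk, h, and fd are illustrative only.


In [ ]:
# assumes phi = 1, as set in the cell above
phi = 1
theta_chk = np.ones(p) * .1
h = 1e-5
fd = np.array([(U(theta_chk + h*np.eye(p)[j], Y, X) -
                U(theta_chk - h*np.eye(p)[j], Y, X)) / (2*h) for j in range(p)])
# largest discrepancy; should be small relative to the gradient entries above
np.max(np.abs(fd - n * gradU(theta_chk, Y, X, nbatch).ravel()))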

Our code - HMC


In [772]:
# HMC - Unscaled
nsample = 10000
m = 20
eps = .0001
#theta = np.zeros(p)
theta = beta_true_unscale.copy()
phi = 0.01

np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample)
for i in range(nsample):
    theta = hmc(Y, X, gradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, X)
    
np.mean(samples, axis=0) - beta_true_unscale


Out[772]:
array([  7.84926902e-02,   1.25885115e-02,   4.00451155e-04,
        -1.45856533e-03,  -1.01913591e-04,  -2.07337557e-05,
         1.59065406e-03,   2.80808621e-03,  -3.79604038e-03])
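
A rough acceptance-rate estimate for the chain above (hypothetical diagnostic, not in the original notebook): a rejected proposal repeats the previous theta exactly, so counting how often successive draws differ approximates the MH acceptance rate.


In [ ]:
# fraction of transitions in which theta changed, i.e. the proposal was accepted
np.mean(np.any(np.diff(samples, axis=0) != 0, axis=1))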

In [775]:
plt.plot((samples - beta_true_unscale)[:,4])
plt.show()



In [757]:
plt.plot(u)
plt.show()



In [752]:
beta_true_unscale


Out[752]:
array([ -8.31498612e+00,   1.22560027e-01,   3.49183220e-02,
        -1.34118967e-02,   6.28219471e-04,  -1.17179659e-03,
         8.86606033e-02,   9.30419443e-01,   1.46781178e-02])

In [776]:
# HMC - Scaled
nsample = 10000
m = 20
eps = .001
theta = np.zeros(p)
#theta = beta_true_scale.copy()
phi = 0.1

np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample)
for i in range(nsample):
    theta = hmc(Y, Xs, gradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs)
    
np.mean(samples, axis=0) - beta_true_scale


Out[776]:
array([  1.77287426e+00,  -1.20903988e-02,   1.39210349e-03,
         6.95503380e-03,   7.29648752e-03,  -1.66820537e-02,
        -6.06719146e-03,  -2.00404523e-03,  -7.61733226e-03])

In [777]:
plt.plot((samples - beta_true_scale)[:,1])
plt.show()
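
A rough mixing check for the scaled chain (hypothetical, not in the original notebook): the sample autocorrelation of the component plotted above.


In [ ]:
# autocorrelation of component 1 of the scaled HMC chain, out to lag 100
plt.acorr(samples[:,1] - np.mean(samples[:,1]), maxlags=100)
plt.show()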



In [670]:
plt.plot(u)
plt.show()



In [711]:
# HMC - Scaled (no intercept)
nsample = 10000
m = 20
eps = .001
theta = np.zeros(p-1)
#theta = beta_true_scale.copy()[1:]
phi = 1

np.random.seed(2)
samples = np.zeros((nsample, p-1))
u = np.zeros(nsample)
for i in range(nsample):
    theta = hmc(Y, Xs[:,1:], gradU, np.identity(p-1), eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs[:,1:])
    
np.mean(samples, axis=0) - beta_true_scale[1:]


Out[711]:
array([ 0.00837124,  0.0476728 , -0.00164605,  0.00438147, -0.01766982,
       -0.01875592, -0.00289623, -0.02114125])

In [715]:
plt.plot((samples - beta_true_scale[1:])[:,5])
plt.show()



In [692]:
plt.plot(u)
plt.show()


Our code - Gradient descent


In [324]:
# Gradient descent - Unscaled
np.random.seed(2)
#res = my_gd(Y, X, gradU, M, .0001, 10000, np.zeros(p), C, V) # Starting at zero
#res = my_gd(Y, X, gradU, M, .0001, 10000, beta_true_unscale.copy(), C, V) # Starting at true values

res = my_gd(Y, X, gradU, M, .0001, 10000, beta_true_unscale.copy(), C, V) # Starting at true values

res - beta_true_unscale


Out[324]:
array([ -2.19413632e-04,   2.36331548e-04,   7.48727924e-06,
         2.31994484e-06,   9.33934233e-06,  -3.92206222e-07,
        -2.15577090e-05,   1.00758322e-04,  -4.00087824e-05])

In [325]:
# Gradient descent - Scaled
np.random.seed(2)
res = my_gd(Y, Xs, gradU, M, .1, 20000, np.zeros(p), C, V)

res - beta_true_scale


Out[325]:
array([  0.00000000e+00,   4.17596046e-06,   5.92833240e-06,
         3.17764100e-06,   8.84196257e-06,  -2.98662348e-06,
        -1.76233441e-05,   8.02028628e-06,  -7.35280641e-06])

Cliburn's code


In [12]:
# Cliburn's gradient descent code

def gd(X, y, beta, alpha, niter):
    """Gradient descent algorihtm."""
    n, p = X.shape
    Xt = X.T
    for i in range(niter):
        y_pred = logistic(X @ beta)
        epsilon = y - y_pred
        grad = Xt @ epsilon / n
        beta += alpha * grad
    return beta

In [13]:
# Unscaled
#res = gd(X, Y.ravel(), np.zeros(p), alpha=.1, niter=2) # Starting at zero
res = gd(X, Y.ravel(), beta_true_unscale.copy(), alpha=.0001, niter=10000) # Starting at true coefficients

res - beta_true_unscale


Out[13]:
array([ -2.19413632e-04,   2.36331548e-04,   7.48727924e-06,
         2.31994484e-06,   9.33934233e-06,  -3.92206222e-07,
        -2.15577090e-05,   1.00758322e-04,  -4.00087824e-05])

In [14]:
# Scaled
res = gd(Xs, Y.ravel(), np.zeros(p), alpha=.1, niter=20000)

res - beta_true_scale


Out[14]:
array([  0.00000000e+00,   4.17596046e-06,   5.92833240e-06,
         3.17764100e-06,   8.84196257e-06,  -2.98662348e-06,
        -1.76233441e-05,   8.02028628e-06,  -7.35280641e-06])
