In [31]:
import numpy as np
import matplotlib.pyplot as plt

In [32]:
pima = np.genfromtxt('pima-indians-diabetes.data', delimiter=',')

def sghmc(Y, X, stogradU, M, eps, m, theta, C, V):
    n = X.shape[0]
    p = X.shape[1]

    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(M.shape[0]), M)[:, np.newaxis]

    # Precompute
    B = 0.5 * V * eps
    D = 2*(C - B)*eps
    Minv = np.linalg.inv(M)

    # Hamiltonian dynamics
    for i in range(m):
        theta = theta + (eps*Minv @ r).ravel()
        r = r - eps*stogradU(theta, Y, X, nbatch) - eps*C @ Minv @ r \
            + np.random.multivariate_normal(np.zeros(M.shape[0]), D)[:, np.newaxis]

    return theta

def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    grad = -n/nbatch * X[batch_id,:].T @ (Y[batch_id][:,np.newaxis] - \
        1/(1+np.exp(-X[batch_id,:] @ theta[:,np.newaxis]))) - theta[:,np.newaxis]/alpha
    return grad

def logistic(x):
    return 1/(1+np.exp(-x))

def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)

    Y_pred = logistic(X[batch_id,:] @ theta[:,np.newaxis])
    epsilon = (Y[batch_id][:,np.newaxis] - Y_pred)
    grad = -n/nbatch * X[batch_id,:].T @ epsilon - theta[:,np.newaxis]/alpha

    return grad/n

In [33]:
# Load data
X = np.concatenate((np.ones((pima.shape[0],1)),pima[:,0:8]), axis=1)
Y = pima[:,8]

Xs = (X - np.mean(X, axis=0))/np.concatenate((np.ones(1),np.std(X[:,1:], axis=0)))
n, p = X.shape

M = np.identity(p)
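
A quick check of the standardization (a sketch, not executed here): centering also zeros out the intercept column of Xs, since that column of X is identically 1, which is why the scaled reference fit below has a zero intercept coefficient.

In [ ]:
# Sanity-check the standardization: the intercept column of Xs is all zeros,
# and the remaining columns have mean 0 and standard deviation 1.
print(np.allclose(Xs[:, 0], 0))
print(np.allclose(Xs[:, 1:].mean(axis=0), 0))
print(np.allclose(Xs[:, 1:].std(axis=0), 1))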

In [116]:
### HMC version
def logistic(x):
    return 1/(1+np.exp(-x))

def U(theta, Y, X):
    return - (Y.T @ X @ theta - np.sum(np.log(1+np.exp(X @ theta))) - 0.5 * phi * np.sum(theta**2))

def gradU(theta, Y, X, nbatch):
    '''A function that returns the full-data gradient of U, scaled by 1/n.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, unused; kept so gradU and stogradU share the same signature
    '''
    n = X.shape[0]
    
    Y_pred = logistic(X @ theta)
    epsilon = (Y[:,np.newaxis] - Y_pred[:,np.newaxis])
    grad = X.T @ epsilon - phi * theta[:, np.newaxis]

    return -grad/n

def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    n, p = X.shape
    
    # Sample minibatch
    batch_id = np.random.choice(np.arange(n),nbatch,replace=False)
    
    Y_pred = logistic(X[batch_id,:] @ theta[:,np.newaxis])
    epsilon = (Y[batch_id,np.newaxis] - Y_pred)
    grad = n/nbatch * X[batch_id,:].T @ epsilon - phi * theta[:, np.newaxis]

    #return -grad/n
    return -grad


def sghmc(Y, X, gradU, M, eps, m, theta, C, V):
    
    n, p = X.shape
    
    # Precompute
    Minv = np.linalg.inv(M)
    B = 0.5 * V * eps
    D = 2*(C-B)*eps
    
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(p),M)[:,np.newaxis]
    
    # Hamiltonian dynamics
    #r = r - (eps/2)*gradU(theta, Y, X, nbatch)
    for i in range(m):        
        theta = theta + (eps*Minv@r).ravel()
        r = r - eps*stogradU(theta, Y, X, nbatch) - eps*C @ Minv @ r \
            + np.random.multivariate_normal(np.zeros(p),D)[:,np.newaxis]
    #theta = theta + (eps*Minv@r).ravel()
    #r = r - (eps/2)*gradU(theta, Y, X, nbatch)  
    
    return theta

def my_gd(Y, X, gradU, M, eps, m, theta, C, V):
    # gradient descent
    n = X.shape[0]
    p = X.shape[1]
    
    for i in range(m):
        theta = theta - eps*gradU(theta, Y, X, nbatch).ravel()
        
    return theta
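
For reference, a sketch of the discretized dynamics we intend sghmc to implement (the momentum is resampled at the start of each call, and one returned sample corresponds to m steps of the loop; notation follows the code, with mass matrix M, friction C, and assumed gradient-noise covariance V):

$$\theta_{t+1} = \theta_t + \epsilon M^{-1} r_t$$

$$r_{t+1} = r_t - \epsilon \nabla \tilde{U}(\theta_{t+1}) - \epsilon C M^{-1} r_t + \mathcal{N}\big(0,\; 2(C - \hat{B})\epsilon\big), \qquad \hat{B} = \tfrac{1}{2}\epsilon V$$

For Bayesian logistic regression with a $N(0, \phi^{-1} I)$ prior and a minibatch $\mathcal{B}$ of size nbatch, the stochastic gradient computed by stogradU is

$$\nabla \tilde{U}(\theta) = -\Big[\frac{n}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} x_i \big(y_i - \sigma(x_i^{\top}\theta)\big) - \phi\, \theta\Big],$$

where $\sigma$ is the logistic function; gradU evaluates the same expression on the full data and divides by $n$.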

Correct coefficients


In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Unscaled
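# sklearn's C is the inverse regularization strength; C=1e50 makes the L2 penalty
# negligible, so these are (numerically) unpenalized MLE coefficients used as the reference.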
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(X,Y)
beta_true_unscale = mod_logis.coef_.ravel()
beta_true_unscale


Out[36]:
array([ -8.31498612e+00,   1.22560027e-01,   3.49183220e-02,
        -1.34118967e-02,   6.28219471e-04,  -1.17179659e-03,
         8.86606033e-02,   9.30419443e-01,   1.46781178e-02])

In [37]:
# Scaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(Xs,Y)
beta_true_scale = mod_logis.coef_.ravel()
beta_true_scale


Out[37]:
array([ 0.        ,  0.39024907,  1.08791914, -0.24544979,  0.02250608,
       -0.1621995 ,  0.59035938,  0.32483104,  0.12120845])

In [19]:
X.shape, Y.shape


Out[19]:
((768, 9), (768,))

In [38]:
U(np.ones(p)*.1,Y,X)


Out[38]:
16642.097267683432

In [39]:
gradU(np.ones(p)*.1, Y, X, 1)*n


Out[39]:
array([[   500.00088232],
       [  1649.00076291],
       [ 54989.99237731],
       [ 34092.0005767 ],
       [  9832.00082455],
       [ 34396.00099878],
       [ 15152.10062158],
       [   214.86797893],
       [ 15594.99838417]])

In [40]:
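# Sanity check: with nbatch = 768 (the full sample size) the minibatch is the whole
# data set, so the stochastic gradient should reproduce the full-data gradient above.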
stogradU(np.ones(p)*.1, Y, X, 768)*n


Out[40]:
array([[   500.00088232],
       [  1649.00076291],
       [ 54989.99237731],
       [ 34092.0005767 ],
       [  9832.00082455],
       [ 34396.00099878],
       [ 15152.10062158],
       [   214.86797893],
       [ 15594.99838417]])

Our code - SGHMC


In [46]:
# HMC - Unscaled
nsample = 100
m = 20
eps = .0001
#theta = np.zeros(p)
theta = beta_true_unscale.copy()
phi = 0.01
nbatch = 500
C = 0 * np.identity(p)
V = 0 * np.identity(p)
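# With C = V = 0 the friction and injected-noise terms both vanish, so this run is
# effectively naive stochastic-gradient HMC (no friction, no MH correction).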

np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample) 
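# Each sghmc call resamples the momentum, runs m dynamics steps from the current theta,
# and returns one draw.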
for i in range(nsample):
    theta = sghmc(Y, X, stogradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, X)
    
np.mean(samples, axis=0) - beta_true_unscale


Out[46]:
array([-0.01989821, -0.00538216,  0.00134785, -0.00025997,  0.0208011 ,
        0.01279253, -0.00012713, -0.00163586, -0.0071869 ])

In [48]:
plt.plot((samples - beta_true_unscale)[:,3])
plt.show()



In [757]:
plt.plot(u)
plt.show()



In [716]:
# SGHMC - Scaled
nsample = 10000
m = 20
eps = .001
theta = np.zeros(p)
#theta = beta_true_scale.copy()
phi = 0.1
nbatch = 768
C = 0 * np.identity(p)
V = 0 * np.identity(p)

np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample)
for i in range(nsample):
    theta = sghmc(Y, Xs, stogradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs)
    
np.mean(samples, axis=0) - beta_true_scale


Out[716]:
array([ 1.62428113, -0.00560003,  0.00625545,  0.01065339,  0.00263785,
       -0.01313617, -0.00682813,  0.00379025, -0.01856026])

In [722]:
plt.plot((samples - beta_true_scale)[:,1])
plt.show()



In [670]:
plt.plot(u)
plt.show()



In [147]:
# SGHMC - Scaled (no intercept)
nsample = 1000
m = 20
eps = .002
theta = np.zeros(p-1)
#theta = beta_true_scale.copy()[1:]
phi = 5
nbatch = 500
C = 1 * np.identity(p-1)
V = 0 * np.identity(p-1)
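# Nonzero C adds friction; with V = 0 the estimated noise term B-hat is zero,
# so the injected noise covariance is simply 2*C*eps.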

np.random.seed(2)
samples = np.zeros((nsample, p-1))
u = np.zeros(nsample)
for i in range(nsample):
    theta = sghmc(Y, Xs[:,1:], stogradU, np.identity(p-1), eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs[:,1:])
    
np.mean(samples, axis=0) - beta_true_scale[1:]


Out[147]:
array([ 0.0014384 , -0.05489451,  0.00552889,  0.04475773,  0.00330102,
       -0.04933019, -0.01584681,  0.00078972])

In [152]:
plt.plot((samples - beta_true_scale[1:])[:,0])
plt.show()



In [149]:
plt.plot(u)
plt.show()


Our code - Gradient descent


In [324]:
# Gradient descent - Unscaled
np.random.seed(2)
#res = my_gd(Y, X, gradU, M, .0001, 10000, np.zeros(p), C, V) # Starting at zero
#res = my_gd(Y, X, gradU, M, .0001, 10000, beta_true_unscale.copy(), C, V) # Starting at true values

res = my_gd(Y, X, gradU, M, .0001, 10000, beta_true_unscale.copy(), C, V) # Starting at true values

res - beta_true_unscale


Out[324]:
array([ -2.19413632e-04,   2.36331548e-04,   7.48727924e-06,
         2.31994484e-06,   9.33934233e-06,  -3.92206222e-07,
        -2.15577090e-05,   1.00758322e-04,  -4.00087824e-05])

In [325]:
# Gradient descent - Scaled
np.random.seed(2)
res = my_gd(Y, Xs, gradU, M, .1, 20000, np.zeros(p), C, V)

res - beta_true_scale


Out[325]:
array([  0.00000000e+00,   4.17596046e-06,   5.92833240e-06,
         3.17764100e-06,   8.84196257e-06,  -2.98662348e-06,
        -1.76233441e-05,   8.02028628e-06,  -7.35280641e-06])


Cliburn's code


In [12]:
# Cliburn's gradient descent code

def gd(X, y, beta, alpha, niter):
    """Gradient descent algorihtm."""
    n, p = X.shape
    Xt = X.T
    for i in range(niter):
        y_pred = logistic(X @ beta)
        epsilon = y - y_pred
        grad = Xt @ epsilon / n
        beta += alpha * grad
    return beta

In [13]:
# Unscaled
#res = gd(X, Y.ravel(), np.zeros(p), alpha=.1, niter=2) # Starting at zero
res = gd(X, Y.ravel(), beta_true_unscale.copy(), alpha=.0001, niter=10000) # Starting at true coefficients

res - beta_true_unscale


Out[13]:
array([ -2.19413632e-04,   2.36331548e-04,   7.48727924e-06,
         2.31994484e-06,   9.33934233e-06,  -3.92206222e-07,
        -2.15577090e-05,   1.00758322e-04,  -4.00087824e-05])

In [14]:
# Scaled
res = gd(Xs, Y.ravel(), np.zeros(p), alpha=.1, niter=20000)

res - beta_true_scale


Out[14]:
array([  0.00000000e+00,   4.17596046e-06,   5.92833240e-06,
         3.17764100e-06,   8.84196257e-06,  -2.98662348e-06,
        -1.76233441e-05,   8.02028628e-06,  -7.35280641e-06])
