In [31]:
import numpy as np
import matplotlib.pyplot as plt
In [32]:
pima = np.genfromtxt('pima-indians-diabetes.data', delimiter=',')
def sghmc(Y, X, stogradU, M, eps, m, theta, C, V):
    n, p = X.shape
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(M.shape[0]), M)[:, np.newaxis]
    # Precompute
    B = 0.5 * V * eps
    D = 2*(C - B)*eps
    Minv = np.linalg.inv(M)
    # Hamiltonian dynamics
    for i in range(m):
        theta = theta + (eps*Minv @ r).ravel()
        r = r - eps*stogradU(theta, Y, X, nbatch) - eps*C @ Minv @ r \
            + np.random.multivariate_normal(np.zeros(M.shape[0]), D)[:, np.newaxis]
    return theta
def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    grad = -n/nbatch * X[batch_id,:].T @ (Y[batch_id][:,np.newaxis] -
           1/(1+np.exp(-X[batch_id,:] @ theta[:,np.newaxis]))) - theta[:,np.newaxis]/alpha
    return grad
def logistic(x):
    return 1/(1+np.exp(-x))

def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    alpha = 5
    n = X.shape[0]
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    Y_pred = logistic(X[batch_id,:] @ theta[:,np.newaxis])
    epsilon = (Y[batch_id][:,np.newaxis] - Y_pred)
    grad = -n/nbatch * X[batch_id,:].T @ epsilon - theta[:,np.newaxis]/alpha
    return grad/n
In [33]:
# Load data
X = np.concatenate((np.ones((pima.shape[0],1)),pima[:,0:8]), axis=1)
Y = pima[:,8]
Xs = (X - np.mean(X, axis=0))/np.concatenate((np.ones(1),np.std(X[:,1:], axis=0)))
n, p = X.shape
M = np.identity(p)
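A quick sanity check on the loaded data (this cell is a sketch, not part of the original analysis): it prints the shapes of the design matrices, the values taken by the response, and the column means and standard deviations of the standardized matrix Xs.
# Sanity check (sketch): shapes, response values, and Xs column summaries
print(X.shape, Xs.shape, Y.shape)
print(np.unique(Y))                  # response should be binary {0, 1}
print(Xs.mean(axis=0).round(3))      # centered columns should have mean ~0
print(Xs.std(axis=0).round(3))       # and unit standard deviation (intercept column aside)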
In [116]:
### HMC version
def logistic(x):
    return 1/(1+np.exp(-x))

def U(theta, Y, X):
    return -(Y.T @ X @ theta - np.sum(np.log(1+np.exp(X @ theta))) - 0.5 * phi * np.sum(theta**2))
def gradU(theta, Y, X, nbatch):
    '''A function that returns the full-data gradient of U. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, unused here; kept so the signature matches stogradU
    '''
    n = X.shape[0]
    Y_pred = logistic(X @ theta)
    epsilon = (Y[:,np.newaxis] - Y_pred[:,np.newaxis])
    grad = X.T @ epsilon - phi * theta[:, np.newaxis]
    return -grad/n
def stogradU(theta, Y, X, nbatch):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
    '''
    n, p = X.shape
    # Sample minibatch
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    Y_pred = logistic(X[batch_id,:] @ theta[:,np.newaxis])
    epsilon = (Y[batch_id,np.newaxis] - Y_pred)
    grad = n/nbatch * X[batch_id,:].T @ epsilon - phi * theta[:, np.newaxis]
    #return -grad/n
    return -grad
def sghmc(Y, X, gradU, M, eps, m, theta, C, V):
    n, p = X.shape
    # Precompute
    Minv = np.linalg.inv(M)
    B = 0.5 * V * eps
    D = 2*(C - B)*eps
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(p), M)[:, np.newaxis]
    # Hamiltonian dynamics
    #r = r - (eps/2)*gradU(theta, Y, X, nbatch)
    for i in range(m):
        theta = theta + (eps*Minv @ r).ravel()
        # Momentum update with friction C and injected noise of covariance D = 2(C - B)*eps,
        # using the gradient function passed in as an argument
        r = r - eps*gradU(theta, Y, X, nbatch) - eps*C @ Minv @ r \
            + np.random.multivariate_normal(np.zeros(p), D)[:, np.newaxis]
        #theta = theta + (eps*Minv@r).ravel()
        #r = r - (eps/2)*gradU(theta, Y, X, nbatch)
    return theta
def my_gd(Y, X, gradU, M, eps, m, theta, C, V):
    # Gradient descent
    n, p = X.shape
    for i in range(m):
        theta = theta - eps*gradU(theta, Y, X, nbatch).ravel()
    return theta
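The experiments below simply set the gradient-noise estimate V to zero. If a data-driven value were wanted instead, one rough option is to estimate the covariance of the minibatch gradient empirically from repeated draws. The sketch below is not part of the original analysis (the name estimate_V and the number of replicates are arbitrary choices), and it assumes phi has been set, as it is in the sampling cells further down, since stogradU reads it as a global.
def estimate_V(theta, Y, X, nbatch, nrep=200):
    # Stack nrep minibatch gradients as columns (p x nrep) and take their covariance
    grads = np.hstack([stogradU(theta, Y, X, nbatch) for _ in range(nrep)])
    return np.cov(grads)

# Example (hypothetical settings):
# phi = 0.01
# V_hat = estimate_V(np.zeros(p), Y, X, nbatch=500)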
In [35]:
from sklearn.linear_model import LogisticRegression
In [36]:
# Unscaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(X,Y)
beta_true_unscale = mod_logis.coef_.ravel()
beta_true_unscale
Out[36]:
In [37]:
# Scaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(Xs,Y)
beta_true_scale = mod_logis.coef_.ravel()
beta_true_scale
Out[37]:
In [19]:
X.shape, Y.shape
Out[19]:
In [38]:
U(np.ones(p)*.1,Y,X)
Out[38]:
In [39]:
gradU(np.ones(p)*.1, Y, X, 1)*n
Out[39]:
In [40]:
stogradU(np.ones(p)*.1, Y, X, 768)*n
Out[40]:
In [46]:
# HMC - Unscaled
nsample = 100
m = 20
eps = .0001
#theta = np.zeros(p)
theta = beta_true_unscale.copy()
phi = 0.01
nbatch = 500
C = 0 * np.identity(p)
V = 0 * np.identity(p)
np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample)
for i in range(nsample):
    theta = sghmc(Y, X, stogradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, X)
np.mean(samples, axis=0) - beta_true_unscale
Out[46]:
In [48]:
plt.plot((samples - beta_true_unscale)[:,3])
plt.show()
In [757]:
plt.plot(u)
plt.show()
In [716]:
# SGHMC - Scaled
nsample = 10000
m = 20
eps = .001
theta = np.zeros(p)
#theta = beta_true_scale.copy()
phi = 0.1
nbatch = 768
C = 0 * np.identity(p)
V = 0 * np.identity(p)
np.random.seed(2)
samples = np.zeros((nsample, p))
u = np.zeros(nsample)
for i in range(nsample):
    theta = sghmc(Y, Xs, stogradU, M, eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs)
np.mean(samples, axis=0) - beta_true_scale
Out[716]:
In [722]:
plt.plot((samples - beta_true_scale)[:,1])
plt.show()
In [670]:
plt.plot(u)
plt.show()
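A posterior summary of the SGHMC draws above can be formed after discarding an initial portion of the chain. This is only a sketch (the 50% burn-in fraction and the 95% interval are arbitrary choices, not from the original analysis).
burn = nsample // 2                        # discard the first half as burn-in (arbitrary)
post = samples[burn:]
post_mean = post.mean(axis=0)              # posterior means of the coefficients
post_ci = np.percentile(post, [2.5, 97.5], axis=0)   # 95% credible intervals
print(post_mean)
print(post_ci)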
In [147]:
# HMC - Scaled (no intercept)
nsample = 1000
m = 20
eps = .002
theta = np.zeros(p-1)
#theta = beta_true_scale.copy()[1:]
phi = 5
nbatch = 500
C = 1 * np.identity(p-1)
V = 0 * np.identity(p-1)
np.random.seed(2)
samples = np.zeros((nsample, p-1))
u = np.zeros(nsample)
for i in range(nsample):
    theta = sghmc(Y, Xs[:,1:], stogradU, np.identity(p-1), eps, m, theta, C, V)
    samples[i] = theta
    u[i] = U(theta, Y, Xs[:,1:])
np.mean(samples, axis=0) - beta_true_scale[1:]
Out[147]:
In [152]:
plt.plot((samples - beta_true_scale[1:])[:,0])
plt.show()
In [149]:
plt.plot(u)
plt.show()
In [324]:
# Gradient descent - Unscaled
np.random.seed(2)
#res = my_gd(Y, X, gradU, M, .0001, 10000, np.zeros(p), C, V) # Starting at zero
res = my_gd(Y, X, gradU, M, .0001, 10000, beta_true_unscale.copy(), C, V) # Starting at true values
res - beta_true_unscale
Out[324]:
In [325]:
# Gradient descent - Scaled
np.random.seed(2)
res = my_gd(Y, Xs, gradU, M, .1, 20000, np.zeros(p), C, V)
res - beta_true_scale
Out[325]:
In [12]:
# Cliburn's gradient descent code
def gd(X, y, beta, alpha, niter):
"""Gradient descent algorihtm."""
n, p = X.shape
Xt = X.T
for i in range(niter):
y_pred = logistic(X @ beta)
epsilon = y - y_pred
grad = Xt @ epsilon / n
beta += alpha * grad
return beta
In [13]:
# Unscaled
#res = gd(X, Y.ravel(), np.zeros(p), alpha=.1, niter=2) # Starting at zero
res = gd(X, Y.ravel(), beta_true_unscale.copy(), alpha=.0001, niter=10000) # Starting at true coefficients
res - beta_true_unscale
Out[13]:
In [14]:
# Scaled
res = gd(Xs, Y.ravel(), np.zeros(p), alpha=.1, niter=20000)
res - beta_true_scale
Out[14]: