Here is the library of functions:
In [2]:
def logistic(x):
    '''Elementwise logistic (sigmoid) function.'''
    return 1 / (1 + np.exp(-x))
def U_logistic(theta, Y, X, phi):
    '''Potential energy: negative log posterior of logistic regression
    with an independent Gaussian prior of precision phi on each coefficient.'''
    return -(Y.T @ X @ theta
             - np.sum(np.log(1 + np.exp(X @ theta)))
             - 0.5 * phi * np.sum(theta**2))
def gradU_logistic(theta, Y, X, phi):
    '''Full-data gradient of the potential energy U_logistic,
    averaged over the n observations.'''
    n = X.shape[0]
    Y_pred = logistic(X @ theta)
    epsilon = (Y[:, np.newaxis] - Y_pred[:, np.newaxis])
    grad = X.T @ epsilon - phi * theta[:, np.newaxis]
    return -grad / n
def hmc(Y, X, U, gradU, M, eps, m, theta0, phi):
    '''One Hamiltonian Monte Carlo proposal of m leapfrog steps of size eps,
    followed by a Metropolis-Hastings correction. Returns the new state,
    the acceptance indicator, the acceptance ratio, and the energy.'''
    theta = theta0.copy()
    n, p = X.shape
    # Precompute the inverse mass matrix
    Minv = np.linalg.inv(M)
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(p), M)[:, np.newaxis]
    # Initial energy
    H0 = U(theta0, Y, X, phi) + 0.5 * (r.T @ Minv @ r).item()
    # Leapfrog integration of the Hamiltonian dynamics:
    # half momentum step, m position steps with full momentum steps
    # in between, and a final half momentum step
    r -= (eps / 2) * gradU(theta, Y, X, phi)
    for i in range(m):
        theta += (eps * Minv @ r).ravel()
        if i < m - 1:
            r -= eps * gradU(theta, Y, X, phi)
    r -= (eps / 2) * gradU(theta, Y, X, phi)
    # Final energy
    H1 = U(theta, Y, X, phi) + 0.5 * (r.T @ Minv @ r).item()
    # MH step
    u = np.random.uniform()
    rho = np.exp(H0 - H1)  # Acceptance probability
    if u < min(1, rho):
        # accept the proposal
        accept = True
        H = H1
    else:
        # reject and keep the current state
        theta = theta0
        accept = False
        H = H0
    return theta, accept, rho, H
def run_hmc(Y, X, U, gradU, M, eps, m, theta, phi, nsample):
    '''Draw nsample HMC samples, returning the samples together with the
    acceptance indicators, acceptance ratios, and energies.'''
    n, p = X.shape
    # Allocate space
    samples = np.zeros((nsample, p))
    accept = np.zeros(nsample)
    rho = np.zeros(nsample)
    H = np.zeros(nsample)
    # Run hmc
    for i in range(nsample):
        theta, accept[i], rho[i], H[i] = hmc(Y, X, U, gradU, M, eps, m, theta, phi)
        samples[i] = theta
    return samples, accept, rho, H
def stogradU(theta, Y, X, nbatch, phi):
    '''A function that returns the stochastic gradient. Adapted from Eq. 5.
    Inputs are:
        theta, the parameters
        Y, the response
        X, the covariates
        nbatch, the number of samples to take from the full data
        phi, the prior precision
    '''
    n, p = X.shape
    # Sample a minibatch without replacement
    batch_id = np.random.choice(np.arange(n), nbatch, replace=False)
    Y_pred = logistic(X[batch_id, :] @ theta[:, np.newaxis])
    epsilon = (Y[batch_id, np.newaxis] - Y_pred)
    # Rescale the minibatch likelihood gradient to the full data size
    grad = n / nbatch * X[batch_id, :].T @ epsilon - phi * theta[:, np.newaxis]
    # Unlike gradU_logistic, this gradient is not divided by n
    return -grad
def sghmc(Y, X, U, gradU, M, Minv, eps, m, theta, C, D, phi):
    '''One SGHMC trajectory of m steps with friction matrix C and injected
    noise covariance D. Uses the stochastic gradient stogradU (the gradU
    argument is kept for interface compatibility) and relies on the global
    nbatch set in the driver script.'''
    n, p = X.shape
    # Randomly sample momentum
    r = np.random.multivariate_normal(np.zeros(p), M)[:, np.newaxis]
    # Hamiltonian dynamics with stochastic gradient, friction, and injected noise
    for i in range(m):
        theta += (eps * Minv @ r).ravel()
        r += (-eps * stogradU(theta, Y, X, nbatch, phi)
              - eps * C @ Minv @ r
              + np.random.multivariate_normal(np.zeros(p), D)[:, np.newaxis])
    # Record the energy
    H = U(theta, Y, X, phi) + 0.5 * (r.T @ Minv @ r).item()
    return theta, H
def run_sghmc(Y, X, U, gradU, M, eps, m, theta, C, V, phi, nsample):
    '''Draw nsample SGHMC samples. C is the friction matrix and V the
    assumed covariance of the stochastic gradient noise.'''
    n, p = X.shape
    # Precompute
    Minv = np.linalg.inv(M)
    B = 0.5 * V * eps          # estimated noise from the stochastic gradient
    D = 2 * (C - B) * eps      # covariance of the injected noise
    # Allocate space
    samples = np.zeros((nsample, p))
    H = np.zeros(nsample)
    # Run sghmc
    for i in range(nsample):
        theta, H[i] = sghmc(Y, X, U, gradU, M, Minv, eps, m, theta, C, D, phi)
        samples[i] = theta
    return samples, H
def gd(Y, X, gradU, eps, m, theta, phi):
    '''Plain gradient descent on the potential U, run for m steps with step
    size eps; used as a sanity check against the MAP/MLE estimate.'''
    for i in range(m):
        theta -= eps * gradU(theta, Y, X, phi).ravel()
    return theta
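Before the driver script, a quick sanity check (not part of the original library) can confirm the scaling convention above: gradU_logistic returns the averaged gradient -grad/n, while stogradU returns the unaveraged -grad computed from a rescaled minibatch, so with the minibatch set to the full data the two should agree up to a factor of n. The toy data Xc, Yc below are made up purely for illustration.
In [ ]:
import numpy as np
np.random.seed(0)
Xc = np.random.normal(size=(100, 3))
Yc = np.random.binomial(1, 0.5, 100)
th = 0.1 * np.ones(3)
full = gradU_logistic(th, Yc, Xc, 1.0)          # averaged: -grad / n
sto = stogradU(th, Yc, Xc, Xc.shape[0], 1.0)    # unaveraged: -grad
print(np.allclose(sto / Xc.shape[0], full))     # expect True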
Everything after here is the script that runs the simulation:
In [3]:
import numpy as np
import matplotlib.pyplot as plt
In [4]:
n = 500
p = 50
beta = np.random.normal(0, 1, p+1)
Sigma = np.zeros((p, p))
Sigma_diags = np.array([25, 5, 0.2**2])
distribution = np.random.multinomial(p, pvals=[.05, .05, .9], size=1).tolist()
np.fill_diagonal(Sigma, np.repeat(Sigma_diags, distribution[0], axis=0))
X = np.random.multivariate_normal(np.zeros(p), Sigma, n)
X = np.hstack((np.ones((n, 1)), X))
prob = logistic(X @ beta)   # P(Y = 1 | X) under the true model
Y = np.random.binomial(1, prob, n)
In [5]:
Xs = (X - np.mean(X, axis=0))/np.concatenate((np.ones(1),np.std(X[:,1:], axis=0)))
Xs = Xs[:,1:]
p = Xs.shape[1]
In [6]:
from sklearn.linear_model import LogisticRegression
In [7]:
# Unscaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(X,Y)
beta_true_unscale = mod_logis.coef_.ravel()
beta_true_unscale
Out[7]:
In [8]:
# Scaled
mod_logis = LogisticRegression(fit_intercept=False, C=1e50)
mod_logis.fit(Xs,Y)
beta_true_scale = mod_logis.coef_.ravel()
beta_true_scale
Out[8]:
In [60]:
# HMC - Scaled
nsample = 1000
m = 20
eps = .0005
theta = np.zeros(p)
#theta = beta_true_scale.copy()
phi = 5
M = np.identity(p)
samples, accept, rho, H = run_hmc(Y, Xs, U_logistic, gradU_logistic, M, eps, m, theta, phi, nsample)
hmc_mean = np.mean(samples, axis=0)
np.mean(samples, axis=0) - beta_true_scale
Out[60]:
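As an additional diagnostic (not in the original notebook), the recorded acceptance indicators can be summarized to check that the step size eps is reasonable:
In [ ]:
print("HMC acceptance rate:", np.mean(accept))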
In [61]:
plt.plot((samples - beta_true_scale)[:,3])
plt.show()
In [62]:
plt.plot(H)
plt.show()
In [ ]:
plt.plot((samples - beta_true_unscale)[:,3])
plt.show()
plt.plot(H)
plt.show()
In [27]:
# SGHMC - Scaled (no intercept)
nsample = 1000
m = 20
eps = .01
theta = np.zeros(p)
#theta = beta_true_scale.copy()
phi = 5
nbatch = 500
C = 1 * np.identity(p)
V = 0 * np.identity(p)
M = np.identity(p)
samples, H = run_sghmc(Y, Xs, U_logistic, gradU_logistic, M, eps, m, theta, C, V, phi, nsample)
print(np.mean(samples, axis=0) - beta_true_scale)
In [28]:
plt.plot((samples - beta_true_scale)[:,0])
plt.show()
In [29]:
plt.plot(H)
plt.show()
In [ ]:
nsample = 1000
m = 20
eps = .00001
theta = np.zeros(p+1)
phi = 5
nbatch = 500
C = 1 * np.identity(p+1)
V = 0 * np.identity(p+1)
M = np.identity(p+1)
samples, H = run_sghmc(Y, X, U_logistic, gradU_logistic, M, eps, m, theta, C, V, phi, nsample)
print(np.mean(samples, axis=0) - beta_true_unscale)
plt.plot((samples - beta_true_unscale)[:,0])
plt.show()
plt.plot(H)
plt.show()
In [ ]:
# Gradient descent - Scaled
np.random.seed(2)
phi = .1
res = gd(Y, Xs, gradU_logistic, .1, 20000, np.zeros(p), phi)
res - beta_true_scale