Baysian Ridge and Lasso regression


In [578]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
import pymc
import sys

%matplotlib inline

In [585]:
n = 10000

x1 = norm.rvs(0, 1, size=n) + norm.rvs(0, 10**-3, size=n)
x2 = -x1 + norm.rvs(0, 10**-3, size=n)
x3 = norm.rvs(0, 1, size=n)

X = np.column_stack([x1, x2, x3])
y = 10 * x1 + 10 * x2 + 0.1 * x3

Exercise: implement ordinary least squares

$$ \beta = (X^{T}X)^{-1}X^{T}y $$

In [586]:
#your code here
np.dot(np.dot(np.linalg.inv(np.dot(X.T, X)), X.T), y)


Out[586]:
array([10. , 10. ,  0.1])

Bayesian ordinary least squares


In [587]:
beta_min = -10**6
beta_max = 10**6

beta1_ols = pymc.Uniform(name='beta1', lower=beta_min, upper=beta_max)
beta2_ols = pymc.Uniform(name='beta2', lower=beta_min, upper=beta_max)
beta3_ols = pymc.Uniform(name='beta3', lower=beta_min, upper=beta_max)

@pymc.deterministic
def y_hat_ols(beta1=beta1_ols, beta2=beta2_ols, beta3=beta3_ols, x1=x1, x2=x2, x3=x3):
    return beta1 * x1 + beta2 * x2 + beta3 * x3

Y_ols = pymc.Normal(name='Y', mu=y_hat_ols, tau=1.0, value=y, observed=True)

ols_model = pymc.Model([Y_ols, beta1_ols, beta2_ols, beta3_ols])
ols_map = pymc.MAP(ols_model)
ols_map.fit(method='fmin_l_bfgs_b', iterlim=100000, tol=.0001)

def get_coefficients(map_):
    return [{str(variable): variable.value} for variable in map_.variables if str(variable).startswith('beta')]

get_coefficients(ols_map)


Out[587]:
[{'beta1': array(9.999992647027748)},
 {'beta3': array(0.09999999949981003)},
 {'beta2': array(9.999992647143856)}]

Exercise: bayesian ridge regression

Note baysian ridge regression is constructed by assuming normal prior on betas, use $tau=1.0, mu=0$


In [588]:
#your code here

tau = 1.0
beta1_ridge = pymc.Normal('beta1', mu=0, tau=tau)
beta2_ridge = pymc.Normal('beta2', mu=0, tau=tau)
beta3_ridge = pymc.Normal('beta3', mu=0, tau=tau)

@pymc.deterministic
def y_hat_ridge(beta1=beta1_ridge, beta2=beta2_ridge, beta3=beta3_ridge, x1=x1, x2=x2, x3=x3):
    return beta1 * x1 + beta2 * x2 + beta3 * x3

Y_ridge = pymc.Normal('Y', mu=y_hat_ridge, tau=1.0, value=y, observed=True)

ridge_model = pymc.Model([Y_ridge, beta1_ridge, beta2_ridge, beta3_ridge])
ridge_map = pymc.MAP(ridge_model)
ridge_map.fit(method='fmin_l_bfgs_b', iterlim=1000, tol=.0001)

compare to scikit learn


In [590]:
from sklearn.linear_model import RidgeCV

skl_ridge_model = RidgeCV(fit_intercept=False)
skl_ridge_model.fit(X, y)

print ("scikit results")
print (skl_ridge_model.coef_)
print ("bayesian results")
print (get_coefficients(ridge_map))


scikit results
[ 0.47753588  0.47747008  0.09998158]
bayesian results
[{'beta2': array(0.04986004809962333)}, {'beta3': array(0.09997146897270294)}, {'beta1': array(0.049928845398971244)}]

Exercise: bayesian lasso regression

Note baysian lasso regression is constructed by assuming normal prior on betas, use $tau=1.0 * sqrt(2 * sigma), mu=0$


In [591]:
sigma = 1.0e1
b =  np.sqrt(2.0 * sigma)

beta1_lasso = pymc.Laplace('beta1', mu=0, tau=1.0 * b)
beta2_lasso = pymc.Laplace('beta2', mu=0, tau=1.0 * b)
beta3_lasso = pymc.Laplace('beta3', mu=0, tau=1.0 * b)

@pymc.deterministic
def y_hat_lasso(beta1=beta1_lasso, beta2=beta2_lasso, beta3=beta3_lasso, x1=x1, x2=x2, x3=x3):
    return beta1 * x1 + beta2 * x2 + beta3 * x3

Y_lasso = pymc.Normal('Y', mu=y_hat_lasso, tau=1.0, value=y, observed=True)

lasso_model = pymc.Model([Y_lasso, beta1_lasso, beta2_lasso, beta3_lasso])
lasso_map = pymc.MAP(lasso_model)
lasso_map.fit(method='fmin_l_bfgs_b', iterlim=10000, tol=.0001)


Out[591]:
[{'beta3': array(0.0994902103176779)},
 {'beta2': array(1.5699064750795007e-05)},
 {'beta1': array(1.4451544851713915e-05)}]

Compare to scikit learn


In [593]:
from sklearn.linear_model import LassoCV

skl_lasso_model = LassoCV(fit_intercept=False)
skl_lasso_model.fit(X, y)

print ("Scikit results")
print (skl_lasso_model.coef_)

print ("Bayesian results")
get_coefficients(lasso_map)


Scikit results
[ 0.         -0.          0.09988218]
Bayesian results
Out[593]:
[{'beta3': array(0.0994902103176779)},
 {'beta2': array(1.5699064750795007e-05)},
 {'beta1': array(1.4451544851713915e-05)}]

Ridge and Lasso on Boston dataset


In [573]:
from sklearn.datasets import load_boston
boston = load_boston()
(n, d)= boston.data.shape
print (n, d)


506 13

In [595]:
#lasso
sigma = 1.0e3
b =  np.sqrt(2.0 * sigma2)
beta_lasso = pymc.Laplace('beta', mu=0, tau=1.0 * b, size=(d,1))

@pymc.deterministic
def y_hat_lasso(beta=beta_lasso, x=boston.data):
    return np.dot(x, beta).T

Y_lasso = pymc.Normal('Y', mu=y_hat_lasso, tau=1.0, value=boston.target, observed=True)
lasso_model = pymc.Model([Y_lasso, beta_lasso])
lasso_map = pymc.MAP(lasso_model)
lasso_map.fit(method='fmin_l_bfgs_b', iterlim=1000, tol=.0001)

In [575]:
skl_lasso_model = LassoCV(fit_intercept=False)
skl_lasso_model.fit(boston.data, boston.target)


Out[575]:
array([-0.        ,  0.10985957, -0.        ,  0.        ,  0.        ,
        0.        ,  0.07909977,  0.        , -0.        ,  0.00556566,
        0.        ,  0.05332189, -0.48125905])

In [599]:
#Ridge
beta_ridge = pymc.Normal('beta', mu=0, tau=1.0, size=(d,1))

@pymc.deterministic
def y_hat_ridge(beta=beta_ridge, x=boston.data):
    return np.dot(x, beta).T

Y_ridge = pymc.Normal('Y', mu=y_hat_ridge, tau=1.0, value=boston.target, observed=True)
ridge_model = pymc.Model([Y_ridge, beta_ridge])
ridge_map = pymc.MAP(ridge_model)
ridge_map.fit(method='fmin_l_bfgs_b', iterlim=1000, tol=.0001)

In [600]:
skl_ridge_model = RidgeCV(fit_intercept=False)
skl_ridge_model.fit(boston.data, boston.target)
skl_ridge_model.coef_


Out[600]:
array([-0.09167603,  0.05109077, -0.0128826 ,  2.17031927, -0.27569278,
        5.66203246, -0.00571236, -0.92332479,  0.17363205, -0.00999711,
       -0.36592389,  0.01526748, -0.44222824])

In [601]:
print ("Scikit lasso results")
skl_lasso_model.coef_

print ("Bayesian lasso results")
get_coefficients(lasso_map)[0]['beta'][:, 0]


Scikit lasso results
Bayesian lasso results
Out[601]:
array([ -9.06958078e-02,   4.89279611e-02,   1.81810284e-04,
         4.94448158e-01,  -4.29956043e-04,   5.73292063e+00,
        -4.54058458e-03,  -8.95827391e-01,   1.83368027e-01,
        -1.08034967e-02,  -3.94875862e-01,   1.53232089e-02,
        -4.45124800e-01])

In [602]:
print ("Scikit Ridge results")
print(skl_ridge_model.coef_)

print ("Bayesian Ridge results")
print (get_coefficients(ridge_map)[0]['beta'])


Scikit Ridge results
[-0.09167603  0.05109077 -0.0128826   2.17031927 -0.27569278  5.66203246
 -0.00571236 -0.92332479  0.17363205 -0.00999711 -0.36592389  0.01526748
 -0.44222824]
Bayesian Ridge results
[[-0.09120855]
 [ 0.0491298 ]
 [-0.01264773]
 [ 2.74258039]
 [-1.05110452]
 [ 5.82607127]
 [-0.0087232 ]
 [-0.95247011]
 [ 0.17202232]
 [-0.00979164]
 [-0.38919641]
 [ 0.01488236]
 [-0.42646519]]