In [1]:
%matplotlib inline
In [2]:
# Learning about Ridge Regression
# Ridge Regression introduces a regularization parameter to
# "shrink" coefficients.
# Useful when the dataset has collinear features (features that are
# nearly linear combinations of one another), which makes ordinary
# least-squares estimates unstable.
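In [ ]:
# A minimal sketch of the idea (not scikit-learn's implementation,
# and it ignores the intercept for brevity): ridge regression
# minimizes ||y - Xw||^2 + alpha * ||w||^2, which has the closed-form
# solution w = (X^T X + alpha * I)^(-1) X^T y.
# The alpha * I term is what "shrinks" the coefficients.
import numpy as np

def ridge_closed_form(X, y, alpha=1.0):
    n_features = X.shape[1]
    return np.linalg.solve(X.T @ X + alpha * np.eye(n_features), X.T @ y)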
In [3]:
# For this example we need a dataset with a low effective rank
# From linear algebra, the rank of a matrix is the number of
# independent rows or columns.
# Important fact: Linear Regression assumes the design matrix has
# full column rank (meaning all the feature columns are linearly
# independent)
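In [ ]:
# Quick illustration of rank, using nothing beyond NumPy: the third
# column below is the sum of the first two, so the matrix has only
# 2 linearly independent columns.
import numpy as np
A = np.array([[1., 0., 1.],
              [0., 1., 1.],
              [2., 3., 5.],
              [4., 1., 5.]])
np.linalg.matrix_rank(A)  # -> 2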
In [28]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
reg_data, reg_target = make_regression(n_samples=2000, n_features=3, effective_rank=2, noise=10)
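In [ ]:
# Optional sanity check: with effective_rank=2, the singular values of
# reg_data should decay sharply after the second one, so the matrix is
# "nearly" rank 2 even though it is technically full rank.
# (Exact values vary from run to run.)
import numpy as np
np.linalg.svd(reg_data, compute_uv=False)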
In [29]:
import numpy as np
n_bootstraps = 1000
len_data = len(reg_data)
lr = LinearRegression()
In [30]:
subsample_size = int(0.75 * len_data)
# draw a bootstrap subsample of indices (with replacement, the default
# for np.random.choice)
subsample = lambda: np.random.choice(np.arange(0, len_data), size=subsample_size)
In [31]:
coefs = np.ones((n_bootstraps, 3))
for i in range(n_bootstraps):
    subsample_idx = subsample()
    subsample_X = reg_data[subsample_idx]
    subsample_y = reg_target[subsample_idx]
    lr.fit(subsample_X, subsample_y)
    coefs[i] = lr.coef_
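In [ ]:
# How spread out are the bootstrapped estimates? A quick numeric
# summary to go with the histograms below (exact values vary run to
# run because of the random subsampling above).
coefs.mean(axis=0), coefs.std(axis=0)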
In [32]:
import matplotlib.pyplot as plt
f = plt.figure(figsize=(7, 5))
ax = f.add_subplot(111)
ax.hist([c[0] for c in coefs], color='r', bins=30)
ax.hist([c[1] for c in coefs], color='g', bins=30)
ax.hist([c[2] for c in coefs], color='b', bins=30)
ax.set_title("Histogram of coefficients for Lin Reg")
Out[32]: [figure: overlaid histograms of the three linear-regression coefficients]
In [33]:
f, ax = plt.subplots(figsize=(10, 7), nrows=3)
ax[0].hist([c[0] for c in coefs], color='r', bins=10)
ax[1].hist([c[1] for c in coefs], color='g', bins=10)
ax[2].hist([c[2] for c in coefs], color='b', bins=10)
ax[0].set_title("Histogram of coefficients for Lin Reg")
f.tight_layout()
Out[33]: [figure: three stacked histograms, one per linear-regression coefficient]
In [34]:
from sklearn.linear_model import Ridge
In [35]:
r = Ridge()  # default regularization strength: alpha=1.0
In [36]:
n_bootstraps = 1000
len_data = len(reg_data)
subsample_size = int(0.75 * len_data)
In [37]:
subsample = lambda: np.random.choice(np.arange(0, len_data), size=subsample_size)
In [38]:
coefs_r = np.ones((n_bootstraps, 3))
In [39]:
for i in range(n_bootstraps):
    subsample_idx = subsample()
    subsample_X = reg_data[subsample_idx]
    subsample_y = reg_target[subsample_idx]
    r.fit(subsample_X, subsample_y)
    coefs_r[i] = r.coef_
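In [ ]:
# Same numeric summary for the ridge estimates, for comparison with
# the linear-regression run above (values vary run to run).
coefs_r.mean(axis=0), coefs_r.std(axis=0)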
In [40]:
f, ax = plt.subplots(figsize=(10, 7), nrows=3)
ax[0].hist([c[0] for c in coefs_r], color='r', bins=10)
ax[1].hist([c[1] for c in coefs_r], color='g', bins=10)
ax[2].hist([c[2] for c in coefs_r], color='b', bins=10)
ax[0].set_title("Coefficient 0")
ax[1].set_title("Coefficient 1")
ax[2].set_title("Coefficient 2")
f.tight_layout()
Out[40]: [figure: three stacked histograms, one per ridge coefficient]
In [46]:
f, ax = plt.subplots(figsize=(10, 7), nrows=3)
ax[0].hist([c[0] for c in coefs_r], color='r', bins=10, label='Ridge')
ax[1].hist([c[1] for c in coefs_r], color='r', bins=10, label='Ridge')
ax[2].hist([c[2] for c in coefs_r], color='r', bins=10, label='Ridge')
ax[0].hist([c[0] for c in coefs], color='b', alpha=.5, bins=10, label='Linear')
ax[1].hist([c[1] for c in coefs], color='b', alpha=.5, bins=10, label='Linear')
ax[2].hist([c[2] for c in coefs], color='b', alpha=.5, bins=10, label='Linear')
ax[0].legend(loc='best')
ax[1].legend(loc='best')
ax[2].legend(loc='best')
ax[0].set_title("Coefficient 0")
ax[1].set_title("Coefficient 1")
ax[2].set_title("Coefficient 2")
f.tight_layout()
Out[46]: [figure: ridge vs. linear coefficient histograms overlaid, one panel per coefficient]
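In [ ]:
# The takeaway in numbers rather than pictures: the ratio of the
# bootstrap variances. Values below 1 mean the ridge estimates vary
# less across subsamples than the plain linear-regression ones
# (exact numbers depend on the random draws above).
np.var(coefs_r, axis=0) / np.var(coefs, axis=0)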
In [ ]: