notebook.community

Edit and run



In [9]:

    
%matplotlib inline
# Linear regression fits a straight line (a hyperplane when there are several features) through the data



In [2]:

    
from sklearn.datasets import load_boston



In [10]:

    
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt



In [6]:

    
# Load the Boston housing dataset and create a linear regression estimator
# with the library's default settings.
boston = load_boston()
lr = LinearRegression()



In [7]:

    
# Fit the model on the full dataset: the feature matrix is passed as X and the
# target vector as y. The call is the cell's last expression, so the fitted
# estimator's repr is displayed as the cell output.
lr.fit(X=boston.data, y=boston.target)









    Out[7]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [8]:

    
# In-sample predictions: the model is evaluated on the same data it was fit on.
predictions = lr.predict(X=boston.data)



In [13]:

    
# Histogram of the in-sample errors of the first model.
# Note: the errors are computed as (predicted - observed), i.e. the negative
# of the more common (observed - predicted) residual convention.
f, ax = plt.subplots(figsize=(7,5))
ax.hist(predictions - boston.target, label='Residuals', color='b', alpha=.5)
ax.set_title('Histogram of Residuals')
# tight_layout() only accounts for what is already on the figure when it is
# called, so it runs after the histogram and title have been added.
f.tight_layout()
# Kept as the last expression so the cell output is still the Legend object.
ax.legend(loc='best')









    Out[13]:





<matplotlib.legend.Legend at 0x115701350>



In [14]:

    
# Display the fitted coefficients of the first model (one value per feature).
lr.coef_









    Out[14]:





array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
         2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
         7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
        -1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
        -5.25466633e-01])



In [15]:

    
# A common pattern for pairing each feature name with its fitted coefficient.
# list(...) materializes the pairs: on Python 3 zip() returns a lazy iterator
# whose repr shows no values, while on Python 2 the result is the same list
# as before.
list(zip(boston.feature_names, lr.coef_))









    Out[15]:





[('CRIM', -0.10717055656035547),
 ('ZN', 0.046395219529799255),
 ('INDUS', 0.020860239532169256),
 ('CHAS', 2.6885613993178934),
 ('NOX', -17.795758660308845),
 ('RM', 3.804752460258007),
 ('AGE', 0.00075106170332241325),
 ('DIS', -1.4757587965198151),
 ('RAD', 0.3056550383390973),
 ('TAX', -0.012329346305270163),
 ('PTRATIO', -0.95346355469055943),
 ('B', 0.0093925127221890312),
 ('LSTAT', -0.5254666329007851)]



In [16]:

    
# We can normalize and scale the inputs before regression:
# NOTE(review): the `normalize` option has been removed from recent
# scikit-learn releases -- confirm the installed version still supports it.
lr2 = LinearRegression(normalize=True)



In [17]:

    
# Fit the normalizing model on the same feature matrix (X) and target (y) as
# the first model. The fitted estimator's repr is displayed as the cell output.
lr2.fit(X=boston.data, y=boston.target)









    Out[17]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)



In [18]:

    
# In-sample predictions from the normalizing model, on the same data it was
# fit on.
predictions2 = lr2.predict(X=boston.data)



In [20]:

    
# Overlay the error histograms of the original and the normalizing model on
# one set of axes; alpha=.5 keeps both visible where they overlap.
# Note: the errors are computed as (predicted - observed), i.e. the negative
# of the more common (observed - predicted) residual convention.
f, ax = plt.subplots(figsize=(10,10))
ax.hist(predictions - boston.target, label='Original', color='b', alpha=.5)
ax.hist(predictions2 - boston.target, label='Normalized', color='r', alpha=.5)
ax.set_title('Histogram of Residuals')
# tight_layout() only accounts for what is already on the figure when it is
# called, so it runs after the histograms and title have been added.
f.tight_layout()
# Kept as the last expression so the cell output is still the Legend object.
ax.legend(loc='best')









    Out[20]:





<matplotlib.legend.Legend at 0x1156807d0>



In [ ]:

    
# How this works:
# Find the coefficients (Beta) that satisfy y = X*Beta, where X
# is the data matrix. Since an exact solution rarely exists,
# an error term (Err) is added, so the model becomes
# y = (X*Beta) + Err, and Beta is chosen to minimize the sum of squared errors.
# Note: Err is assumed to be normally distributed and independent
# of X.