In [9]:
%matplotlib inline
# Linear regression fits a straight line (a hyperplane when there are
# several features) through the data

In [2]:
from sklearn.datasets import load_boston

In [10]:
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt

In [6]:
# Load the Boston housing dataset bundled with scikit-learn, then create
# an unfitted linear regression estimator with default settings.
# NOTE(review): load_boston is not available in newer scikit-learn
# releases -- confirm against the installed version.
boston = load_boston()
lr = LinearRegression()

In [7]:
# Fit on the complete dataset (no train/test split is made here).
lr.fit(X=boston.data, y=boston.target)


Out[7]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# In-sample predictions: the model is evaluated on the same rows it was
# fitted on.
predictions = lr.predict(X=boston.data)

In [13]:
# Histogram of the in-sample prediction errors of the first model.
f, ax = plt.subplots(figsize=(7,5))
ax.hist(predictions - boston.target, label='Residuals', color='b', alpha=.5)

ax.set_title('Histogram of Residuals')
# Label the axes so the figure stands alone. The plotted quantity is
# predicted minus actual, i.e. the sign-flipped conventional residual.
ax.set_xlabel('Predicted - actual target')
ax.set_ylabel('Count')
# tight_layout only accounts for artists that exist when it is called,
# so run it after the title and axis labels have been set.
f.tight_layout()
# Kept as the last expression so the cell still displays the legend object.
ax.legend(loc='best')


Out[13]:
<matplotlib.legend.Legend at 0x115701350>

In [14]:
# Display the fitted coefficients (one value per input feature).
lr.coef_


Out[14]:
array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
         2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
         7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
        -1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
        -5.25466633e-01])

In [15]:
# A common pattern for pairing each feature name with its fitted
# coefficient. list() materializes the pairs so the cell displays them
# even on Python 3, where zip() returns a lazy iterator; on Python 2 the
# result is the same list as before.
list(zip(boston.feature_names, lr.coef_))


Out[15]:
[('CRIM', -0.10717055656035547),
 ('ZN', 0.046395219529799255),
 ('INDUS', 0.020860239532169256),
 ('CHAS', 2.6885613993178934),
 ('NOX', -17.795758660308845),
 ('RM', 3.804752460258007),
 ('AGE', 0.00075106170332241325),
 ('DIS', -1.4757587965198151),
 ('RAD', 0.3056550383390973),
 ('TAX', -0.012329346305270163),
 ('PTRATIO', -0.95346355469055943),
 ('B', 0.0093925127221890312),
 ('LSTAT', -0.5254666329007851)]

In [16]:
# We can normalize and scale the inputs before regression:
# NOTE(review): the `normalize` option was deprecated in later
# scikit-learn releases -- confirm it is supported by the installed version.
lr2 = LinearRegression(normalize=True)

In [17]:
# Fit the normalizing model on the same full dataset as the first model.
lr2.fit(X=boston.data, y=boston.target)


Out[17]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [18]:
# In-sample predictions from the normalizing model, for comparison with
# `predictions`.
predictions2 = lr2.predict(X=boston.data)

In [20]:
# Overlay the in-sample prediction errors of both models to compare them.
f, ax = plt.subplots(figsize=(10,10))
ax.hist(predictions - boston.target, label='Original', color='b', alpha=.5)

ax.hist(predictions2 - boston.target, label='Normalized', color='r', alpha=.5)
ax.set_title('Histogram of Residuals')
# Label the axes so the figure stands alone. The plotted quantity is
# predicted minus actual, i.e. the sign-flipped conventional residual.
ax.set_xlabel('Predicted - actual target')
ax.set_ylabel('Count')
# tight_layout only accounts for artists that exist when it is called,
# so run it after the title and axis labels have been set.
f.tight_layout()
# Kept as the last expression so the cell still displays the legend object.
ax.legend(loc='best')


Out[20]:
<matplotlib.legend.Legend at 0x1156807d0>

In [ ]:
# How this works:
# Find the coefficients (Beta) that satisfy y = X*Beta, where X
# is the data matrix. Since an exact solution generally does not exist,
# an error term (Err) is added, so the model becomes
# y = (X*Beta) + Err
# and Beta is chosen to minimize the sum of squared errors.
# Note: Err is assumed to be normally distributed and independent
# of X.