In [9]:
%matplotlib inline
# Linear regression fits a straight line (a hyperplane, once there is more than one feature) through the data
In [2]:
from sklearn.datasets import load_boston
In [10]:
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
In [6]:
lr = LinearRegression()
boston = load_boston()
In [7]:
lr.fit(boston.data, boston.target)
Out[7]:
In [8]:
predictions = lr.predict(boston.data)
In [13]:
f, ax = plt.subplots(figsize=(7,5))
f.tight_layout()
ax.hist(predictions - boston.target, label='Residuals', color='b', alpha=.5)
ax.set_title('Histogram of Residuals')
ax.legend(loc='best')
Out[13]:
In [14]:
lr.coef_
Out[14]:
In [15]:
# A common pattern for pairing each feature name
# with its coefficient is:
list(zip(boston.feature_names, lr.coef_))
Out[15]:
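The same pairs read more easily as a mapping; a small sketch building on the cell above (assumes boston and lr are still in scope):
In [ ]:
# Sketch: collect the (feature name, coefficient) pairs into a dict
dict(zip(boston.feature_names, lr.coef_))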
In [16]:
# We can normalize and scale the inputs before regression:
lr2 = LinearRegression(normalize=True)
In [17]:
lr2.fit(boston.data, boston.target)
Out[17]:
In [18]:
predictions2 = lr2.predict(boston.data)
In [20]:
f, ax = plt.subplots(figsize=(10,10))
f.tight_layout()
ax.hist(predictions - boston.target, label='Original', color='b', alpha=.5)
ax.hist(predictions2 - boston.target, label='Normalized', color='r', alpha=.5)
ax.set_title('Histogram of Residuals')
ax.legend(loc='best')
Out[20]:
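The normalize flag rescales the inputs inside LinearRegression itself. The same idea can be written explicitly with a scaling step in a pipeline; a minimal sketch (assuming scikit-learn's StandardScaler and make_pipeline, and noting that standardization is not numerically identical to normalize=True, though for plain linear regression the predictions come out the same):
In [ ]:
# Sketch: explicit scaling via a pipeline instead of normalize=True
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lr3 = make_pipeline(StandardScaler(), LinearRegression())
lr3.fit(boston.data, boston.target)
predictions3 = lr3.predict(boston.data)  # for OLS this matches the unscaled predictions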
In [ ]:
# How this works:
# Find the coefficients (Beta) that satisfy y = X * Beta, where X
# is the data matrix. Since we won't find an exact solution,
# an error term (Err) is added, y = (X * Beta) + Err, and Beta is
# chosen to minimize the sum of squared errors.
# Note: Err is assumed to be normally distributed and independent
# of X.
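That minimization can be carried out directly with NumPy's least-squares solver; a minimal sketch (assumes NumPy and the boston data and lr from the cells above; the appended column of ones models the intercept that LinearRegression fits automatically):
In [ ]:
# Sketch: solve min ||y - X*Beta||^2 with np.linalg.lstsq
import numpy as np

X = np.hstack([boston.data, np.ones((boston.data.shape[0], 1))])
beta, res_ss, rank, sv = np.linalg.lstsq(X, boston.target, rcond=None)
beta[:-1]  # should closely match lr.coef_
beta[-1]   # should closely match lr.intercept_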