In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston
%matplotlib inline
def standardize(x):
return (x - np.mean(x)) / np.std(x)
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]
Linear regression fits a straight line through a given set of points, minimizing the sum of the squared errors.
Its expression is:

$$\hat{y} = X\beta + \beta_0$$

where:
- $X$ = matrix of predictors
- $\beta$ = vector of coefficients
- $\beta_0$ = constant term (bias)
In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Response vector and design matrix for a simple regression on the average
# number of rooms (RM). sm.add_constant prepends the intercept column that
# sm.OLS does not add on its own.
y = dataset['target']
X = sm.add_constant(dataset['RM'])
X.head()
Out[2]:
In [17]:
# Array-interface fit: explicit endog (y) and exog (X, intercept included).
# NOTE(review): both names are immediately re-bound by the formula-API cell
# below, and the execution count In[17] shows this cell was re-run out of
# order — under Restart & Run All, `fitted_model` ends up being the smf one.
linear_regression = sm.OLS(y, X)
fitted_model = linear_regression.fit()
In [4]:
# Equivalent fit via the R-style formula API; the intercept is added
# automatically by the formula, so no add_constant is needed here.
# This re-binds the names assigned by the previous cell.
linear_regression = smf.ols(formula='target ~ RM', data=dataset)
fitted_model = linear_regression.fit()
In [5]:
# Full regression report: coefficients, t-stats, R^2 and diagnostics.
fitted_model.summary()
Out[5]:
In [6]:
# Just the estimated parameters: the intercept and the RM slope.
fitted_model.params
Out[6]:
In [7]:
# Coefficients as a plain array: [intercept, RM slope].
betas = np.array(fitted_model.params)
# In-sample predictions for every observation.
fitted_values = fitted_model.predict(X)
fitted_values.head()
Out[7]:
In [8]:
# R^2 by hand: the first quantity is the squared error of the mean-only
# model (i.e. the total sum of squares, SS_tot), the second is the squared
# error of the regression (SS_res); R^2 = (SS_tot - SS_res) / SS_tot.
mean_sum_squared_errors = np.sum((dataset['target'] - dataset['target'].mean())**2)
regr_sum_squared_errors = np.sum((dataset['target'] - fitted_values)**2)
(mean_sum_squared_errors - regr_sum_squared_errors) / mean_sum_squared_errors
Out[8]:
In [9]:
# `scipy.stats.stats` was a private module, deprecated and since removed;
# import pearsonr from the public scipy.stats namespace instead.
from scipy.stats import pearsonr

# For simple linear regression, the squared Pearson correlation between
# predictor and target equals the model's R^2.
(pearsonr(dataset['RM'], dataset['target'])[0])**2
Out[9]:
In [10]:
# Raw residuals and their standardized (z-score) version, used below to
# spot observations beyond +/-3 standard deviations.
residuals = dataset['target'] - fitted_values
normalized_residuals = standardize(residuals)
normalized_residuals.head()
Out[10]:
In [13]:
# Residual diagnostic: standardized residuals vs the predictor, with the
# zero line and the +/-3 sigma outlier bounds. The shared x endpoints are
# computed once instead of three times as in the original.
line_x = [int(x_range[0]), round(x_range[1], 0)]
residual_scatter_plot = plt.plot(dataset['RM'], normalized_residuals, 'bp')
mean_residual = plt.plot(line_x, [0, 0], '-', color='red', linewidth=2)
upper_bound = plt.plot(line_x, [3, 3], '--', color='red', linewidth=1)
lower_bound = plt.plot(line_x, [-3, -3], '--', color='red', linewidth=1)
plt.grid()
In [19]:
# Predict the target for a new observation. Under a fresh top-to-bottom
# run, `fitted_model` comes from the formula API (smf.ols), whose predict
# expects named columns rather than a raw design-matrix row — so build a
# one-row frame instead of depending on which fitting cell ran last
# (the original only worked if the sm.OLS cell had been re-run).
RM = 5
Xp = pd.DataFrame({'RM': [RM]})
print('Our model predicts if RM = {0} the answer value is {1}'.format(RM, fitted_model.predict(Xp)[0]))
In [20]:
# Scatter of target vs RM with the sample means (dashed red) and the
# fitted regression line (orange) overlaid. The axis ranges are recomputed
# here so the cell is self-contained.
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]
scatter_plot = dataset.plot(kind='scatter', x='RM', y='target', xlim=x_range, ylim=y_range)
mean_y = scatter_plot.plot(x_range, [dataset['target'].mean(), dataset['target'].mean()], '--', color='red', linewidth=1)
mean_x = scatter_plot.plot([dataset['RM'].mean(), dataset['RM'].mean()], y_range, '--', color='red', linewidth=1)
regression_line = scatter_plot.plot(dataset['RM'], fitted_values, '-', color='orange', linewidth=1)
In [44]:
# Sanity check: predictions from the fitted model must equal X @ betas.
# Slice BOTH printouts to the first 10 rows so they are comparable — the
# original dumped all 506 dot-product predictions next to 10 fitted values.
predictions_by_dot_product = np.dot(X, betas)
print('Using the prediction method: {0}'.format(fitted_values[:10]))
print('Using betas and a dot product: {0}'.format(predictions_by_dot_product[:10]))
In [46]:
from sklearn import linear_model

# The `normalize` parameter was deprecated in scikit-learn 1.0 and removed
# in 1.2; False was its default, so it is simply omitted here.
linear_regression = linear_model.LinearRegression(fit_intercept=True)
In [47]:
observations = len(dataset)
# Bug fix: the original computed the reshape but never assigned it, so the
# fit below silently reused the statsmodels design matrix (with its extra
# constant column) instead of the intended single-feature matrix.
X = dataset['RM'].values.reshape((observations, 1))  # X must be 2-D: (n_samples, n_features)
y = dataset['target'].values  # y can be a 1-D vector
In [48]:
# Fit the scikit-learn model.
# NOTE(review): in the original notebook X is still the statsmodels design
# matrix here (the previous cell discarded its reshape result); the fit
# still runs because scikit-learn adds its own intercept — confirm which
# X was intended.
linear_regression.fit(X, y)
Out[48]:
In [49]:
# Slope coefficient(s) and intercept estimated by scikit-learn; these
# should agree with the statsmodels parameters above.
print(linear_regression.coef_)
print(linear_regression.intercept_)
In [50]:
# First ten in-sample predictions from the scikit-learn model.
print(linear_regression.predict(X)[:10])
Manually calculate results using matrix-vector multiplication:
In [51]:
# Append the bias column LAST so the column order matches
# v_coef = [slope coefficient(s)..., intercept].
Xp = np.column_stack((X, np.ones(observations)))
v_coef = list(linear_regression.coef_) + [linear_regression.intercept_]
In [52]:
# Manual predictions; should match linear_regression.predict(X)[:10] above.
np.dot(Xp, v_coef)[:10]
Out[52]:
Scikit calculation is much faster:
In [63]:
from sklearn.datasets import make_regression
HX, Hy = make_regression(n_samples=10000000, n_features=1, n_targets=1, random_state=101)
%time sk_linear_regression = linear_model.LinearRegression(normalize=False, fit_intercept=True); \
sk_linear_regression.fit(HX, Hy)
In [62]:
# statsmodels timing on the same data.
# NOTE(review): with the line continuation, %time appears to cover both the
# OLS construction and the .fit() call on one logical line — confirm this
# is the intended comparison against the scikit-learn cell above.
%time sm_linear_regression = sm.OLS(Hy, sm.add_constant(HX)); \
sm_linear_regression.fit()
In [ ]: