In [ ]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.model_selection as model_selection
import sklearn.cluster as cluster
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from patsy import dmatrices
import seaborn as sns
%matplotlib inline
We know: $X$ and $Y$
We do not know: $\alpha$, $\beta$, and $\epsilon$
Goal: Given $X$ and $Y$, produce estimates of $\alpha$ and $\beta$, denoted by $\widehat{\alpha}$ and $\widehat{\beta}$
Input data comes in the form of pairs $\left(X_i,Y_i\right)$ for $i=1,\ldots ,n$
The true regression line: for every individual it should hold that: $$Y_i = \alpha +\beta X_i +\epsilon_i$$
Error for the $i$-th data point is: $$ \epsilon_i = Y_i-\alpha-\beta X_i $$
The estimated regression line: $$\widehat{Y_i}=\widehat{\alpha}+\widehat{\beta}X_i$$
Residuals measure the distance of each observation from the estimated regression line and are defined as follows: $$\widehat{\epsilon_i} = Y_i-\widehat{Y_i}$$
Question: How do we find $\widehat{\alpha}$ and $\widehat{\beta}$?
Answer: By minimizing the sum of squared residuals (SSR):
\begin{eqnarray} \text{SSR} & = & \sum_{i=1}^n \widehat{\epsilon_i}^2 \\ & = & \sum_{i=1}^n \left(Y_i-\widehat{Y_i}\right)^2 \end{eqnarray}
$\alpha,\beta$ are called regression coefficients
$\widehat{\alpha},\widehat{\beta}$ are called OLS (Ordinary Least Squares) regression coefficients
$\alpha$ (or $\widehat{\alpha}$) is the intercept and is usually of less interest
$\beta$ (or $\widehat{\beta}$) is more interesting, as it gives the expected change in $Y$ associated with a one-unit change in $X$
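Before turning to library routines, it may help to see that, for simple regression, minimizing SSR has a well-known closed-form solution. The sketch below is illustrative only: the toy arrays x and y and all variable names are made up for this example and are not used elsewhere in the notebook.
In [ ]:
import numpy as np

# Toy data (illustrative only): y roughly follows 2 + 3*x plus noise
rng = np.random.RandomState(0)
x = rng.uniform(0, 10, size=50)
y = 2 + 3 * x + rng.normal(scale=2, size=50)

# Closed-form OLS estimates minimizing the sum of squared residuals:
#   beta_hat  = sum((x_i - x_bar)(y_i - y_bar)) / sum((x_i - x_bar)^2)
#   alpha_hat = y_bar - beta_hat * x_bar
x_bar, y_bar = x.mean(), y.mean()
beta_hat = np.sum((x - x_bar) * (y - y_bar)) / np.sum((x - x_bar) ** 2)
alpha_hat = y_bar - beta_hat * x_bar
print("alpha_hat:", alpha_hat, "beta_hat:", beta_hat)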
The most common measure of fit is referred to as $R^2$
$R^2$ measures the fraction of the variability of $Y$ that can be explained by $X$
Derivation: $$\text{Var}(Y) =\frac{\sum_{i=1}^n \left(Y_i-\overline{Y}\right)^2}{n-1} $$ where: $\overline{Y}=\frac{1}{n}\sum_{i=1}^nY_i$
Total Sum of Squares (TSS) is:
$$\text{TSS} = \sum_{i=1}^n\left(Y_i-\overline{Y}\right)^2$$
Now we can show that:
$$\text{TSS} = \text{SSR} + \text{RSS},$$
where the Sum of Squared Residuals (SSR) is: $$\text{SSR} = \sum_{i=1}^n \left(Y_i-\widehat{Y_i}\right)^2,$$
and the Regression Sum of Squares (RSS) is:
$$\text{RSS} = \sum_{i=1}^n \left(\widehat{Y_i}-\overline{Y}\right)^2.$$
Variability in $Y$ thus splits into an explained part (RSS) and an unexplained part (SSR).
Measure of fit $R^2$:
\begin{eqnarray} R^2 & = & \frac{\text{RSS}}{\text{TSS}} = 1-\frac{\text{SSR}}{\text{TSS}} \end{eqnarray}
$0\leq R^2\leq 1$; the closer $R^2$ is to $1$, the better the fit of the regression: a small SSR means the residuals are small and therefore the fit is better.
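To make the decomposition concrete, TSS, RSS, SSR, and $R^2$ can be computed directly from fitted values. The sketch below reuses the made-up toy data and closed-form estimates from the previous sketch; all variable names are illustrative and not part of the notebook's later workflow.
In [ ]:
import numpy as np

# Toy data (illustrative only): y roughly follows 2 + 3*x plus noise
rng = np.random.RandomState(0)
x = rng.uniform(0, 10, size=50)
y = 2 + 3 * x + rng.normal(scale=2, size=50)

# Closed-form OLS fit and fitted values
beta_hat = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
alpha_hat = y.mean() - beta_hat * x.mean()
y_hat = alpha_hat + beta_hat * x

TSS = np.sum((y - y.mean()) ** 2)      # total sum of squares
SSR = np.sum((y - y_hat) ** 2)         # sum of squared residuals (unexplained)
RSS = np.sum((y_hat - y.mean()) ** 2)  # regression sum of squares (explained)

print("TSS == SSR + RSS:", np.allclose(TSS, SSR + RSS))
print("R^2:", RSS / TSS, "=", 1 - SSR / TSS)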
95% confidence interval $[x,y]$ for $\widehat{\alpha}$: under repeated sampling, 95% of intervals constructed this way contain the true $\alpha$; informally, we are 95% confident that $\alpha$ lies in $[x,y]$
Generate a dataset using the datasets.make_regression() function
In [ ]:
X, y = datasets.make_regression(n_samples=100, n_features=1, bias=0.1, noise=30, random_state=1)
print(X.shape, y.shape)
In [ ]:
plt.scatter(X, y, c="slategray")
In [ ]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
print("Confidence Intervals:", results.conf_int())
print("Parameters:", results.params)
A more detailed explanation of the results can be found here: http://connor-johnson.com/2014/02/18/linear-regression-with-python/
In [ ]:
plt.scatter(X,y, c="slategray")
plt.plot(X,results.predict(X), c='seagreen', alpha=0.8)
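One caveat: sm.OLS does not add an intercept column by itself, so the fit above estimates only $\widehat{\beta}$. The sketch below is a variant, not the notebook's original call, that uses sm.add_constant so $\widehat{\alpha}$ is estimated as well; the names X_const, model_c, and results_c are illustrative.
In [ ]:
# Variant that also estimates the intercept alpha:
# sm.add_constant prepends a column of ones to X
X_const = sm.add_constant(X)
model_c = sm.OLS(y, X_const)
results_c = model_c.fit()
print(results_c.params)  # first entry is alpha_hat, the second is beta_hat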
Input: $(Y,X_1,X_2,\ldots , X_k)$
$Y_i = \alpha +\beta_1X_{i1}+\beta_2X_{i2}+\ldots + \beta_k X_{ik} + \epsilon_i$
Output: Estimates $\widehat{\alpha},\widehat{\beta_i}$ for $i=1,\ldots ,k$ that minimize
$\text{SSR} = \sum_{i=1}^n\left(Y_i - \widehat{\alpha} -\widehat{\beta_1}X_{i1}-\widehat{\beta_2}X_{i2}-\ldots - \widehat{\beta_k} X_{ik}\right)^2$
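The same minimization can be written in matrix form and solved with a least-squares routine. The sketch below uses made-up data and dimensions; the names X_demo, y_demo, X_aug, true_beta, and coef are illustrative and not part of the notebook's workflow.
In [ ]:
import numpy as np

# Illustrative design matrix: n = 100 observations, k = 3 features
rng = np.random.RandomState(1)
n, k = 100, 3
X_demo = rng.normal(size=(n, k))
true_beta = np.array([1.5, -2.0, 0.5])
y_demo = 0.1 + X_demo @ true_beta + rng.normal(scale=0.3, size=n)

# Prepend a column of ones so the intercept alpha is estimated too,
# then solve the least-squares problem min_b ||y - X_aug b||^2
X_aug = np.column_stack([np.ones(n), X_demo])
coef, *_ = np.linalg.lstsq(X_aug, y_demo, rcond=None)
print("alpha_hat:", coef[0])
print("beta_hat:", coef[1:])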
Use again the datasets.make_regression() function
In [ ]:
X, y = datasets.make_regression(n_samples=100, n_features=20, n_informative=5, bias=0.1, noise=30, random_state=1)
print(X.shape, y.shape)
In [ ]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
print("Confidence Intervals:", results.conf_int())
print("Parameters:", results.params)
In [ ]:
ca = datasets.fetch_california_housing()
In [ ]:
print(type(ca))
print(type(ca.data))
X = ca.data
print(X.shape)
print(ca.feature_names)
print(ca.keys())
print(ca.target)
y = ca.target
print(y.shape)
plt.scatter(range(len(y)), y, c="slategray", alpha=0.3, linewidths=0.2)
print(ca.values())
In [ ]:
X, y = utils.shuffle(X, y, random_state=1)
Split the data into training and testing
In [ ]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [ ]:
subX_train = X_train[:,0]
subX_test = X_test[:,0]
plt.scatter(subX_train, y_train, c="slategray", alpha=0.4, linewidths=0.3)
# plt.scatter(subX_test, y_test, c="seagreen", alpha=0.2, linewidths=0.3)
plt.xlabel('Data')
plt.ylabel('Target');
In [ ]:
fig, axes = plt.subplots(2, 4, figsize=(15, 10))
for i in range(8):
    plt_i = i // 4
    plt_j = i % 4
    subX_train = X_train[:, i]
    # plt.subplot(2, 4, 1 + i)
    axes[plt_i][plt_j].scatter(subX_train, y_train, c="slategray", alpha=0.4, linewidths=0.3)
    # plt.scatter(subX_test, y_test)
    axes[plt_i][plt_j].set_xlabel('Data')
    axes[plt_i][plt_j].set_ylabel('Target');
In [ ]:
regr = linear_model.LinearRegression()
Fit the model
In [ ]:
print(X_train.shape)
regr.fit(X_train, y_train);
The coefficients and the bias are now computed
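They are exposed as attributes of the fitted estimator (coef_ and intercept_); a quick look, assuming regr fitted in the cell above:
In [ ]:
# Fitted intercept (bias) and per-feature coefficients
print("Intercept:", regr.intercept_)
print("Coefficients:", regr.coef_)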
In [ ]:
# The mean square error
print("Training error: ", metrics.mean_squared_error(regr.predict(X_train),y_train))
print("Test error: ", metrics.mean_squared_error(regr.predict(X_test),y_test))
The score method returns the coefficient of determination $R^2$ of the prediction.
The coefficient $R^2$ is defined as $(1 - u/v)$, where $u$ is the residual sum of squares ((y_true - y_pred) ** 2).sum() and $v$ is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0; lower values are worse.
In [ ]:
train_score = regr.score(X_train,y_train)
test_score = regr.score(X_test,y_test)
print("Training score: ", train_score)
print("Test score: ", test_score)
In [ ]:
coefficients = regr.coef_
for i in range(len(coefficients)):
    print(ca.feature_names[i], "\t", coefficients[i])
In [ ]:
print(pd.DataFrame(list(zip(ca.feature_names, np.transpose(coefficients)))))
In [ ]:
regr = linear_model.LinearRegression()
scores = model_selection.cross_val_score(regr, X, y, cv=5)
print(scores)
In [ ]:
print("Regression score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [ ]:
subX_train = X_train[:,0]
subX_test = X_test[:,0]
plt.scatter(subX_train, y_train, c="slategray", alpha=0.4, linewidths=0.3)
plt.plot(subX_train, coefficients[0]*subX_train, color='seagreen', linewidth=3, alpha=.8);
plt.xlabel('Data')
plt.ylabel('Target');
In [ ]:
boston = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2; this cell needs an older version
In [ ]:
print(boston.data.shape)
print(type(boston.data))
X = boston.data
print(X.shape)
print(boston.feature_names)
print(boston.target)
y = boston.target
print(y.shape)
plt.scatter(range(len(y)), y, c="slategray", alpha=0.4, linewidths=0.3)
In [ ]:
X, y = utils.shuffle(X, y, random_state=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [ ]:
regr = linear_model.LinearRegression()
print(X_train.shape)
regr.fit(X_train, y_train);
# The mean square error
print("Training error: ", np.mean((regr.predict(X_train) - y_train) ** 2))
print("Test error: ", np.mean((regr.predict(X_test) - y_test) ** 2))
In [ ]:
train_score = regr.score(X_train,y_train)
test_score = regr.score(X_test,y_test)
print("Training score: ", train_score)
print("Test score: ", test_score)
In [ ]:
coefficients = regr.coef_
for i in range(len(coefficients)):
    print(boston.feature_names[i], "\t", coefficients[i])
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("../theme/custom.css", "r").read()
    return HTML(styles)
css_styling()
Out[1]: