version 0.2, May 2016
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Kevin Markham.
In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import itertools
In [2]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 99999;
In [3]:
# Test Dataset
# Load dataset
import zipfile
with zipfile.ZipFile('../datasets/houses_portland.csv.zip', 'r') as z:
    f = z.open('houses_portland.csv')
    data = pd.io.parsers.read_table(f, sep=',')
data.head()
Out[3]:
In [4]:
data.columns
Out[4]:
In [5]:
y = data[' price'].values
X = data['area'].values
plt.scatter(X, y)
plt.xlabel('Area')
plt.ylabel('Price')
Out[5]:
In [6]:
y_mean, y_std = y.mean(), y.std()
X_mean, X_std = X.mean(), X.std()
y = (y - y_mean)/ y_std
X = (X - X_mean)/ X_std
plt.scatter(X, y)
plt.xlabel('Area')
plt.ylabel('Price')
Out[6]:
The model for simple linear regression is $y = \beta_0 + \beta_1 x$. The $\beta$ values are called the model coefficients: $\beta_0$ is the intercept and $\beta_1$ is the slope of the line.
In the scatter plot above, the goal is to find the coefficients of the line that best fits the (area, price) points.
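With a column of ones added to $X$ in the next cell, the hypothesis can be written compactly as a dot product, which is what the lr_h function below computes:
$$h_\beta(x) = \beta^T x = \beta_0 x_0 + \beta_1 x_1, \qquad x_0 = 1$$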
In [7]:
# create X and y
n_samples = X.shape[0]
X_ = np.c_[np.ones(n_samples), X]
Let's suppose the following betas:
In [8]:
beta_ini = np.array([-1, 1])
In [9]:
# hypothesis: h(x) = beta . x
def lr_h(beta, x):
    return np.dot(beta, x.T)
In [10]:
# scatter plot
plt.scatter(X, y)
# Plot the linear regression
x = np.c_[np.ones(2), [X.min(), X.max()]]
plt.plot(x[:, 1], lr_h(beta_ini, x), 'r', lw=5)
plt.xlabel('Area')
plt.ylabel('Price')
Out[10]:
Let's calculate the error of this regression.
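The error is measured with the cost function implemented in the next cell, half the mean of the squared errors:
$$J(\beta) = \frac{1}{2n}\sum_{i=1}^{n}\left(h_\beta(x_i) - y_i\right)^2$$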
In [11]:
# Cost function
def lr_cost_func(beta, x, y):
    # Can be vectorized
    res = 0
    for i in range(x.shape[0]):
        res += (lr_h(beta, x[i, :]) - y[i]) ** 2
    res *= 1 / (2 * x.shape[0])
    return res
lr_cost_func(beta_ini, X_, y)
Out[11]:
In [12]:
beta0 = np.arange(-15, 20, 1)
beta1 = 2
In [13]:
cost_func = []
for beta_0 in beta0:
    cost_func.append(lr_cost_func(np.array([beta_0, beta1]), X_, y))
plt.plot(beta0, cost_func)
plt.xlabel('beta_0')
plt.ylabel('J(beta)')
Out[13]:
In [14]:
beta0 = 0
beta1 = np.arange(-15, 20, 1)
In [15]:
cost_func = []
for beta_1 in beta1:
    cost_func.append(lr_cost_func(np.array([beta0, beta_1]), X_, y))
plt.plot(beta1, cost_func)
plt.xlabel('beta_1')
plt.ylabel('J(beta)')
Out[15]:
Analyzing both at the same time
In [16]:
beta0 = np.arange(-5, 7, 0.2)
beta1 = np.arange(-5, 7, 0.2)
In [17]:
cost_func = pd.DataFrame(index=beta0, columns=beta1)
for beta_0 in beta0:
    for beta_1 in beta1:
        cost_func.loc[beta_0, beta_1] = lr_cost_func(np.array([beta_0, beta_1]), X_, y)
In [18]:
betas = np.transpose([np.tile(beta0, beta1.shape[0]), np.repeat(beta1, beta0.shape[0])])
fig = plt.figure(figsize=(10, 10))
ax = fig.gca(projection='3d')
ax.plot_trisurf(betas[:, 0], betas[:, 1], cost_func.T.values.flatten(), cmap=cm.jet, linewidth=0.1)
ax.set_xlabel('beta_0')
ax.set_ylabel('beta_1')
ax.set_zlabel('J(beta)')
plt.show()
It can also be seen as a contour plot
In [19]:
contour_levels = [0, 0.1, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 7, 10, 12, 15, 20]
plt.contour(beta0, beta1, cost_func.T.values, contour_levels)
plt.xlabel('beta_0')
plt.ylabel('beta_1')
Out[19]:
Let's see how different values of the betas look on the contour plot.
In [20]:
betas = np.array([[0, 0],
                  [-1, -1],
                  [-5, 5],
                  [3, -2]])
In [21]:
for beta in betas:
    print('\n\nLinear Regression with betas ', beta)
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    ax2.contour(beta0, beta1, cost_func.T.values, contour_levels)
    ax2.set_xlabel('beta_0')
    ax2.set_ylabel('beta_1')
    ax2.scatter(beta[0], beta[1], s=50)
    # scatter plot
    ax1.scatter(X, y)
    # Plot the linear regression
    x = np.c_[np.ones(2), [X.min(), X.max()]]
    ax1.plot(x[:, 1], lr_h(beta, x), 'r', lw=5)
    ax1.set_xlabel('Area')
    ax1.set_ylabel('Price')
    plt.show()
For the particular case of linear regression with one variable and an intercept, the gradient is calculated as:
$$\frac{\partial J(\beta)}{\partial \beta_0} = \frac{1}{n}\sum_{i=1}^{n}\left(h_\beta(x_i) - y_i\right)$$
$$\frac{\partial J(\beta)}{\partial \beta_1} = \frac{1}{n}\sum_{i=1}^{n}\left(h_\beta(x_i) - y_i\right)x_i$$
In [22]:
# gradient calculation
beta_ini = np.array([-1.5, 0.])
def gradient(beta, x, y):
    # Not vectorized
    gradient_0 = 1 / x.shape[0] * ((lr_h(beta, x) - y).sum())
    gradient_1 = 1 / x.shape[0] * ((lr_h(beta, x) - y) * x[:, 1]).sum()
    return np.array([gradient_0, gradient_1])
gradient(beta_ini, X_, y)
Out[22]:
In [23]:
def gradient_descent(x, y, beta_ini, alpha, iters):
    # store beta and the cost J(beta) at every iteration
    betas = np.zeros((iters, beta_ini.shape[0] + 1))
    beta = beta_ini.copy()  # work on a copy so the caller's array is not modified in place
    for iter_ in range(iters):
        betas[iter_, :-1] = beta
        betas[iter_, -1] = lr_cost_func(beta, x, y)
        beta -= alpha * gradient(beta, x, y)
    return betas
In [24]:
iters = 100
alpha = 0.05
beta_ini = np.array([-4., -4.])
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
Let's look at the evolution of the cost per iteration.
In [25]:
plt.plot(range(iters), betas[:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)')
Out[25]:
Understanding what gradient descent does at each iteration
In [26]:
betas_ = betas[range(0, iters, 10), :-1]
for i, beta in enumerate(betas_):
    print('\n\nLinear Regression with betas ', beta)
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    ax2.contour(beta0, beta1, cost_func.T.values, contour_levels)
    ax2.set_xlabel('beta_0')
    ax2.set_ylabel('beta_1')
    ax2.scatter(beta[0], beta[1], c='r', s=50)
    if i > 0:
        for beta_ in betas_[:i]:
            ax2.scatter(beta_[0], beta_[1], s=50)
    # scatter plot
    ax1.scatter(X, y)
    # Plot the linear regression
    x = np.c_[np.ones(2), [X.min(), X.max()]]
    ax1.plot(x[:, 1], lr_h(beta, x), 'r', lw=5)
    ax1.set_xlabel('Area')
    ax1.set_ylabel('Price')
    plt.show()
Estimated Betas
In [27]:
betas[-1, :-1]
Out[27]:
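The same coefficients can also be obtained in closed form. The next cell implements the normal equation:
$$\hat{\beta} = (X^TX)^{-1}X^Ty$$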
In [28]:
# closed-form solution: beta = (X^T X)^-1 X^T y
beta = np.dot(np.linalg.inv(np.dot(X_.T, X_)), np.dot(X_.T, y))
In [29]:
beta
Out[29]:
In [30]:
# import
from sklearn.linear_model import LinearRegression
In [31]:
# Initialize
linreg = LinearRegression(fit_intercept=False)
In [32]:
# Fit
linreg.fit(X_, y)
Out[32]:
In [33]:
linreg.coef_
Out[33]:
In [34]:
# import
from sklearn.linear_model import SGDRegressor
In [35]:
# Initialize
linreg2 = SGDRegressor(fit_intercept=False, n_iter=100)
In [36]:
# Fit
linreg2.fit(X_, y)
Out[36]:
In [37]:
linreg2.coef_
Out[37]:
Gradient Descent | Normal Equation |
---|---|
Need to choose $\alpha$ | No need to choose $\alpha$ |
Needs many iterations | No need to iterate |
Works well even when $k$ is large | Slow if $k$ is very large |
No need to compute $(X^TX)^{-1}$ | Need to compute $(X^TX)^{-1}$ |
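As a side note, explicitly inverting $X^TX$ (as done above with np.linalg.inv) is usually avoided in practice. A minimal sketch of two more numerically stable alternatives, assuming X_ and y are the design matrix and scaled target defined earlier:

# Solve the normal equations (X^T X) beta = X^T y without forming the inverse
beta_solve = np.linalg.solve(np.dot(X_.T, X_), np.dot(X_.T, y))
# Or let numpy solve the least-squares problem directly
beta_lstsq, _, _, _ = np.linalg.lstsq(X_, y, rcond=None)
print(beta_solve, beta_lstsq)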
Let's create a new feature $area^2$
In [38]:
data['area2'] = data['area'] ** 2
data.head()
Out[38]:
In [39]:
i = 2
data.loc[2, ['area', 'area2']]
Out[39]:
In [40]:
i = 2
j = 2
data.loc[2, 'area2']
Out[40]:
In [41]:
X = data[['area', 'area2']].values
X[0:5]
Out[41]:
In [42]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler(with_mean=True, with_std=True)
ss.fit(X.astype(float))
X = ss.transform(X.astype(float))
ss.mean_, ss.scale_
Out[42]:
In [43]:
X[0:5]
Out[43]:
In [44]:
X_ = np.c_[np.ones(n_samples), X]
X_[0:5]
Out[44]:
The goal becomes to estimate the parameters $\beta$ that minimize the sum of squared residuals.
Note that $x^0$ refers to the column of ones.
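In matrix form the cost function and its gradient are:
$$J(\beta) = \frac{1}{2n}\left(X\beta - y\right)^T\left(X\beta - y\right), \qquad \nabla J(\beta) = \frac{1}{n}X^T\left(X\beta - y\right)$$
The vectorized gradient in the next cell computes this expression.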
In [45]:
beta_ini = np.array([0., 0., 0.])
# gradient calculation (vectorized)
def gradient(beta, x, y):
    return 1 / x.shape[0] * np.dot((lr_h(beta, x) - y).T, x)
gradient(beta_ini, X_, y)
Out[45]:
In [46]:
beta_ini = np.array([0., 0., 0.])
alpha = 0.005
iters = 100
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
# Print iteration vs J
plt.plot(range(iters), betas[:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)')
Out[46]:
Apparently the cost function is not converging.
Let's increase alpha and the number of iterations.
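Before that, a quick sketch of how the learning rate affects convergence, reusing the gradient_descent function and the X_ and y defined above (the specific alpha values are only illustrative):

# Compare the cost per iteration for a few learning rates
for alpha_ in [0.005, 0.05, 0.5]:
    betas_alpha = gradient_descent(X_, y, np.array([0., 0., 0.]), alpha_, 200)
    plt.plot(range(200), betas_alpha[:, -1], label='alpha = %s' % alpha_)
plt.xlabel('iteration')
plt.ylabel('J(beta)')
plt.legend()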
In [47]:
beta_ini = np.array([0., 0., 0.])
alpha = 0.5
iters = 1000
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
# Print iteration vs J
plt.plot(range(iters), betas[:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)')
Out[47]:
In [48]:
print('betas using gradient descent\n', betas[-1, :-1])
In [49]:
betas_ols = np.dot(np.linalg.inv(np.dot(X_.T, X_)),np.dot(X_.T, y))
betas_ols
Out[49]:
Difference
In [50]:
betas_ols - betas[-1, :-1]
Out[50]:
In [51]:
x = np.array([3000., 3000.**2])
# scale
x_scaled = ss.transform(x.reshape(1, -1))
x_ = np.c_[1, x_scaled]
x_
Out[51]:
In [52]:
y_pred = lr_h(betas_ols, x_)
y_pred
Out[52]:
In [53]:
y_pred = y_pred * y_std + y_mean
y_pred
Out[53]:
In [54]:
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
clf1 = LinearRegression()
clf2 = SGDRegressor(n_iter=10000)
When using scikit-learn there is no need to create the intercept column manually.
scikit-learn also works directly with pandas DataFrames.
In [55]:
clf1.fit(data[['area', 'area2']], data[' price'])
Out[55]:
In [56]:
clf2.fit(X, y)
Out[56]:
Making predictions
In [57]:
clf1.predict(x.reshape(1, -1))
Out[57]:
In [58]:
clf2.predict(x_scaled.reshape(1, -1)) * y_std + y_mean
Out[58]:
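As a side note, the manual steps above (creating the $area^2$ feature, scaling, fitting) can also be chained together. This is only a sketch of that idea using scikit-learn's pipeline utilities; the name model is arbitrary and the prediction is again for an area of 3000:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

# PolynomialFeatures(degree=2) recreates the area and area^2 columns;
# scaling is not required by LinearRegression, but keeps the steps comparable
model = make_pipeline(PolynomialFeatures(degree=2, include_bias=False),
                      StandardScaler(),
                      LinearRegression())
model.fit(data[['area']], data[' price'])
model.predict([[3000.]])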
Evaluation metrics for classification problems, such as accuracy, are not useful for regression problems. We need evaluation metrics designed for comparing continuous values.
Here are three common evaluation metrics for regression problems:
Mean Absolute Error (MAE) is the mean of the absolute value of the errors:
$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$
Mean Squared Error (MSE) is the mean of the squared errors:
$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
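To make the formulas concrete, a small sketch computing the three metrics by hand on made-up numbers (actual and predicted are illustrative values only):

actual = np.array([100., 150., 200.])
predicted = np.array([110., 140., 230.])

mae = np.mean(np.abs(actual - predicted))   # (10 + 10 + 30) / 3
mse = np.mean((actual - predicted) ** 2)    # (100 + 100 + 900) / 3
rmse = np.sqrt(mse)
print(mae, mse, rmse)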
In [59]:
y_pred = clf1.predict(data[['area', 'area2']])
In [60]:
from sklearn import metrics
import numpy as np
print('MAE:', metrics.mean_absolute_error(data[' price'], y_pred))
print('MSE:', metrics.mean_squared_error(data[' price'], y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(data[' price'], y_pred)))
Comparing these metrics:
- MAE is the easiest to understand, because it is just the average error.
- MSE "punishes" larger errors more than MAE, which tends to be useful in practice.
- RMSE is even more interpretable than MSE, because it is expressed in the same units as the response variable.
All of these are loss functions, because we want to minimize them.
Advantages of linear regression:
- Simple to explain and to interpret: each coefficient measures the effect of one feature on the response.
- Fast to train, with no hyperparameters to tune when solved with the normal equation.
- Well understood and widely available.
Disadvantages of linear regression:
- Assumes a linear relationship between the features and the response.
- Sensitive to outliers.
- Can underperform more flexible models when the true relationship is highly non-linear.