$\textbf{x}$ is a vector of $n$ input features/variables and $y$ is the output/target variable. Here $\textbf{x} \in \mathbb{R}^n$ and $y \in \mathbb{R}$. There are $m$ training examples of type $(\textbf{x}^{(i)},y^{(i)})$.
Hypothesis function $h_{\theta}(\textbf{x}) = \theta^T \mathbf{\hat{x}}$, where $\theta \in \mathbb{R}^{n+1}$ are the parameters and $\mathbf{\hat{x}} \in \mathbb{R}^{n+1}$ is obtained by preappending $1$ to $\textbf{x}$.
Cost function $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(\textbf{x}^{(i)}) - y^{(i)}\right)^2$.
Minimizing this cost function gives us the parameters $\theta$ that best fit the given data.
$\theta := \theta - \frac{\alpha}{m}X^T\left(X\theta - \textbf{y}\right)$, where $X$ has $\mathbf{\hat{x}}$ as rows so that $X\theta$ gives the value of $h_\theta$ as a column vector of dimension $m$. $\alpha \in \mathbb{R}$ is called the learning rate.
This update is continued for a fixed number of iterations rather than until the cost function is reduced to a certain value. The reason for this can be seen in the multivariate linear regression example below where the cost is pretty high but still the fitting cannot be improved since the result is the same as that obtained by the normal equation.
$\theta = (X^T X)^{-1}X^T\textbf{y}$
$X \in \mathbb{R}^{m\times (n+1)}$, is a matrix of training inputs preappended with $1$s.
$\textbf{y} \in \mathbb{R}^m$, is the vector of training outputs.
$\textbf{x} \in \mathbb{R}^n$, is a vector of a single input feature.
In this part of this exercise, we will implement linear regression with one variable to predict profits for a food truck. Suppose you are the CEO of a restaurant franchise and are considering different cities for opening a new outlet. The chain already has trucks in various cities and you have data for profits and populations from the cities.
In [49]:
# Import necessary modules
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=5)
In [50]:
##################################
# Define the cost function
# Mean squared error
##################################
# Input
# X - Input feature with 1s preappended
# y - Output or target variables
# theta - The parameters fitted for linear regression
# Output
# J - The value of the cost function
def computeCost(X, y, theta):
m = len(X)
h = X.dot(theta) - y
J = h.dot(h)/2./m
return J
##################################
# Gradient descent algorithm
# for Linear regression
##################################
# Input
# X - Input features with 1s preappended
# y - Output or target variables
# theta - The parameters to be fit
# alpha - Learning rate
# num_iter - Number of iterations to run
# Output
# theta - The fitted parameters
# J_hist - Value of cost function for each iteration
def gradientDescent(X, y, theta, alpha, num_iter):
m = len(X)
J_hist = np.zeros(num_iter)
for i in range(num_iter):
h = X.dot(theta) - y
theta = theta - (alpha/m)*X.T.dot(h)
J_hist[i] = computeCost(X, y, theta)
return theta, J_hist
##################################
# Feature normalization using
# mean and standard deviation
##################################
# Input
# X - Input features without 1s preappended
# Output
# X - Normalized features
# mu - Vector of means of all features
# sigma - Vector of standard deviations of all features
def normalizeFeatures(X):
m = X.shape[0]
mu = np.mean(X,axis=0)
sigma = np.std(X,axis=0)
for i,(m,s) in enumerate(zip(mu,sigma)):
X[:,i] = (X[:,i]-m)/s
return X, mu, sigma
##################################
# Normal equation algorithm
# for Linear regression
##################################
# Input
# X - Input features with 1s preappended
# y - Output or target variables
# Output
# theta - Fitted parameters
def normalEqn(X, y):
theta = np.linalg.pinv(X.T.dot(X)).dot(X.T.dot(y))
return theta
In [51]:
# Load the data
data = np.loadtxt('ex1data1.txt', delimiter=',')
X = data[:,0] # Population in 10,000s
y = data[:,1] # Profit in $10,000s
m = len(data)
del data
# Plot the data
plt.figure(dpi=120)
plt.plot(X,y,'r+')
plt.title('Given data')
plt.xlabel('Population in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.show()
# Preappend 1s to X
tmp = np.ones([m,2])
tmp[:,1] = X
X = tmp
del tmp
In [52]:
# Test cost function
test_theta = np.zeros(2)
J = computeCost(X,y,test_theta)
print('With theta = ',test_theta,', Cost computed = %.5f'%(J))
print('Expected cost value (approx) 32.07\n');
# Test some more
test_theta = [-1., 2.]
J = computeCost(X,y,test_theta)
print('With theta = ',test_theta,', Cost computed = %.5f'%(J))
print('Expected cost value (approx) 54.24');
In [53]:
# Training parameters
alpha = 0.01
num_iter = 1500
init_theta = [0., 0.]
# Perform gradient descent algorithm
theta, J_hist = gradientDescent(X,y,init_theta,alpha,num_iter)
# Plot the cost function over iterations
plt.figure(dpi=120)
plt.plot(J_hist)
plt.title('Cost vs iterations')
plt.xlabel('# iterations')
plt.ylabel('Cost')
plt.show()
# Test the values of theta
print('Theta found by gradient descent:', theta)
print('Expected theta values (approx):',[-3.6303, 1.1664])
In [54]:
# Plot the fitted line
plt.figure(dpi=120)
plt.plot(X[:,1],y,'r+',label='Given data')
plt.plot(X[:,1],X.dot(theta),label='Fitted line')
plt.legend()
plt.title('Given Data with fitted line')
plt.xlabel('Population in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.show()
# Predict something
predict1 = np.array([1., 3.5]).dot(theta)
print('For population = 35,000, we predict a profit of %.5f'%(predict1*10000))
predict2 = np.array([1., 7.]).dot(theta)
print('For population = 70,000, we predict a profit of %.5f'%(predict2*10000))
In [55]:
# Visualize the contours
t0 = np.arange(-10., 10.2, 0.2)
t1 = np.arange(-1., 4.05, 0.05)
p,q = np.meshgrid(t0,t1)
J_vals = np.zeros([len(t0), len(t1)])
for i,th0 in enumerate(t0):
for j,th1 in enumerate(t1):
J_vals[i,j] = computeCost(X,y,[th0, th1])
plt.figure(dpi=120)
plt.contour(t0,t1,J_vals.T,np.logspace(-2,3,30),label='Cost function')
plt.plot(init_theta[0],init_theta[1],'kx',ms=10, mew=3,label='Initial params')
plt.plot(theta[0],theta[1],'rx',ms=10, mew=3,label='Learned params')
plt.xlabel(r'$\theta_0$')
plt.ylabel(r'$\theta_1$')
plt.title('Contour plot of cost function')
plt.legend()
plt.show()
In [56]:
data = np.loadtxt('ex1data2.txt', delimiter=',')
X = data[:,0:2]
y = data[:,2]
m,n = X.shape
del data
# Normalize the features
X, mu, sigma = normalizeFeatures(X)
# Preappend 1s to X
tmp = np.ones([m,n+1])
tmp[:,1:n+1] = X
X = tmp
del tmp
In [57]:
# Training parameters
alpha = 0.1
num_iter = 400
init_theta = np.zeros(n+1)
# Perform gradient descent algorithm
theta, J_hist = gradientDescent(X,y,init_theta,alpha,num_iter)
# Plot the cost function over iterations
plt.figure(dpi=120)
plt.plot(J_hist)
plt.title('Cost vs iterations')
plt.xlabel('# iterations')
plt.ylabel('Cost')
plt.show()
# Log
print("Theta computed using gradient descent: ",theta)
x = np.array([1., (1650.-mu[0])/sigma[0], (3.-mu[1])/sigma[1]])
print("Predicted price of a 1650 sq-ft, 3 br house (using gradient descent): %.5f"%(x.dot(theta)))
In [58]:
# Solve using the normal equation
theta = normalEqn(X, y)
# Log
print("Theta computed using normal equation: ",theta)
print("Predicted price of a 1650 sq-ft, 3 br house (using normal equation): %.5f"%(x.dot(theta)))