In [1]:
# Setup code for this notebook
import random
import numpy as np
import matplotlib.pyplot as plt
# make matplotlib figures appear inline
%matplotlib inline
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [2]:
# Load the data
# The data comes from the linear regression exercise of the machine learning course at
# https://www.coursera.org/course/ml
# The file ex1data2.csv contains a training set of housing prices in Portland, Oregon.
# The first column is the size of the house (in square feet), the second column is the
# number of bedrooms, and the third column is the price of the house, which we want to predict.
file_name = 'datasets/ex1data2.csv'
house_data = np.loadtxt(file_name, delimiter=',')
num_sample = house_data.shape[0]  # number of training examples
X = house_data[:, :2]
y = house_data[:, 2]
print 'X shape: ', X.shape
print 'y shape: ', y.shape
print 'First 10 examples from the dataset'
print house_data[0:10, :]
In [3]:
# Feature Normalization
# Looking at the data, the features differ by orders of magnitude,
# so we perform feature scaling to make gradient descent converge much more quickly.
# Normalization: (x - mean) / std
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
# Add bias dimension
X = np.hstack((X, np.ones((num_sample, 1))))
X = X.T
print 'First 10 examples after normalization (bias column last)'
print X[:, :10].T
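One practical note: the cell above overwrites X in place, so the normalization statistics are lost. To score a new house later, you must scale it with the same mean and standard deviation used in training; a minimal sketch (mu, sigma, and x_new are illustrative names, not part of the notebook's modules):

mu = np.mean(house_data[:, :2], axis=0)    # per-feature mean of the raw features
sigma = np.std(house_data[:, :2], axis=0)  # per-feature standard deviation
x_new = np.array([1650.0, 3.0])            # a hypothetical house: 1650 sq ft, 3 bedrooms
x_new = (x_new - mu) / sigma               # apply the same scaling used in training
x_new = np.append(x_new, 1.0)              # append the bias term, matching the hstack above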
In [4]:
# Implement linear_loss_grad(W, X, y) to compute the loss and gradient,
# and use numeric gradient checking as a debugging tool to verify the gradient implementation
from algorithms.regression import linear_loss_grad
from algorithms.gradient_check import grad_check_sparse
test_W = np.random.randn(1, X.shape[0]) * 0.001 # [1, D]
grad = linear_loss_grad(test_W, X, y)[1]
f = lambda W: linear_loss_grad(W, X, y)[0]
grad_numerical = grad_check_sparse(f, test_W, grad, 10)
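For reference, here is a minimal sketch of what linear_loss_grad might compute, assuming a mean squared error loss of 0.5/N * ||W X - y||^2 and the shapes used here (W is [1, D], X is [D, N], y is [N]); this is an assumption about the module's contract, not the actual algorithms.regression source:

def linear_loss_grad_sketch(W, X, y):
    # Hypothetical re-implementation for illustration only
    N = X.shape[1]
    residual = W.dot(X) - y            # [1, N] prediction errors
    loss = 0.5 * np.sum(residual ** 2) / N
    grad = residual.dot(X.T) / N       # [1, D] gradient of the loss w.r.t. W
    return loss, grad

grad_check_sparse then compares the analytic gradient against centered differences, (f(W + h*e_i) - f(W - h*e_i)) / (2h), at a handful of randomly chosen coordinates.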
In [5]:
# Compare the naive version (for loop) and vectorized version to compute loss and gradient
import time
from algorithms.regression import linear_loss_grad
from algorithms.regression import linear_loss_grad_naive
tic = time.time()
loss_naive, grad_naive = linear_loss_grad_naive(test_W, X, y)
toc = time.time()
print 'naive loss: %e computed in %fs' % (loss_naive, toc - tic)
tic = time.time()
loss_vectorized, grad_vectorized = linear_loss_grad(test_W, X, y)
toc = time.time()
print 'vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic)
# Use Frobenius norm to compare the two versions of the gradients
grad_diff = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print 'Loss difference: %f' % np.abs(loss_naive - loss_vectorized)
print 'Gradient difference: %f' % grad_diff
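For contrast, the naive version presumably computes the same quantities with explicit loops over examples and weights; a hedged sketch (an assumption about linear_loss_grad_naive, not its source):

def linear_loss_grad_naive_sketch(W, X, y):
    # Hypothetical loop-based version for illustration only
    D, N = X.shape
    loss = 0.0
    grad = np.zeros_like(W)
    for i in xrange(N):                # one pass per training example
        pred = 0.0
        for j in xrange(D):
            pred += W[0, j] * X[j, i]
        diff = pred - y[i]
        loss += 0.5 * diff ** 2
        for j in xrange(D):
            grad[0, j] += diff * X[j, i]
    return loss / N, grad / N

The loops touch every (example, weight) pair one multiply at a time, which is why the vectorized version is typically orders of magnitude faster.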
In [7]:
# Now train the linear regression model with batch gradient descent (BGD)
# and stochastic gradient descent (SGD)
from algorithms.regression import LinearRegression
lr_bgd = LinearRegression()
tic = time.time()
losses_bgd = lr_bgd.train(X, y, method='bgd', learning_rate=1e-2, num_iters=1000, verbose=True)
toc = time.time()
print 'Training time for BGD with vectorized version is %f \n' % (toc - tic)
lr_sgd = LinearRegression()
tic = time.time()
losses_sgd = lr_sgd.train(X, y, method='sgd', learning_rate=1e-2, num_iters=1000, verbose=True)
toc = time.time()
print 'Training time for SGD with vectorized version is %f' % (toc - tic)
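The two methods differ only in how much data each update sees: BGD computes the gradient on the full training set per step, while SGD estimates it from a single randomly chosen example. A hedged sketch of the training loop, reusing linear_loss_grad_sketch from above (this is an assumption about LinearRegression.train, not the actual algorithms.regression source):

def train_sketch(W, X, y, method='bgd', learning_rate=1e-2, num_iters=1000):
    # Hypothetical training loop for illustration only
    losses = []
    N = X.shape[1]
    for it in xrange(num_iters):
        if method == 'bgd':
            loss, grad = linear_loss_grad_sketch(W, X, y)                  # full-batch gradient
        else:
            i = np.random.randint(N)                                       # sample one example
            loss, grad = linear_loss_grad_sketch(W, X[:, i:i+1], y[i:i+1])
        W -= learning_rate * grad                                          # descent step
        losses.append(loss)
    return W, losses

SGD steps are much cheaper, but each uses a noisy gradient estimate, so its loss curve below is expected to be bumpier than BGD's.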
In [8]:
# A useful debugging strategy is to plot the loss
# as a function of iteration number:
from ggplot import *
# plot losses from BGD
qplot(xrange(len(losses_bgd)), losses_bgd) + labs(x='Iteration number', y='BGD loss value')
Out[8]: [ggplot figure: BGD loss value vs. iteration number]
In [9]:
# plot losses from SGD
qplot(xrange(len(losses_sgd)), losses_sgd) + labs(x='Iteration number', y='SGD loss value')
Out[9]: [ggplot figure: SGD loss value vs. iteration number]
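As a closing sanity check, the trained weights can be used to predict a price for the hypothetical house normalized earlier. The attribute name lr_bgd.W below is a guess at where LinearRegression stores its parameters; adjust it to the module's actual API:

# Hypothetical prediction using the learned weights; x_new is the normalized
# example (with bias term) built after the feature-normalization cell above
price = lr_bgd.W.dot(x_new)    # lr_bgd.W is assumed to have shape [1, D]
print 'Predicted price of a 1650 sq-ft, 3-bedroom house: %f' % price[0]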