In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('..')

from helper import linear_regression as lr

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load in raw data, not intercept term
X, y, Xval, yval, Xtest, ytest = lr.load_data()

create polynomial features


In [3]:
lr.poly_features(X, power=3)


Out[3]:
f1 f2 f3
0 -15.936758 253.980260 -4047.621971
1 -29.152979 849.896197 -24777.006175
2 36.189549 1309.683430 47396.852168
3 37.492187 1405.664111 52701.422173
4 -48.058829 2309.651088 -110999.127750
5 -8.941458 79.949670 -714.866612
6 15.307793 234.328523 3587.052500
7 -34.706266 1204.524887 -41804.560890
8 1.389154 1.929750 2.680720
9 -44.383760 1969.918139 -87432.373590
10 7.013502 49.189211 344.988637
11 22.762749 518.142738 11794.353058

prepare polynomial regression data

  1. expand feature with power = 8, or any power you want to use
  2. Apply normalization to combat $x^n$ situation
  3. don't forget intercept term

In [4]:
X_poly, Xval_poly, Xtest_poly= lr.prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]


Out[4]:
array([[  1.00000000e+00,  -3.62140776e-01,  -7.55086688e-01,
          1.82225876e-01,  -7.06189908e-01,   3.06617917e-01,
         -5.90877673e-01,   3.44515797e-01,  -5.08481165e-01],
       [  1.00000000e+00,  -8.03204845e-01,   1.25825266e-03,
         -2.47936991e-01,  -3.27023420e-01,   9.33963187e-02,
         -4.35817606e-01,   2.55416116e-01,  -4.48912493e-01],
       [  1.00000000e+00,   1.37746700e+00,   5.84826715e-01,
          1.24976856e+00,   2.45311974e-01,   9.78359696e-01,
         -1.21556976e-02,   7.56568484e-01,  -1.70352114e-01]])

plot learning curve

again, first we don't apply regularization $\lambda=0$


In [5]:
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)


as you can see the training cost is too low to be true. This is over fitting

try $\lambda=1$


In [6]:
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)


training cost increat a little bit, not 0 anymore.
Say we alleviate over fitting a little bit

try $\lambda=100$


In [7]:
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)


too much regularization.
back to underfit situation

find the best $\lambda$


In [8]:
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []

In [9]:
for l in l_candidate:
    res = lr.linear_regression_np(X_poly, y, l)
    
    tc = lr.cost(res.x, X_poly, y)
    cv = lr.cost(res.x, Xval_poly, yval)
    
    training_cost.append(tc)
    cv_cost.append(cv)

In [10]:
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)

plt.xlabel('lambda')

plt.ylabel('cost')


Out[10]:
<matplotlib.text.Text at 0x11a410ef0>

In [11]:
# best cv I got from all those candidates
l_candidate[np.argmin(cv_cost)]


Out[11]:
1

In [12]:
# use test data to compute the cost
for l in l_candidate:
    theta = lr.linear_regression_np(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, lr.cost(theta, Xtest_poly, ytest)))


test cost(l=0) = 9.982275423899827
test cost(l=0.001) = 10.96403493885111
test cost(l=0.003) = 11.264458872657682
test cost(l=0.01) = 10.880094765571297
test cost(l=0.03) = 10.022266931655883
test cost(l=0.1) = 8.632063139750382
test cost(l=0.3) = 7.336640278544401
test cost(l=1) = 7.466289435179381
test cost(l=3) = 11.64393193727906
test cost(l=10) = 27.715080291767972

turns out $\lambda = 0.3$ is even better choice XD


In [ ]: