In [1]:

    
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('..')

from helper import linear_regression as lr

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [2]:

    
# Load the raw data; per the original note it does not contain the intercept term yet.
# X/y, Xval/yval and Xtest/ytest are used below as the training,
# cross validation and test sets respectively.
X, y, Xval, yval, Xtest, ytest = lr.load_data()

create polynomial features



In [3]:

    
# Preview the polynomial expansion of X up to the 3rd power
# (shown in the output as columns f1, f2, f3).
lr.poly_features(X, power=3)









    Out[3]:






  
    
      
      f1
      f2
      f3
    
  
  
    
      0
      -15.936758
      253.980260
      -4047.621971
    
    
      1
      -29.152979
      849.896197
      -24777.006175
    
    
      2
      36.189549
      1309.683430
      47396.852168
    
    
      3
      37.492187
      1405.664111
      52701.422173
    
    
      4
      -48.058829
      2309.651088
      -110999.127750
    
    
      5
      -8.941458
      79.949670
      -714.866612
    
    
      6
      15.307793
      234.328523
      3587.052500
    
    
      7
      -34.706266
      1204.524887
      -41804.560890
    
    
      8
      1.389154
      1.929750
      2.680720
    
    
      9
      -44.383760
      1969.918139
      -87432.373590
    
    
      10
      7.013502
      49.189211
      344.988637
    
    
      11
      22.762749
      518.142738
      11794.353058

prepare polynomial regression data

Expand the feature with power = 8, or any power you want to use.
Apply normalization, because the raw $x^n$ columns grow to wildly different scales.
Don't forget the intercept term.



In [4]:

    
# Build the power-8 polynomial feature matrices for the training,
# cross validation and test inputs.
X_poly, Xval_poly, Xtest_poly = lr.prepare_poly_data(X, Xval, Xtest, power=8)

# Show the first three rows of the training matrix.
X_poly[:3]









    Out[4]:





array([[  1.00000000e+00,  -3.62140776e-01,  -7.55086688e-01,
          1.82225876e-01,  -7.06189908e-01,   3.06617917e-01,
         -5.90877673e-01,   3.44515797e-01,  -5.08481165e-01],
       [  1.00000000e+00,  -8.03204845e-01,   1.25825266e-03,
         -2.47936991e-01,  -3.27023420e-01,   9.33963187e-02,
         -4.35817606e-01,   2.55416116e-01,  -4.48912493e-01],
       [  1.00000000e+00,   1.37746700e+00,   5.84826715e-01,
          1.24976856e+00,   2.45311974e-01,   9.78359696e-01,
         -1.21556976e-02,   7.56568484e-01,  -1.70352114e-01]])

plot learning curve

Again, we first apply no regularization, i.e. $\lambda=0$.



In [5]:

    
# Learning curve on the power-8 polynomial features without regularization (l=0).
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)

As you can see, the training cost is too low to be true. This is overfitting.

try $\lambda=1$



In [6]:

    
# Same learning curve, now with regularization strength l=1.
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)

The training cost increases a little bit; it is not 0 anymore.
We can say that we have alleviated the overfitting a little bit.

try $\lambda=100$



In [7]:

    
# Same learning curve, now with a much larger regularization strength (l=100).
lr.plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)

Too much regularization:
we are back in an underfitting situation.

find the best $\lambda$



In [8]:

    
# Regularization strengths to compare; 0 means no regularization at all.
l_candidate = [
    0,
    0.001, 0.003,
    0.01, 0.03,
    0.1, 0.3,
    1, 3,
    10,
]

# Accumulators for the sweep: one cost per candidate is appended to each list.
training_cost = []
cv_cost = []



In [9]:

    
# Start from empty lists every time this cell runs. Without this reset,
# re-running the cell keeps appending, so the lists grow longer than
# l_candidate and no longer line up with it (which breaks the plot and the
# argmin lookup further down).
training_cost, cv_cost = [], []

for l in l_candidate:
    # Fit on the training set with regularization strength l.
    res = lr.linear_regression_np(X_poly, y, l)

    # Evaluate the fitted parameters (res.x) on the training and cross
    # validation sets; note that lr.cost is not passed l here.
    tc = lr.cost(res.x, X_poly, y)
    cv = lr.cost(res.x, Xval_poly, yval)

    training_cost.append(tc)
    cv_cost.append(cv)



In [10]:

    
# Training and cross validation cost of each fitted model, plotted against
# the regularization strength it was trained with.
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)  # loc=2 places the legend in the upper-left corner

plt.xlabel('lambda')
plt.ylabel('cost')

# Render the figure explicitly so the cell does not echo the Text object
# returned by the last call (plt.ylabel) as its output.
plt.show()









    Out[10]:





<matplotlib.text.Text at 0x11a410ef0>



In [11]:

    
# Pick the candidate whose cross validation cost is the smallest.
best_index = np.argmin(cv_cost)
l_candidate[best_index]









    Out[11]:





1



In [12]:

    
# Refit on the training set for every candidate and report the cost that
# the fitted parameters reach on the test set.
for l in l_candidate:
    fit = lr.linear_regression_np(X_poly, y, l)
    theta = fit.x
    test_cost = lr.cost(theta, Xtest_poly, ytest)
    print('test cost(l={}) = {}'.format(l, test_cost))









    



test cost(l=0) = 9.982275423899827
test cost(l=0.001) = 10.96403493885111
test cost(l=0.003) = 11.264458872657682
test cost(l=0.01) = 10.880094765571297
test cost(l=0.03) = 10.022266931655883
test cost(l=0.1) = 8.632063139750382
test cost(l=0.3) = 7.336640278544401
test cost(l=1) = 7.466289435179381
test cost(l=3) = 11.64393193727906
test cost(l=10) = 27.715080291767972

It turns out that $\lambda = 0.3$ gives an even lower cost on the test set than the $\lambda = 1$ picked by cross validation XD



In [ ]:

	f1	f2	f3
0	-15.936758	253.980260	-4047.621971
1	-29.152979	849.896197	-24777.006175
2	36.189549	1309.683430	47396.852168
3	37.492187	1405.664111	52701.422173
4	-48.058829	2309.651088	-110999.127750
5	-8.941458	79.949670	-714.866612
6	15.307793	234.328523	3587.052500
7	-34.706266	1204.524887	-41804.560890
8	1.389154	1.929750	2.680720
9	-44.383760	1969.918139	-87432.373590
10	7.013502	49.189211	344.988637
11	22.762749	518.142738	11794.353058