In [119]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from math import ceil
In [141]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression/datasets/'
sales = pd.read_csv(regressionDir + 'kc_house_data.csv', dtype = dtype_dict)
sales = sales.sort_values(['sqft_living','price'])
# dtype_dict same as above
set_1 = pd.read_csv(regressionDir + 'wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv(regressionDir + 'wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv(regressionDir + 'wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv(regressionDir + 'wk3_kc_house_set_4_data.csv', dtype=dtype_dict)
train_valid_shuffled = pd.read_csv(regressionDir + 'wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv(regressionDir + 'wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv(regressionDir + 'wk3_kc_house_train_data.csv', dtype=dtype_dict)
In [121]:
# Show plots inline in Jupyter
%matplotlib inline
sales.head()
Out[121]:
In [122]:
sales['price'].head()
Out[122]:
In [123]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pd.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature
    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power
            poly_dataframe[name] = feature.apply(lambda x: x**power)
    return poly_dataframe
In [124]:
poly15_data = polynomial_dataframe(sales['sqft_living'], 15) # use equivalent of `polynomial_sframe`
print(poly15_data)
In [125]:
l2_small_penalty = 1.5e-5
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])
Out[125]:
In [126]:
model.coef_
Out[126]:
In [127]:
plt.plot(poly15_data['power_1'], sales['price'], '.',
         poly15_data['power_1'], model.predict(poly15_data), '-')
plt.show()
In [128]:
l2_small_penalty=1e-9
poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])
poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])
poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])
poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])
Out[128]:
In [129]:
plt.plot(poly15_data_set1['power_1'], set_1['price'], '.',
         poly15_data_set1['power_1'], model1.predict(poly15_data_set1), '-')
plt.show()
plt.plot(poly15_data_set2['power_1'], set_2['price'], '.',
         poly15_data_set2['power_1'], model2.predict(poly15_data_set2), '-')
plt.show()
plt.plot(poly15_data_set3['power_1'], set_3['price'], '.',
         poly15_data_set3['power_1'], model3.predict(poly15_data_set3), '-')
plt.show()
plt.plot(poly15_data_set4['power_1'], set_4['price'], '.',
         poly15_data_set4['power_1'], model4.predict(poly15_data_set4), '-')
plt.show()
In [130]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)
In [131]:
l2_large_penalty=1.23e2
poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])
poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])
poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])
poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])
Out[131]:
In [132]:
plt.plot(poly15_data_set1['power_1'], set_1['price'], '.',
         poly15_data_set1['power_1'], model1.predict(poly15_data_set1), '-')
plt.show()
plt.plot(poly15_data_set2['power_1'], set_2['price'], '.',
         poly15_data_set2['power_1'], model2.predict(poly15_data_set2), '-')
plt.show()
plt.plot(poly15_data_set3['power_1'], set_3['price'], '.',
         poly15_data_set3['power_1'], model3.predict(poly15_data_set3), '-')
plt.show()
plt.plot(poly15_data_set4['power_1'], set_4['price'], '.',
         poly15_data_set4['power_1'], model4.predict(poly15_data_set4), '-')
plt.show()
In [133]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)
Just like the polynomial degree, the L2 penalty is a "magic" parameter we need to select. We could use the validation set approach as we did in the last module, but that approach has a major disadvantage: it leaves fewer observations available for training. Cross-validation seeks to overcome this issue by using all of the training set in a smart way.
We will implement a kind of cross-validation called k-fold cross-validation. The method gets its name because it involves dividing the training set into k segments of roughly equal size. As in the validation set method, we measure the validation error with one of the segments designated as the validation set. The major difference is that we repeat the process k times as follows:
Set aside segment 0 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
Set aside segment 1 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
...
Set aside segment k-1 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
After this process, we compute the average of the k validation errors, and use it as an estimate of the generalization error. Notice that all observations are used for both training and validation, as we iterate over segments of data.
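Before writing the full loop, it helps to check where the segment boundaries fall. A minimal sketch below prints the row range of each fold; the index arithmetic matches the (commented-out) print in the function that follows, and k = 10 is an assumption carried over from the calls later in this notebook.
In [ ]:
# Sketch: print the (start, end) row range of each validation segment.
# Assumes train_valid_shuffled has been loaded as above and k = 10 folds.
n = len(train_valid_shuffled)
k = 10
for i in range(k):
    start = (n * i) // k
    end = (n * (i + 1)) // k - 1
    print(i, (start, end))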
In [134]:
def k_fold_cross_validation(k, l2_penalty, data, output):
    n = len(data)
    sumRSS = 0
    for i in range(k):
        # Boundaries of the i-th validation segment
        start = (n * i) // k
        end = (n * (i + 1)) // k - 1
        # Slice out the validation segment
        valid_data = data.iloc[start:end+1]
        valid_output = output.iloc[start:end+1]
        # Stitch the remaining rows together as the training set
        train_data = pd.concat([data.iloc[0:start], data.iloc[end+1:n]])
        train_output = pd.concat([output.iloc[0:start], output.iloc[end+1:n]])
        # Train the model on the training folds only
        model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
        model.fit(train_data, train_output)
        # Accumulate the RSS on the held-out validation segment
        RSS = ((valid_output - model.predict(valid_data)) ** 2).sum()
        sumRSS += RSS
    # Average validation error over the k folds
    return sumRSS / k
In [135]:
print(k_fold_cross_validation(10, 1e-9, poly15_data_set2, set_2['price']))
In [136]:
import sys
l2s = np.logspace(3, 9, num=13)
train_valid_shuffled_poly15 = polynomial_dataframe(train_valid_shuffled['sqft_living'], 15)
k = 10
minError = sys.maxsize
for l2 in l2s:
    avgError = k_fold_cross_validation(k, l2, train_valid_shuffled_poly15, train_valid_shuffled['price'])
    print('For l2:', l2, ' the CV error is ', avgError)
    if avgError < minError:
        minError = avgError
        bestl2 = l2
print(minError)
print(bestl2)
In [161]:
model = linear_model.Ridge(alpha=1000, normalize=True)
model.fit(training[['sqft_living']], training['price'])
Out[161]:
In [163]:
print("Residual sum of squares: %.2f"
% ((model.predict(test[['sqft_living']]) - test['price']) ** 2).sum())
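Note that the cell above fits on the raw sqft_living column, while the penalty was selected by cross-validating the degree-15 features. A hedged sketch of the matching refit, assuming `bestl2` from the grid search above is the intended penalty:
In [ ]:
# Sketch: refit the degree-15 model on the full training set with the
# selected penalty, then score it on the test set.
# `bestl2` is assumed to come from the grid search above.
poly15_training = polynomial_dataframe(training['sqft_living'], 15)
final_model = linear_model.Ridge(alpha=bestl2, normalize=True)
final_model.fit(poly15_training, training['price'])
poly15_test = polynomial_dataframe(test['sqft_living'], 15)
print("Test RSS: %.2f"
      % ((final_model.predict(poly15_test) - test['price']) ** 2).sum())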