In [2]:
import pandas as pd

# Explicit dtype for every CSV column, grouped by type so the schema is easy
# to scan.  Passed to every pd.read_csv call in this notebook.
dtype_dict = {
    # text columns
    'id': str,
    'date': str,
    'zipcode': str,
    # floating-point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Load the complete sales table, forcing each column to the dtype declared in
# dtype_dict rather than letting pandas infer it.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
)

In [3]:
from math import log, sqrt
# Add the derived feature columns to `sales` in place.
# Square roots of the two area columns ...
for derived_name, source_name in (('sqft_living_sqrt', 'sqft_living'),
                                  ('sqft_lot_sqrt', 'sqft_lot')):
    sales[derived_name] = sales[source_name].apply(sqrt)
# ... and squares of the bedroom and floor counts.
for derived_name, source_name in (('bedrooms_square', 'bedrooms'),
                                  ('floors_square', 'floors')):
    sales[derived_name] = sales[source_name] * sales[source_name]

In [129]:
from sklearn import linear_model  # using scikit-learn

# Every column fed to the Lasso models: raw columns plus the engineered
# *_sqrt / *_square columns.  The order here is the order of model.coef_.
all_features = [
    # room counts
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    # areas
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    # floors
    'floors',
    'floors_square',
    # ratings / flags
    'waterfront',
    'view',
    'condition',
    'grade',
    # area breakdown
    'sqft_above',
    'sqft_basement',
    # years
    'yr_built',
    'yr_renovated',
]

In [130]:
# Fit a Lasso (L1-penalised) regression of price on every feature, using the
# full `sales` table and a penalty of 500.
# NOTE(review): the `normalize=` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
model_all = linear_model.Lasso(normalize=True, alpha=500.0)
model_all.fit(X=sales[all_features], y=sales['price'])


Out[130]:
Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [138]:
import numpy as np
# Pair each feature name with its fitted weight from `model_all`.
# The frame is built directly from the two columns: the original first
# allocated a throwaway matrix of zeros and then overwrote both columns, one of
# them through a needless (n, 1) reshape.  `columns=` pins the column order.
model_all_coefs = pd.DataFrame(
    {'feature': all_features, 'coef': model_all.coef_},
    columns=['feature', 'coef'],
)
print(model_all_coefs)


             feature          coef
0           bedrooms      0.000000
1    bedrooms_square      0.000000
2          bathrooms      0.000000
3        sqft_living    134.439314
4   sqft_living_sqrt      0.000000
5           sqft_lot      0.000000
6      sqft_lot_sqrt      0.000000
7             floors      0.000000
8      floors_square      0.000000
9         waterfront      0.000000
10              view  24750.004586
11         condition      0.000000
12             grade  61749.103091
13        sqft_above      0.000000
14     sqft_basement      0.000000
15          yr_built     -0.000000
16      yr_renovated      0.000000

In [139]:
# Load the test / train / validation splits (read in that order), each with
# the same explicit column dtypes as the full table.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [140]:
def add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    Columns added, in this order:
      * ``sqft_living_sqrt``, ``sqft_lot_sqrt`` -- element-wise square root
      * ``bedrooms_square``, ``floors_square``  -- element-wise square

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``sqft_living``, ``sqft_lot``, ``bedrooms`` and ``floors``.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenience.
    """
    for source in ('sqft_living', 'sqft_lot'):
        frame[source + '_sqrt'] = frame[source].apply(sqrt)
    for source in ('bedrooms', 'floors'):
        frame[source + '_square'] = frame[source] * frame[source]
    return frame


# Apply the identical transformation to every split so the three frames
# cannot drift apart (this was previously three copy-pasted stanzas).
add_engineered_features(testing)
add_engineered_features(training)
add_engineered_features(validation)

In [141]:
# Coarse search grid: 13 L1 penalties, evenly spaced in log10 from 1e1 to 1e7.
penalties = np.logspace(start=1, stop=7, num=13)

In [142]:
# Sweep the coarse penalty grid.  For every penalty: fit a Lasso model on the
# training split, record its coefficients as one column of
# `multy_penalties_coefs`, and compute the residual sum of squares (RSS) on the
# validation split.  The model with the lowest validation RSS is remembered.
residuals = []
# One label per penalty, computed once and reused for both the column header
# and the per-iteration assignment so the two can never disagree.
penalty_labels = ['Penalty(%s)' % str(x) for x in penalties]
# The 'featuers' spelling is the existing column key; it is kept unchanged.
all_penalties_columns = ['featuers'] + penalty_labels
multy_penalties_coefs = pd.DataFrame(np.zeros((len(all_features), len(penalties) + 1)), columns=all_penalties_columns)
multy_penalties_coefs['featuers'] = all_features
best_model_rss = None
best_model = None
best_non_zero_count = None
for l1_penalty, penalty_label in zip(penalties, penalty_labels):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    multy_penalties_coefs[penalty_label] = model.coef_
    error = model.predict(validation[all_features]) - validation['price'].values
    current_rss = error.dot(error)
    # Compare against None explicitly: a truthiness test would treat a
    # legitimate RSS of 0.0 as "no best model yet" and let a worse model
    # overwrite it.
    if best_model_rss is None or current_rss < best_model_rss:
        best_model_rss = current_rss
        best_model = model
        # Non-zero weights = non-zero coefficients plus a non-zero intercept.
        best_non_zero_count = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    residuals.append(current_rss)
print(multy_penalties_coefs)


            featuers  Penalty(10.0)  Penalty(31.6227766017)  Penalty(100.0)  \
0           bedrooms  -16144.562757           -17381.966510   -10062.924265   
1    bedrooms_square     373.245384              119.275717        0.000000   
2          bathrooms   50841.243340            42651.730910    22103.055985   
3        sqft_living     617.853560              419.222928      169.564623   
4   sqft_living_sqrt  -44411.354867           -24602.925286       -0.000000   
5           sqft_lot       0.785623                0.239764       -0.000000   
6      sqft_lot_sqrt    -701.194765             -355.793616      -98.897828   
7             floors      -0.000000                0.000000        0.000000   
8      floors_square    5014.200457             5361.906420     2010.345927   
9         waterfront  619488.752486           598660.591456   548282.395707   
10              view   38041.855652            40513.888580    42538.679778   
11         condition   24998.771838            17411.366912     2855.781295   
12             grade  128716.234621           125151.940724   119373.956339   
13        sqft_above       0.000000                0.000000        0.000000   
14     sqft_basement       0.000000                0.000000        0.000000   
15          yr_built   -3293.831180            -3335.668558    -3039.561989   
16      yr_renovated      10.057321                4.308570        0.000000   

    Penalty(316.227766017)  Penalty(1000.0)  Penalty(3162.27766017)  \
0                -0.000000         0.000000                       0   
1                 0.000000         0.000000                       0   
2                 0.000000         0.000000                       0   
3               162.053761       132.372355                       0   
4                 0.000000         0.000000                       0   
5                -0.000000         0.000000                       0   
6                -0.000000         0.000000                       0   
7                 0.000000         0.000000                       0   
8                 0.000000         0.000000                       0   
9            378573.417934         0.000000                       0   
10            39163.580947      3184.913407                       0   
11                0.000000         0.000000                       0   
12            98530.759422     50123.826159                       0   
13                0.000000         0.000000                       0   
14                0.000000         0.000000                       0   
15            -1611.312775        -0.000000                       0   
16                0.000000         0.000000                       0   

    Penalty(10000.0)  Penalty(31622.7766017)  Penalty(100000.0)  \
0                  0                       0                  0   
1                  0                       0                  0   
2                  0                       0                  0   
3                  0                       0                  0   
4                  0                       0                  0   
5                  0                       0                  0   
6                  0                       0                  0   
7                  0                       0                  0   
8                  0                       0                  0   
9                  0                       0                  0   
10                 0                       0                  0   
11                 0                       0                  0   
12                 0                       0                  0   
13                 0                       0                  0   
14                 0                       0                  0   
15                 0                       0                  0   
16                 0                       0                  0   

    Penalty(316227.766017)  Penalty(1000000.0)  Penalty(3162277.66017)  \
0                        0                   0                       0   
1                        0                   0                       0   
2                        0                   0                       0   
3                        0                   0                       0   
4                        0                   0                       0   
5                        0                   0                       0   
6                        0                   0                       0   
7                        0                   0                       0   
8                        0                   0                       0   
9                        0                   0                       0   
10                       0                   0                       0   
11                       0                   0                       0   
12                       0                   0                       0   
13                       0                   0                       0   
14                       0                   0                       0   
15                       0                   0                       0   
16                       0                   0                       0   

    Penalty(10000000.0)  
0                     0  
1                     0  
2                     0  
3                     0  
4                     0  
5                     0  
6                     0  
7                     0  
8                     0  
9                     0  
10                    0  
11                    0  
12                    0  
13                    0  
14                    0  
15                    0  
16                    0  

In [143]:
# Tabulate the validation RSS recorded for each penalty in the sweep.
rss = pd.DataFrame(
    {'l1_penalty': penalties, 'rss': residuals},
    columns=['l1_penalty', 'rss'],
)
rss


Out[143]:
l1_penalty rss
0 10.000000 3.982133e+14
1 31.622777 3.990419e+14
2 100.000000 4.297916e+14
3 316.227766 4.637398e+14
4 1000.000000 6.458987e+14
5 3162.277660 1.222507e+15
6 10000.000000 1.222507e+15
7 31622.776602 1.222507e+15
8 100000.000000 1.222507e+15
9 316227.766017 1.222507e+15
10 1000000.000000 1.222507e+15
11 3162277.660168 1.222507e+15
12 10000000.000000 1.222507e+15

In [144]:
# Position of the smallest validation RSS (first one on ties), then the
# penalty at that position.
best_model_index = min(range(len(residuals)), key=residuals.__getitem__)
penalties[best_model_index]


Out[144]:
10.0

In [72]:
# Lowest validation RSS found by the penalty sweep.
best_model_rss


Out[72]:
398213327300135.0

In [73]:
# Number of non-zero weights (coefficients plus intercept) in the best model
# found by the penalty sweep.
best_non_zero_count


Out[73]:
15

In [161]:
# Count the non-zero weights (coefficients plus intercept) of a Lasso model
# fitted on the training split at each of 20 penalties, log-spaced from 1e1 to
# 1e4.  `max_nonzeros` is the target sparsity used by the following cells.
# NOTE: this rebinds `penalties`, replacing any grid assigned earlier.
max_nonzeros = 7
penalties = np.logspace(start=1, stop=4, num=20)
non_zeros_count = []
for l1_penalty in penalties:
    model = linear_model.Lasso(normalize=True, alpha=l1_penalty)
    model.fit(training[all_features], training['price'])
    nonzero_coefs = np.count_nonzero(model.coef_)
    nonzero_intercept = np.count_nonzero(model.intercept_)
    non_zeros_count.append(nonzero_coefs + nonzero_intercept)

In [100]:
# Summary matrix with one row per penalty and three float columns:
# [position in the grid, non-zero weight count, penalty value].
non_zeros_matrix = np.column_stack((
    np.arange(len(penalties)),
    non_zeros_count,
    penalties,
)).astype(float)

In [164]:
# Find the penalty range worth searching more finely.  `boundaries` collects
# every grid index whose model has exactly `max_nonzeros` non-zero weights,
# plus the neighbouring index on either side when that neighbour is on the
# "too dense" (previous) or "too sparse" (next) side of the target.
boundaries = []
last_index = len(non_zeros_count) - 1
for i, x in enumerate(non_zeros_count):
    if x != max_nonzeros:
        continue
    boundaries.append(i)
    if i > 0 and non_zeros_count[i - 1] > max_nonzeros:
        boundaries.append(i - 1)
    # The look-ahead must stop one short of the end.  The previous guard,
    # `i < len(non_zeros_count)`, was always true, so a match on the final
    # grid entry indexed past the end of the list and raised IndexError.
    if i < last_index and non_zeros_count[i + 1] < max_nonzeros:
        boundaries.append(i + 1)
print(boundaries)
if not boundaries:
    # Fail with an actionable message instead of the bare "min() arg is an
    # empty sequence" that min([]) would produce.
    raise ValueError('no penalty in the grid yields exactly %d non-zero weights' % max_nonzeros)
l1_penalty_min_index = min(boundaries)
l1_penalty_max_index = max(boundaries)
l1_penalty_min = penalties[l1_penalty_min_index]
l1_penalty_max = penalties[l1_penalty_max_index]
# Show the upper end of the range (previously a hard-coded `penalties[9]`).
print(l1_penalty_max)


[8, 7, 9]
263.665089873

In [165]:
# Lower and upper ends of the penalty range chosen for the finer search.
print(l1_penalty_min)
print(l1_penalty_max)


127.42749857
263.665089873

In [166]:
# Columns: position in the penalty grid, non-zero weight count, penalty value.
non_zeros_matrix


Out[166]:
array([[  0.00000000e+00,   1.50000000e+01,   1.00000000e+01],
       [  1.00000000e+00,   1.50000000e+01,   1.43844989e+01],
       [  2.00000000e+00,   1.50000000e+01,   2.06913808e+01],
       [  3.00000000e+00,   1.50000000e+01,   2.97635144e+01],
       [  4.00000000e+00,   1.30000000e+01,   4.28133240e+01],
       [  5.00000000e+00,   1.20000000e+01,   6.15848211e+01],
       [  6.00000000e+00,   1.10000000e+01,   8.85866790e+01],
       [  7.00000000e+00,   1.00000000e+01,   1.27427499e+02],
       [  8.00000000e+00,   7.00000000e+00,   1.83298071e+02],
       [  9.00000000e+00,   6.00000000e+00,   2.63665090e+02],
       [  1.00000000e+01,   6.00000000e+00,   3.79269019e+02],
       [  1.10000000e+01,   6.00000000e+00,   5.45559478e+02],
       [  1.20000000e+01,   5.00000000e+00,   7.84759970e+02],
       [  1.30000000e+01,   3.00000000e+00,   1.12883789e+03],
       [  1.40000000e+01,   3.00000000e+00,   1.62377674e+03],
       [  1.50000000e+01,   2.00000000e+00,   2.33572147e+03],
       [  1.60000000e+01,   1.00000000e+00,   3.35981829e+03],
       [  1.70000000e+01,   1.00000000e+00,   4.83293024e+03],
       [  1.80000000e+01,   1.00000000e+00,   6.95192796e+03],
       [  1.90000000e+01,   1.00000000e+00,   1.00000000e+04]])

In [169]:
# Fine search: try 20 evenly spaced penalties between l1_penalty_min and
# l1_penalty_max, keep only models with exactly `max_nonzeros` non-zero
# weights (coefficients plus intercept), and remember the one with the lowest
# RSS on the validation split.
# (`best_max_featuers_rss` keeps its existing spelling so that any other cell
# referring to it still works.)
best_max_features_model = None
best_max_featuers_rss = None
best_max_features_penalty = None
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    feature_count = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if feature_count != max_nonzeros:
        # Wrong sparsity: not a candidate, so skip the validation pass.
        continue
    predictions = model.predict(validation[all_features])
    errors = predictions - validation['price'].values
    # Named `validation_rss` rather than `rss` so the loop does not clobber
    # the `rss` DataFrame built earlier in the notebook.
    validation_rss = errors.dot(errors)
    # Explicit None check: a truthiness test would treat an RSS of 0.0 as
    # "nothing selected yet".
    if best_max_featuers_rss is None or validation_rss < best_max_featuers_rss:
        best_max_featuers_rss = validation_rss
        best_max_features_model = model
        best_max_features_penalty = l1_penalty

In [170]:
# Penalty of the lowest-validation-RSS model that has exactly `max_nonzeros`
# non-zero weights (None if no candidate in the fine search qualified).
best_max_features_penalty


Out[170]:
156.10909673930755

In [172]:
# Pair each feature name with its weight in the selected sparse model.
best_max_feature_coefs = pd.DataFrame(
    {'features': all_features, 'coefs': best_max_features_model.coef_},
    columns=['features', 'coefs'],
)
print(best_max_feature_coefs)


            features          coefs
0           bedrooms      -0.000000
1    bedrooms_square      -0.000000
2          bathrooms   10610.890284
3        sqft_living     163.380252
4   sqft_living_sqrt       0.000000
5           sqft_lot      -0.000000
6      sqft_lot_sqrt      -0.000000
7             floors       0.000000
8      floors_square       0.000000
9         waterfront  506451.687115
10              view   41960.043555
11         condition       0.000000
12             grade  116253.553700
13        sqft_above       0.000000
14     sqft_basement       0.000000
15          yr_built   -2612.234880
16      yr_renovated       0.000000

In [ ]: