In [2]:
import pandas as pd
# Column -> dtype mapping passed to pd.read_csv for the house-sales CSVs.
# 'id', 'zipcode' and 'date' are read as strings rather than numbers.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set; used below to fit the first Lasso model.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
In [3]:
from math import log, sqrt
# Derived columns on the full data set: square roots of the two area
# columns, then squares of the bedroom and floor counts.
for area_column in ('sqft_living', 'sqft_lot'):
    sales[area_column + '_sqrt'] = sales[area_column].apply(sqrt)

for count_column in ('bedrooms', 'floors'):
    sales[count_column + '_square'] = sales[count_column] * sales[count_column]
In [129]:
from sklearn import linear_model # using scikit-learn
# Input columns for every Lasso fit in this notebook: the raw columns plus
# the engineered '*_square' / '*_sqrt' terms. Order matters because the
# coefficient tables pair this list positionally with `coef_`.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
In [130]:
# Lasso fit on the full data set with a fixed l1 penalty of 5e2.
# NOTE(review): the `normalize=` argument is not accepted by recent
# scikit-learn releases - confirm the version this notebook is pinned to.
model_all = linear_model.Lasso(alpha=5e2, normalize=True).fit(
    sales[all_features],
    sales['price'],
)
model_all  # fit() returns the estimator itself; show it as the cell output
Out[130]:
In [138]:
import numpy as np
# Pair each feature name with the weight the full-data Lasso fit gave it.
# The frame is built straight from two 1-D columns: no float zero frame is
# pre-filled and then overwritten, and no (n, 1)-shaped array is pushed
# into a single column.
model_all_coefs = pd.DataFrame({
    'feature': all_features,
    'coef': model_all.coef_,
})
print(model_all_coefs)
In [139]:
# Load the three pre-made splits with the same dtype map as the full file.
# Read order is unchanged: test, train, validation.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)
In [140]:
def add_engineered_features(frame):
    """Add the derived columns listed in ``all_features`` to ``frame``.

    Creates 'sqft_living_sqrt', 'sqft_lot_sqrt', 'bedrooms_square' and
    'floors_square' from the matching raw columns.

    Args:
        frame: DataFrame holding 'sqft_living', 'sqft_lot', 'bedrooms'
            and 'floors' columns. It is modified in place.

    Returns:
        The same ``frame`` object, for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One shared transform for all three splits, so they cannot drift apart
# the way three hand-copied blocks can.
for split_frame in (testing, training, validation):
    add_engineered_features(split_frame)
In [141]:
# Coarse l1-penalty grid: 13 log-spaced values from 10**1 to 10**7.
penalties = np.logspace(start=1, stop=7, num=13)
In [142]:
# Sweep the coarse penalty grid: fit on the training split, score by RSS on
# the validation split, and keep both the per-penalty weights and the best
# model seen so far.
residuals = []
all_penalties_columns = ['featuers'] + ['Penalty(%s)' % str(x) for x in penalties]
multy_penalties_coefs = pd.DataFrame(
    np.zeros((len(all_features), len(penalties) + 1)),
    columns=all_penalties_columns,
)
multy_penalties_coefs['featuers'] = all_features

best_model_rss = None
best_model = None
best_non_zero_count = None

for l1_penalty in penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])

    # Same label format as `all_penalties_columns`, so this fills the
    # pre-created column instead of adding a new one.
    multy_penalties_coefs['Penalty(%s)' % str(l1_penalty)] = model.coef_

    error = model.predict(validation[all_features]) - validation['price'].values
    current_rss = error.T.dot(error)

    # Compare against None explicitly: an RSS of exactly 0.0 is falsy and
    # would otherwise be mistaken for "no best model yet" and overwritten.
    if best_model_rss is None or current_rss < best_model_rss:
        best_model_rss = current_rss
        best_model = model
        # The count includes the intercept when it is non-zero.
        best_non_zero_count = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    residuals.append(current_rss)

print(multy_penalties_coefs)
In [143]:
# Validation RSS for each penalty in the sweep, one row per penalty.
rss = pd.DataFrame({
    'l1_penalty': penalties,
    'rss': residuals,
})
rss
Out[143]:
In [144]:
# Position of the first smallest validation RSS, then the penalty at that
# position.
best_model_index = min(range(len(residuals)), key=residuals.__getitem__)
penalties[best_model_index]
Out[144]:
In [72]:
# Lowest validation RSS recorded during the penalty sweep.
best_model_rss
Out[72]:
In [73]:
# Non-zero weights plus non-zero intercept of the lowest-validation-RSS
# model from the penalty sweep.
best_non_zero_count
Out[73]:
In [161]:
# Target sparsity, and a finer penalty grid (20 log-spaced values from
# 10**1 to 10**4). Note that this rebinds `penalties`, replacing the
# 13-value grid defined earlier.
max_nonzeros = 7
penalties = np.logspace(1, 4, num=20)

train_inputs = training[all_features]
train_target = training['price']

# For each penalty, record how many terms (weights + intercept) the
# training fit leaves non-zero.
non_zeros_count = []
for l1_penalty in penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True).fit(train_inputs, train_target)
    nonzero_weights = np.count_nonzero(model.coef_)
    nonzero_intercept = np.count_nonzero(model.intercept_)
    non_zeros_count.append(nonzero_weights + nonzero_intercept)
In [100]:
# Lookup table with one row per penalty.
# Columns: grid index, non-zero term count, l1 penalty.
non_zeros_matrix = np.column_stack((
    np.arange(len(penalties)),
    non_zeros_count,
    penalties,
))
In [164]:
# Find the slice of the penalty grid that brackets `max_nonzeros`: every grid
# index whose fit has exactly that many non-zero terms, widened by one
# neighbour on each side where the count crosses the target (more terms just
# before, fewer terms just after).
boundaries = []
last_index = len(non_zeros_count) - 1
for i, x in enumerate(non_zeros_count):
    if x == max_nonzeros:
        boundaries.append(i)
        if i > 0 and non_zeros_count[i - 1] > max_nonzeros:
            boundaries.append(i - 1)
        # Bound by last_index, not len(): i + 1 must stay a valid index, or a
        # match on the final grid point raises IndexError.
        if i < last_index and non_zeros_count[i + 1] < max_nonzeros:
            boundaries.append(i + 1)
print(boundaries)

if not boundaries:
    # Fail with an actionable message instead of min()'s "empty sequence".
    raise ValueError(
        'no penalty in the grid yields exactly %d non-zero terms; '
        'use a finer or wider penalty grid' % max_nonzeros
    )

l1_penalty_min_index = min(boundaries)
l1_penalty_max_index = max(boundaries)
l1_penalty_min = penalties[l1_penalty_min_index]
l1_penalty_max = penalties[l1_penalty_max_index]

# NOTE(review): the hard-coded index 9 looks like leftover debugging output -
# confirm before removing.
print(penalties[9])
In [165]:
# Lower and upper ends of the narrowed penalty range, one per line.
print(l1_penalty_min, l1_penalty_max, sep='\n')
In [166]:
# One row per penalty; columns are grid index, non-zero term count, l1 penalty.
non_zeros_matrix
Out[166]:
In [169]:
# Fine search inside [l1_penalty_min, l1_penalty_max]: among the fits that
# keep exactly `max_nonzeros` non-zero terms (weights + intercept), keep the
# one with the lowest RSS on the validation split.
best_max_features_model = None
best_max_featuers_rss = None
best_max_features_penalty = None

for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])

    predictions = model.predict(validation[all_features])
    errors = predictions - validation['price'].values
    # Named `candidate_rss` so the scalar does not overwrite the `rss`
    # DataFrame built in an earlier cell.
    candidate_rss = errors.T.dot(errors)
    feature_count = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    # Explicit None check: an RSS of exactly 0.0 is falsy and would
    # otherwise be treated as "nothing recorded yet".
    is_improvement = best_max_featuers_rss is None or candidate_rss < best_max_featuers_rss
    if feature_count == max_nonzeros and is_improvement:
        best_max_featuers_rss = candidate_rss
        best_max_features_model = model
        best_max_features_penalty = l1_penalty
In [170]:
# l1 penalty of the lowest-validation-RSS fit that has exactly `max_nonzeros`
# non-zero terms; stays None if no candidate in the fine search qualified.
best_max_features_penalty
Out[170]:
In [172]:
# Feature names next to the weights of the selected sparse model.
best_max_feature_coefs = pd.DataFrame({
    'features': all_features,
    'coefs': best_max_features_model.coef_,
})
print(best_max_feature_coefs)
In [ ]: