In [119]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
from math import ceil
In [141]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression/datasets/'
sales = pd.read_csv(regressionDir + 'kc_house_data.csv', dtype = dtype_dict)
sales = sales.sort_values(['sqft_living','price'])
# dtype_dict same as above
set_1 = pd.read_csv(regressionDir + 'wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv(regressionDir + 'wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv(regressionDir + 'wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv(regressionDir + 'wk3_kc_house_set_4_data.csv', dtype=dtype_dict)
train_valid_shuffled = pd.read_csv(regressionDir + 'wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv(regressionDir + 'wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv(regressionDir + 'wk3_kc_house_train_data.csv', dtype=dtype_dict)
In [121]:
# Show plots inline in Jupyter
%matplotlib inline
sales.head()
Out[121]:
In [122]:
sales['price'].head()
Out[122]:
In [123]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pd.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature
    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power
            poly_dataframe[name] = feature.apply(lambda x: x**power)
    return poly_dataframe
In [124]:
poly15_data = polynomial_dataframe(sales['sqft_living'], 15) # use equivalent of `polynomial_sframe`
print(poly15_data)
In [125]:
l2_small_penalty = 1.5e-5
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])
Out[125]:
In [126]:
model.coef_
Out[126]:
In [127]:
plt.plot(poly15_data['power_1'], sales['price'], '.',
         poly15_data['power_1'], model.predict(poly15_data), '-')
plt.show()
In [128]:
l2_small_penalty=1e-9
poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])
poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])
poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])
poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])
Out[128]:
In [129]:
plt.plot(poly15_data_set1['power_1'], set_1['price'], '.',
         poly15_data_set1['power_1'], model1.predict(poly15_data_set1), '-')
plt.show()
plt.plot(poly15_data_set2['power_1'], set_2['price'], '.',
         poly15_data_set2['power_1'], model2.predict(poly15_data_set2), '-')
plt.show()
plt.plot(poly15_data_set3['power_1'], set_3['price'], '.',
         poly15_data_set3['power_1'], model3.predict(poly15_data_set3), '-')
plt.show()
plt.plot(poly15_data_set4['power_1'], set_4['price'], '.',
         poly15_data_set4['power_1'], model4.predict(poly15_data_set4), '-')
plt.show()
In [130]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)
In [131]:
l2_large_penalty=1.23e2
poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])
poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])
poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])
poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])
Out[131]:
In [132]:
plt.plot(poly15_data_set1['power_1'], set_1['price'], '.',
         poly15_data_set1['power_1'], model1.predict(poly15_data_set1), '-')
plt.show()
plt.plot(poly15_data_set2['power_1'], set_2['price'], '.',
         poly15_data_set2['power_1'], model2.predict(poly15_data_set2), '-')
plt.show()
plt.plot(poly15_data_set3['power_1'], set_3['price'], '.',
         poly15_data_set3['power_1'], model3.predict(poly15_data_set3), '-')
plt.show()
plt.plot(poly15_data_set4['power_1'], set_4['price'], '.',
         poly15_data_set4['power_1'], model4.predict(poly15_data_set4), '-')
plt.show()
In [133]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)
Just like the polynomial degree, the L2 penalty is a "magic" parameter we need to select. We could use the validation set approach as we did in the last module, but that approach has a major disadvantage: it leaves fewer observations available for training. Cross-validation seeks to overcome this issue by using all of the training set in a smart way.
We will implement a kind of cross-validation called k-fold cross-validation. The method gets its name because it involves dividing the training set into k segments of roughly equal size. As in the validation set method, we measure the validation error with one of the segments designated as the validation set. The major difference is that we repeat the process k times as follows:
Set aside segment 0 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
Set aside segment 1 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
...
Set aside segment k-1 as the validation set, fit a model on the rest of the data, and evaluate it on this validation set.
After this process, we compute the average of the k validation errors, and use it as an estimate of the generalization error. Notice that all observations are used for both training and validation, as we iterate over segments of data.
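Before writing the full loop, it helps to check where the segment boundaries fall. A minimal sketch below prints the row range of each fold; the index arithmetic matches the (commented-out) print in the function that follows, and k = 10 is an assumption carried over from the calls later in this notebook.
In [ ]:
# Sketch: print the (start, end) row range of each validation segment.
# Assumes train_valid_shuffled has been loaded as above and k = 10 folds.
n = len(train_valid_shuffled)
k = 10
for i in range(k):
    start = (n * i) // k
    end = (n * (i + 1)) // k - 1
    print(i, (start, end))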
In [134]:
def k_fold_cross_validation(k, l2_penalty, data, output):
    n = len(data)
    sumRSS = 0
    for i in range(k):
        # Boundaries of the i-th validation segment
        start = (n * i) // k
        end = (n * (i + 1)) // k - 1
        # Slice out the validation segment
        valid_data = data.iloc[start:end+1]
        valid_output = output.iloc[start:end+1]
        # Stitch the remaining rows together as the training set
        train_data = pd.concat([data.iloc[0:start], data.iloc[end+1:n]])
        train_output = pd.concat([output.iloc[0:start], output.iloc[end+1:n]])
        # Train the model on the training folds only
        model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
        model.fit(train_data, train_output)
        # Accumulate the RSS on the held-out validation segment
        RSS = ((valid_output - model.predict(valid_data)) ** 2).sum()
        sumRSS += RSS
    # Average validation error over the k folds
    return sumRSS / k
In [135]:
print(k_fold_cross_validation(10, 1e-9, poly15_data_set2, set_2['price']))
In [136]:
import sys
l2s = np.logspace(3, 9, num=13)
train_valid_shuffled_poly15 = polynomial_dataframe(train_valid_shuffled['sqft_living'], 15)
k = 10
minError = sys.maxsize
for l2 in l2s:
    avgError = k_fold_cross_validation(k, l2, train_valid_shuffled_poly15, train_valid_shuffled['price'])
    print('For l2:', l2, ' the CV error is ', avgError)
    if avgError < minError:
        minError = avgError
        bestl2 = l2
print(minError)
print(bestl2)
In [161]:
model = linear_model.Ridge(alpha=1000, normalize=True)
model.fit(training[['sqft_living']], training['price'])
Out[161]:
In [163]:
print("Residual sum of squares: %.2f"
% ((model.predict(test[['sqft_living']]) - test['price']) ** 2).sum())
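Note that the cell above fits on the raw sqft_living column, while the penalty was selected by cross-validating the degree-15 features. A hedged sketch of the matching refit, assuming `bestl2` from the grid search above is the intended penalty:
In [ ]:
# Sketch: refit the degree-15 model on the full training set with the
# selected penalty, then score it on the test set.
# `bestl2` is assumed to come from the grid search above.
poly15_training = polynomial_dataframe(training['sqft_living'], 15)
final_model = linear_model.Ridge(alpha=bestl2, normalize=True)
final_model.fit(poly15_training, training['price'])
poly15_test = polynomial_dataframe(test['sqft_living'], 15)
print("Test RSS: %.2f"
      % ((final_model.predict(poly15_test) - test['price']) ** 2).sum())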