Import the libraries


In [170]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np

Load the data


In [171]:
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
    'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
    'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
    'floors': float, 'condition': int, 'lat': float, 'date': str,
    'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int
}

regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression/datasets/'

sales = pd.read_csv(regressionDir + 'kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv(regressionDir + 'kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv(regressionDir + 'kc_house_test_data.csv', dtype=dtype_dict)
# sales = sales.sort_values(['sqft_living', 'price'])

Data exploration


In [172]:
sales.head()


Out[172]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900 3 1.00 1180 5650 1 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000 3 2.25 2570 7242 2 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000 2 1.00 770 10000 1 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000 4 3.00 1960 5000 1 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000 3 2.00 1680 8080 1 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

Helper functions


In [173]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant'] = 1 # add a constant column to the DataFrame
    # prepend the 'constant' column to the features list
    features = ['constant'] + features
    # select the columns of the DataFrame given by the 'features' list
    features_frame = data_frame[features]
    # convert the selected columns into a numpy matrix
    # (note: np.matrix keeps every result 2-D, which is why some of the
    # outputs below print as [[ ... ]])
    features_matrix = np.matrix(features_frame)
    # select the target column and convert it into a numpy array
    output_array = np.array(data_frame[output])
    return (features_matrix, output_array)

In [174]:
def predict_output(feature_matrix, weights):
    # predictions are the matrix-vector product of the features and the weights
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [175]:
def feature_derivative_ridge(errors, feature, weight, l2_penalty, feature_is_constant):
    # derivative of the ridge cost with respect to one weight:
    # 2*SUM[ errors*feature_i ] + 2*l2_penalty*w[i]
    # The constant (intercept) feature is not regularised, so its derivative
    # omits the penalty term.
    if feature_is_constant:
        derivative = 2 * np.dot(errors, feature)
    else:
        derivative = 2 * np.dot(errors, feature) + 2 * l2_penalty * weight
    return derivative
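
Beyond the print-comparison cell below, the analytic derivative can also be checked against a numerical finite difference on a toy problem. This is an added sketch, not part of the original assignment; X_toy, y_toy and ridge_cost are hypothetical names used only for illustration.


In [ ]:
# Added sketch: verify the analytic ridge derivative with a central finite
# difference on a tiny toy problem (X_toy, y_toy, ridge_cost are made up here).
X_toy = np.array([[1., 2.], [1., 3.], [1., 5.]])  # constant column + one feature
y_toy = np.array([1., 2., 4.])
w_toy = np.array([0.5, 0.7])
lam = 3.0

def ridge_cost(w):
    resid = X_toy.dot(w) - y_toy
    return np.sum(resid ** 2) + lam * w[1] ** 2  # the intercept is not penalised

eps = 1e-6
bump = np.array([0., eps])
numeric = (ridge_cost(w_toy + bump) - ridge_cost(w_toy - bump)) / (2 * eps)
analytic = feature_derivative_ridge(X_toy.dot(w_toy) - y_toy, X_toy[:, 1],
                                    w_toy[1], lam, False)
print(numeric, analytic)  # the two values should agree to ~1e-6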

In [176]:
# Testing the above functions
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')
my_weights = np.array([1., 10.])
test_predictions = predict_output(example_features, my_weights)
errors = test_predictions - example_output # prediction errors

# next two lines should print the same values
print (feature_derivative_ridge(errors, example_features[:,1], my_weights[1], 1, False))
print (np.sum(errors*example_features[:,1])*2+20.)
print ('')

# next two lines should print the same values
print (feature_derivative_ridge(errors, example_features[:,0], my_weights[0], 1, True))
print (np.sum(errors)*2.)


[[ -5.65541668e+13]]
-5.6554166816e+13

[[ -2.24467493e+10]]
-22446749330.0

In [177]:
def ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations=100):
    weights = np.array(initial_weights) # make sure it's a numpy array
    for _ in range(max_iterations):
        # compute the predictions using the predict_output() function
        predictions = predict_output(feature_matrix, weights)
        # compute the errors as predictions - output
        errors = predictions - output
        for i in range(len(weights)): # loop over each weight
            # feature_matrix[:,i] is the feature column associated with weights[i];
            # when i == 0 we are differentiating with respect to the constant,
            # which is not regularised
            derivative = feature_derivative_ridge(errors, feature_matrix[:, i],
                                                  weights[i], l2_penalty, i == 0)
            # subtract the step size times the derivative from the current weight
            weights[i] = weights[i] - step_size * derivative
    return weights
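
As an optional cross-check (an addition, not part of the original assignment), ridge weights also have a closed form: w = (XᵀX + λI)⁻¹Xᵀy, with the diagonal entry of I for the constant feature zeroed so the intercept escapes the penalty. ridge_closed_form below is a hypothetical helper; once simple_feature_matrix and output are built below, ridge_closed_form(simple_feature_matrix, output, l2_penalty) gives the solution the iterative solver only approaches with these tiny step sizes and finite iteration counts.


In [ ]:
# Added sketch: closed-form ridge solution for comparison with gradient descent.
def ridge_closed_form(feature_matrix, output, l2_penalty):
    X = np.asarray(feature_matrix)
    y = np.asarray(output).ravel()
    penalty_matrix = l2_penalty * np.eye(X.shape[1])
    penalty_matrix[0, 0] = 0.0  # do not penalise the constant feature
    return np.linalg.solve(X.T.dot(X) + penalty_matrix, X.T.dot(y))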

Testing the ridge regression gradient descent


In [178]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
(simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [206]:
step_size = 1e-12
max_iterations = 1000
initial_weights = [0., 0.]
l2_penalty = 0.0
simple_weights_0_penalty = ridge_regression_gradient_descent(simple_feature_matrix,
                                                    output, initial_weights, step_size, l2_penalty, max_iterations)

In [180]:
print (simple_weights_0_penalty)


[ -1.63113515e-01   2.63024369e+02]

In [207]:
l2_penalty = 1e11 
simple_weights_high_penalty = ridge_regression_gradient_descent(simple_feature_matrix,
                                            output, initial_weights, step_size, l2_penalty, max_iterations)
print (simple_weights_high_penalty)


[   9.76730382  124.57217567]
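
Note how the L2 penalty shrinks the slope: roughly 263 dollars per square foot without regularisation versus roughly 125 with l2_penalty = 1e11.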

In [208]:
%matplotlib inline
# predict_output returns np.matrix objects here, so flatten everything to
# 1-D arrays before plotting; passing the raw matrices raises a ValueError
# ("x and y must have same first dimension").
sqft = np.asarray(simple_feature_matrix[:, 1]).ravel()
plt.plot(sqft, output, 'k.',
         sqft, np.asarray(predict_output(simple_feature_matrix, simple_weights_0_penalty)).ravel(), 'b-',
         sqft, np.asarray(predict_output(simple_feature_matrix, simple_weights_high_penalty)).ravel(), 'r-')


[plot: price versus sqft_living with the unregularised fit (blue) and the high-penalty fit (red)]
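
In the resulting plot the high-penalty fit (red) is visibly flatter than the unregularised fit (blue), which is exactly the coefficient shrinkage seen above.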

Compute the RSS on the test data


In [209]:
RSS_Low_p = np.square(predict_output(simple_test_feature_matrix, simple_weights_0_penalty) - test_output).sum()
RSS_High_p = np.square(predict_output(simple_test_feature_matrix, simple_weights_high_penalty) - test_output).sum()
RSS_0_weights = np.square(predict_output(simple_test_feature_matrix, initial_weights) - test_output).sum()

print (RSS_Low_p)
print (RSS_High_p)
print (RSS_0_weights)


2.75723632154e+14
6.9464210149e+14
1.78427328614e+15
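
As expected on this split, the unregularised model has the lowest test RSS (about 2.76e14), the high-penalty model is worse (about 6.95e14), and the all-zero weights are worst (about 1.78e15).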

Training a model with multiple features


In [218]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

In [219]:
step_size = 1e-12
max_iterations = 1000
initial_weights = [0., 0., 0.]
l2_penalty = 0.0
multiple_weights_0_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights,
                                                               step_size, l2_penalty, max_iterations)

In [220]:
print (multiple_weights_0_penalty)


[  -0.35743483  243.05416982   22.41481497]

In [221]:
l2_penalty = 1e11
multiple_weights_high_penalty = ridge_regression_gradient_descent(feature_matrix, output,
                                                                  initial_weights, step_size, l2_penalty, max_iterations)
print(multiple_weights_high_penalty)


[  6.74296579  91.48927365  78.43658766]

Calculate the RSS


In [224]:
RSS_Low_p = np.square(predict_output(test_feature_matrix, multiple_weights_0_penalty) - test_output).sum()
RSS_High_p = np.square(predict_output(test_feature_matrix, multiple_weights_high_penalty) - test_output).sum()
RSS_0_weights = np.square(predict_output(test_feature_matrix, initial_weights) - test_output).sum()

print (RSS_Low_p)
print (RSS_High_p)
print (RSS_0_weights)


2.74067615919e+14
5.00404800501e+14
1.78427328614e+15
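
Adding sqft_living15 barely changes the unregularised test RSS (2.74e14 versus 2.76e14 for the single-feature model), while the high-penalty RSS improves more noticeably (5.00e14 versus 6.95e14).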

In [244]:
print(predict_output(test_feature_matrix, multiple_weights_0_penalty)-test_output)


[[  77465.47605824  114977.14757579  205709.53237955 ...,   58296.48424192
   202473.88624925 -131322.99294662]]

In [246]:
print(predict_output(test_feature_matrix, multiple_weights_high_penalty)-test_output)

# Looking at the absolute values, the first house is predicted better by the
# high-penalty model (an error of about 40k versus about 77k for the low-penalty model)


[[ -39546.46967806 -212245.60217562    4243.08619562 ..., -182465.08653181
   -45114.07948542 -228769.87849753]]
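
To make the first-house comparison explicit, the sketch below (an addition, not part of the original assignment) flattens the matrix-shaped error rows and prints the first entries.


In [ ]:
# Added sketch: compare the first test house's prediction error under both
# models. ravel() flattens the (1, N) matrix rows returned above.
err_low = np.asarray(predict_output(test_feature_matrix, multiple_weights_0_penalty) - test_output).ravel()
err_high = np.asarray(predict_output(test_feature_matrix, multiple_weights_high_penalty) - test_output).ravel()
print(err_low[0], err_high[0])  # roughly 77465 vs -39546 given the runs above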
