Initialise the libraries


In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Load the data


In [44]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int,
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float,
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int,
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression'

house = pd.read_csv(regressionDir + '/datasets/kc_house_data.csv', dtype=dtype_dict)
house_test = pd.read_csv(regressionDir + '/datasets/kc_house_test_data.csv', dtype=dtype_dict)
house_train = pd.read_csv(regressionDir + '/datasets/kc_house_train_data.csv', dtype=dtype_dict)

Explore the data


In [45]:
house.head()


Out[45]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900 3 1.00 1180 5650 1 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000 3 2.25 2570 7242 2 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000 2 1.00 770 10000 1 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000 4 3.00 1960 5000 1 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000 3 2.00 1680 8080 1 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns


In [46]:
# Show plots in jupyter
%matplotlib inline

plt.scatter(house.sqft_living, house.price, alpha=0.5)
plt.xlabel('sqft_living')
plt.ylabel('price')
plt.show()


Custom numpy functions


In [47]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant'] = 1 # add a constant column for the intercept
    # prepend 'constant' to the features list
    features = ['constant'] + features
    # select the feature columns and convert them to a 2D numpy array
    features_matrix = np.array(data_frame[features])
    # convert the output column to a 1D numpy array
    output_array = np.array(data_frame[output])
    return (features_matrix, output_array)
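
To make the contract of get_numpy_data concrete, here is a quick check on a tiny, made-up DataFrame (toy values, not from the dataset): the constant column is prepended and both pieces come back as plain numpy arrays.


In [ ]:
# toy sanity check for get_numpy_data (hypothetical data)
toy = pd.DataFrame({'sqft_living': [1000., 2000.], 'price': [300000., 600000.]})
(toy_matrix, toy_output) = get_numpy_data(toy, ['sqft_living'], 'price')
print (toy_matrix)  # two rows: [1, 1000] and [1, 2000], constant column first
print (toy_output)  # [300000. 600000.]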

In [48]:
def predict_outcome(feature_matrix, weights):
    # predictions are the dot product of the feature matrix and the weight vector
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [49]:
def feature_derivative(errors, feature):
    # derivative of the RSS with respect to one weight: 2 * dot(errors, feature)
    derivative = 2 * np.dot(errors, feature)
    return derivative
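
This is the partial derivative of the residual sum of squares with respect to a single weight: d/dw_i of sum_j (prediction_j - output_j)^2 is 2 * sum_j (error_j * feature_ij), i.e. twice the dot product of the errors with that feature column. A finite-difference check on toy values (my own sketch, not part of the assignment) confirms the formula:


In [ ]:
# finite-difference check of feature_derivative (hypothetical toy values)
X = np.array([[1., 1000.], [1., 2000.]])
y = np.array([300000., 600000.])
w = np.array([0., 250.])

errors = predict_outcome(X, w) - y
analytic = feature_derivative(errors, X[:, 1])

eps = 1e-4
rss = lambda w_: np.sum((predict_outcome(X, w_) - y) ** 2)
w_hi = w.copy(); w_hi[1] += eps
w_lo = w.copy(); w_lo[1] -= eps
numeric = (rss(w_hi) - rss(w_lo)) / (2 * eps)
print (analytic, numeric) # the two estimates should agree closely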

In [50]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights
        predictions = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output
        errors = predictions - output
        
        gradient_sum_squares = 0 # initialize the squared gradient magnitude
        # update each weight individually
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weights[i]
            derivative = feature_derivative(errors, feature_matrix[:, i])
            # accumulate the squared derivative into the gradient magnitude
            gradient_sum_squares = gradient_sum_squares + (derivative ** 2)
            # step in the direction of the negative gradient
            weights[i] = weights[i] - (step_size * derivative)
        # stop once the gradient magnitude falls below the tolerance
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return weights
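
One caveat: the loop stops only when the gradient magnitude drops below the tolerance, so a step size that is too large makes the weights diverge and the loop never exits. A defensive variant (my own addition, not part of the assignment) caps the number of iterations and vectorises the per-weight updates into a single matrix product:


In [ ]:
# capped, vectorised variant of the same algorithm (a sketch, not the assignment's version)
def regression_gradient_descent_capped(feature_matrix, output, initial_weights,
                                       step_size, tolerance, max_iterations=100000):
    weights = np.array(initial_weights)
    for _ in range(max_iterations):
        errors = predict_outcome(feature_matrix, weights) - output
        gradient = 2 * np.dot(feature_matrix.T, errors) # all partial derivatives at once
        weights = weights - step_size * gradient
        if np.sqrt(np.sum(gradient ** 2)) < tolerance:
            break
    return weights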

Running the custom gradient descent function


In [51]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(house_train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12 # tiny because sqft values (and hence gradient components) are huge
tolerance = 2.5e7 # gradient-magnitude threshold for declaring convergence

In [52]:
simple_weights_train = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [53]:
print ('Weights: ', simple_weights_train)


Weights:  [-46999.88716555    281.91211918]
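
As a sanity check (my own addition, not part of the assignment), the descent result can be compared against numpy's closed-form least-squares solution, which it should match closely given the tight tolerance:


In [ ]:
# closed-form least-squares fit on the same training matrix
ls_weights, _, _, _ = np.linalg.lstsq(simple_feature_matrix, output, rcond=None)
print ('Closed-form weights: ', ls_weights)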

In [54]:
(simple_feature_matrix, output) = get_numpy_data(house_test, simple_features, my_output)
simple_weights_test = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [55]:
print ('Weights: ', simple_weights_test)


Weights:  [-46999.87880043    282.3594539 ]
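
The two weight vectors are close but not identical, since one was fitted on the training split and the other on the test split. A quick comparison (my own addition) predicts the price of the first test-set house under each; note that simple_feature_matrix now holds the test data after the reassignment above.


In [ ]:
print (np.dot(simple_feature_matrix[0], simple_weights_train)) # first test house, train-fitted weights
print (np.dot(simple_feature_matrix[0], simple_weights_test))  # first test house, test-fitted weights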

Computing RSS


In [57]:
residuals_test = output - np.dot(simple_feature_matrix, simple_weights_test)
print (residuals_test)


[ -46774.14027534 -135960.51020203 -202834.78736709 ...  -53860.9450254
  -205250.45970658  161094.23582331]
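
The array above holds the residuals (actual price minus predicted price for each test house), not the RSS itself; the RSS is the sum of their squares:


In [ ]:
RSS_test = (residuals_test ** 2).sum()
print (RSS_test)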

In [ ]:
print (np.dot(simple_feature_matrix, simple_weights_test)) # predicted prices on the test set

Regression model with two features


In [ ]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(house_train, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [ ]:
weights_train = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [ ]:
print ('Weights: ', weights_train)

In [ ]:
(feature_matrix, output) = get_numpy_data(house_test, model_features, my_output)
predictions_test = np.dot(feature_matrix, weights_train)
print (predictions_test[0]) # predicted price for the first house in the test set

In [ ]:
print (house_test['price'][0]) # actual price of the first house, for comparison

Computing RSS


In [ ]:
residuals_test = output - np.dot(feature_matrix, weights_train)
RSS_test = (residuals_test ** 2).sum()
print (RSS_test)
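
To judge which model fits the test data better, the same quantity can be computed for the simple model (my own addition; simple_feature_matrix still holds the test features from earlier, and train-fitted weights are used for both models so the comparison is fair). The model with the lower test RSS generalises better.


In [ ]:
simple_residuals = output - np.dot(simple_feature_matrix, simple_weights_train)
print ((simple_residuals ** 2).sum()) # test RSS of the simple one-feature model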
