In [42]:
# import
import graphlab as gl
import matplotlib.pyplot as plt

In [43]:
%matplotlib inline
gl.canvas.set_target('ipynb')

In [44]:
# importing the data
sales = gl.SFrame('data/kc_house_data.gl/')
sales.head(4)


Out[44]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront
7129300520 2014-10-13 00:00:00+00:00 221900.0 3.0 1.0 1180.0 5650 1 0
6414100192 2014-12-09 00:00:00+00:00 538000.0 3.0 2.25 2570.0 7242 2 0
5631500400 2015-02-25 00:00:00+00:00 180000.0 2.0 1.0 770.0 10000 1 0
2487200875 2014-12-09 00:00:00+00:00 604000.0 4.0 3.0 1960.0 5000 1 0
view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat
0 3 7 1180 0 1955 0 98178 47.51123398
0 3 7 2170 400 1951 1991 98125 47.72102274
0 3 6 770 0 1933 0 98028 47.73792661
0 5 7 1050 910 1965 0 98136 47.52082
long sqft_living15 sqft_lot15
-122.25677536 1340.0 5650.0
-122.3188624 1690.0 7639.0
-122.23319601 2720.0 8062.0
-122.39318505 1360.0 5000.0
[4 rows x 21 columns]

In [45]:
train_data,test_data = sales.random_split(.8,seed=0)

In [46]:
def simple_linear_regression(input_feature, output):
    sig_y = output.sum()
    sig_x = input_feature.sum()
    sig_xy = (input_feature*output).sum()
    sig_x2 = (input_feature*input_feature).sum()
    size = len(output)*1.0
    
    slope = (size*sig_xy - sig_y*sig_x )/(size*sig_x2-sig_x*sig_x)
    intercept = sig_y/size - slope*sig_x/size
    return(intercept, slope)

In [47]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = intercept + slope*input_feature
    return(predicted_output)

In [48]:
input_feature = train_data['sqft_living']
output = train_data['price']

Using your Slope and Intercept from above, What is the predicted price for a house with 2650 sqft?


In [49]:
intercept, slope = simple_linear_regression(input_feature, output)
print(get_regression_predictions(2650, intercept, slope))


700074.845629

In [50]:
def get_residual_sum_of_squares(input_feature, output, intercept,slope):
    predict = get_regression_predictions(input_feature, intercept, slope)
    RSS = ((output-predict)**2).sum()
    return(RSS)

According to this function and the slope and intercept What is the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data?


In [51]:
print(get_residual_sum_of_squares(input_feature, output, intercept,slope))


1.20191835632e+15

In [52]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_input=1.0*(output - intercept)/slope
    return(estimated_input)

According to this function and the regression slope and intercept from (3) what is the estimated square-feet for a house costing $800,000


In [53]:
inverse_regression_predictions(800000, intercept, slope)


Out[53]:
3004.3962476159445

In [54]:
print(intercept, slope)


(-47116.07657494012, 281.95883856769746)

Which model (square feet or bedrooms) has lowest RSS on TEST data? Think about why this might be the case.


In [55]:
bedroom_slope, bedroom_intercept = simple_linear_regression(train_data['bedrooms'], train_data['price'])

In [56]:
bedroom_RSS = get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedroom_intercept,bedroom_slope)
print(bedroom_RSS)


4.99664719812e+14

In [58]:
sqft_slope, sqft_intercept = simple_linear_regression(train_data['sqft_living'], train_data['price'])
sqft_RSS = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], intercept, slope)
print(sqft_RSS)


2.75402936247e+14

In [ ]: