Predicting House Prices



In [21]:

    
# import
import graphlab as gl
from matplotlib import pyplot as plt



In [11]:

    
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# GraphLab Canvas output goes to a browser tab at this point.
gl.canvas.set_target("browser", port=None)  # use "ipynb" for inline visualization



In [12]:

    
# Load the house data from the saved SFrame at "home_data.gl/"
# (path is relative to the notebook's working directory).
data = gl.SFrame("home_data.gl/")



In [13]:

    
# Preview the SFrame; only the head is printed.
data









    Out[13]:





    
        id
        date
        price
        bedrooms
        bathrooms
        sqft_living
        sqft_lot
        floors
        waterfront
    
    
        7129300520
        2014-10-13 00:00:00+00:00
        221900
        3
        1
        1180
        5650
        1
        0
    
    
        6414100192
        2014-12-09 00:00:00+00:00
        538000
        3
        2.25
        2570
        7242
        2
        0
    
    
        5631500400
        2015-02-25 00:00:00+00:00
        180000
        2
        1
        770
        10000
        1
        0
    
    
        2487200875
        2014-12-09 00:00:00+00:00
        604000
        4
        3
        1960
        5000
        1
        0
    
    
        1954400510
        2015-02-18 00:00:00+00:00
        510000
        3
        2
        1680
        8080
        1
        0
    
    
        7237550310
        2014-05-12 00:00:00+00:00
        1225000
        4
        4.5
        5420
        101930
        1
        0
    
    
        1321400060
        2014-06-27 00:00:00+00:00
        257500
        3
        2.25
        1715
        6819
        2
        0
    
    
        2008000270
        2015-01-15 00:00:00+00:00
        291850
        3
        1.5
        1060
        9711
        1
        0
    
    
        2414600126
        2015-04-15 00:00:00+00:00
        229500
        3
        1
        1780
        7470
        1
        0
    
    
        3793500160
        2015-03-12 00:00:00+00:00
        323000
        3
        2.5
        1890
        6560
        2
        0
    


    
        view
        condition
        grade
        sqft_above
        sqft_basement
        yr_built
        yr_renovated
        zipcode
        lat
    
    
        0
        3
        7
        1180
        0
        1955
        0
        98178
        47.51123398
    
    
        0
        3
        7
        2170
        400
        1951
        1991
        98125
        47.72102274
    
    
        0
        3
        6
        770
        0
        1933
        0
        98028
        47.73792661
    
    
        0
        5
        7
        1050
        910
        1965
        0
        98136
        47.52082
    
    
        0
        3
        8
        1680
        0
        1987
        0
        98074
        47.61681228
    
    
        0
        3
        11
        3890
        1530
        2001
        0
        98053
        47.65611835
    
    
        0
        3
        7
        1715
        0
        1995
        0
        98003
        47.30972002
    
    
        0
        3
        7
        1060
        0
        1963
        0
        98198
        47.40949984
    
    
        0
        3
        7
        1050
        730
        1960
        0
        98146
        47.51229381
    
    
        0
        3
        7
        1890
        0
        2003
        0
        98038
        47.36840673
    


    
        long
        sqft_living15
        sqft_lot15
    
    
        -122.25677536
        1340.0
        5650.0
    
    
        -122.3188624
        1690.0
        7639.0
    
    
        -122.23319601
        2720.0
        8062.0
    
    
        -122.39318505
        1360.0
        5000.0
    
    
        -122.04490059
        1800.0
        7503.0
    
    
        -122.00528655
        4760.0
        101930.0
    
    
        -122.32704857
        2238.0
        6819.0
    
    
        -122.31457273
        1650.0
        9711.0
    
    
        -122.33659507
        1780.0
        8113.0
    
    
        -122.0308176
        2390.0
        7570.0
    

[21613 rows x 21 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [14]:

    
# Open the whole SFrame in GraphLab Canvas (served to the web browser at this point).
data.show()









    



Canvas is accessible via web browser at the URL: http://localhost:21695/index.html
Opening Canvas in default web browser.



In [15]:

    
# Switch Canvas to the "ipynb" target so visualizations render inside the notebook.
gl.canvas.set_target("ipynb")
# Scatter plot view: price (y) against living area in square feet (x).
data.show(view="Scatter Plot", x="sqft_living", y="price")



In [16]:

    
# Split the data into train and test sets (split fraction 0.8).
# The fixed seed keeps the split reproducible across runs.
train_data, test_data = data.random_split(0.8, seed=0)



In [17]:

    
# Create a linear regression model that predicts price from a single feature, sqft_living.
# validation_set is not passed, so (as the progress log reports) GraphLab holds out
# 5 percent of the training data for validation tracking.
clf = gl.linear_regression.create(train_data, target="price", features=["sqft_living"])









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.







    




Linear regression:






    




--------------------------------------------------------






    




Number of examples          : 16474






    




Number of features          : 1






    




Number of unpacked features : 1






    




Number of coefficients    : 2






    




Starting Newton Method






    




--------------------------------------------------------






    




+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+






    




| Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |






    




+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+






    




| 1         | 2        | 1.098370     | 4336865.368528     | 1673632.160487       | 263345.228506 | 255619.740349   |






    




+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+






    




SUCCESS: Optimal solution found.



In [18]:

    
# Evaluate the simple model: start by previewing the first four rows of the test set.
test_data[:4]









    Out[18]:





    
        id
        date
        price
        bedrooms
        bathrooms
        sqft_living
        sqft_lot
        floors
        waterfront
    
    
        0114101516
        2014-05-28 00:00:00+00:00
        310000
        3
        1
        1430
        19901
        1.5
        0
    
    
        9297300055
        2015-01-24 00:00:00+00:00
        650000
        4
        3
        2950
        5000
        2
        0
    
    
        1202000200
        2014-11-03 00:00:00+00:00
        233000
        3
        2
        1710
        4697
        1.5
        0
    
    
        8562750320
        2014-11-10 00:00:00+00:00
        580500
        3
        2.5
        2320
        3980
        2
        0
    


    
        view
        condition
        grade
        sqft_above
        sqft_basement
        yr_built
        yr_renovated
        zipcode
        lat
    
    
        0
        4
        7
        1430
        0
        1927
        0
        98028
        47.75584254
    
    
        3
        3
        9
        1980
        970
        1979
        0
        98126
        47.57136955
    
    
        0
        5
        6
        1710
        0
        1941
        0
        98002
        47.30482931
    
    
        0
        3
        8
        2320
        0
        2003
        0
        98027
        47.5391103
    


    
        long
        sqft_living15
        sqft_lot15
    
    
        -122.22874498
        1780.0
        12697.0
    
    
        -122.37541218
        2140.0
        4000.0
    
    
        -122.21774909
        1030.0
        4705.0
    
    
        -122.06971484
        2580.0
        3980.0
    

[4 rows x 21 columns]



In [19]:

    
# Average price over the test rows.
test_data['price'].mean()









    Out[19]:





543054.0425632534



In [20]:

    
# Evaluate the single-feature model on the test data; the result reports max_error and rmse.
clf.evaluate(test_data)









    Out[20]:





{'max_error': 4133533.0970407226, 'rmse': 255214.18419781374}



In [25]:

    
# Plot the actual test prices as points and the model's predicted prices
# as a line, both against living area.
living_area = test_data["sqft_living"]
predicted_price = clf.predict(test_data)
plt.plot(living_area, test_data["price"], ".",
         living_area, predicted_price, '-')









    Out[25]:





[<matplotlib.lines.Line2D at 0x242175c0>,
 <matplotlib.lines.Line2D at 0x24217668>]



In [26]:

    
# Fitted coefficients of the simple model: the intercept and the sqft_living weight.
clf.get("coefficients")









    Out[26]:





    
        name
        index
        value
        stderr
    
    
        (intercept)
        None
        -49178.3910177
        5063.19949956
    
    
        sqft_living
        None
        283.179503941
        2.22580563108
    

[2 rows x 4 columns]



In [43]:

    
# Exploring some more features: the columns used by the second model
my_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "zipcode",
]



In [44]:

    
# Visualize only the columns listed in my_features.
data[my_features].show()



In [45]:

    
# Box-and-whisker view of price (y) grouped by zipcode (x).
data.show(view="BoxWhisker Plot", x="zipcode", y="price")



In [54]:

    
# Second model: linear regression of price on the my_features columns.
# validation_set=None disables validation tracking.
clf_mine = gl.linear_regression.create(
    train_data,
    target="price",
    features=my_features,
    validation_set=None,
)









    




Linear regression:






    




--------------------------------------------------------






    




Number of examples          : 17384






    




Number of features          : 6






    




Number of unpacked features : 6






    




Number of coefficients    : 115






    




Starting Newton Method






    




--------------------------------------------------------






    




+-----------+----------+--------------+--------------------+---------------+






    




| Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |






    




+-----------+----------+--------------+--------------------+---------------+






    




| 1         | 2        | 0.025016     | 3763208.270523     | 181908.848367 |






    




+-----------+----------+--------------+--------------------+---------------+






    




SUCCESS: Optimal solution found.



In [55]:

    
# evaluate the new model and old model
print "Model 1 :", clf.evaluate(test_data)
print "Model 2 :",clf_mine.evaluate(test_data)









    



Model 1 : {'max_error': 4133533.0970407226, 'rmse': 255214.18419781374}
Model 2 : {'max_error': 3486584.509381705, 'rmse': 179542.4333126903}



In [56]:

    
# Predicting the price of one house: select it by id first
# (the id is compared as a string).
house1_mask = data['id'] == '5309101200'
house1 = data[house1_mask]
house1









    Out[56]:





    
        id
        date
        price
        bedrooms
        bathrooms
        sqft_living
        sqft_lot
        floors
        waterfront
    
    
        5309101200
        2014-06-05 00:00:00+00:00
        620000
        4
        2.25
        2400
        5350
        1.5
        0
    


    
        view
        condition
        grade
        sqft_above
        sqft_basement
        yr_built
        yr_renovated
        zipcode
        lat
    
    
        0
        4
        7
        1460
        940
        1929
        0
        98117
        47.67632376
    


    
        long
        sqft_living15
        sqft_lot15
    
    
        -122.37010126
        1250.0
        4880.0
    

[? rows x 21 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.



In [57]:

    
print clf_mine.predict(house1)
print clf.predict(house1)









    



[721918.9333272863]
[630452.4184407898]

Quiz



In [40]:

    
# Feature columns for the third model: six basic columns followed by
# further attributes of the house and its surroundings.
advanced_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "zipcode",
    "condition",      # condition of house
    "grade",          # measure of quality of construction
    "waterfront",     # waterfront property
    "view",           # type of view
    "sqft_above",     # square feet above ground
    "sqft_basement",  # square feet in basement
    "yr_built",       # the year built
    "yr_renovated",   # the year renovated
    "lat",            # the lat-long of the parcel
    "long",
    "sqft_living15",  # average sq.ft. of 15 nearest neighbors
    "sqft_lot15",     # average lot size of 15 nearest neighbors
]



In [58]:

    
# Third model: linear regression of price on the advanced_features columns,
# with validation tracking disabled.
clf_adv = gl.linear_regression.create(
    train_data,
    target="price",
    features=advanced_features,
    validation_set=None,
)









    




Linear regression:






    




--------------------------------------------------------






    




Number of examples          : 17384






    




Number of features          : 18






    




Number of unpacked features : 18






    




Number of coefficients    : 127






    




Starting Newton Method






    




--------------------------------------------------------






    




+-----------+----------+--------------+--------------------+---------------+






    




| Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |






    




+-----------+----------+--------------+--------------------+---------------+






    




| 1         | 2        | 0.056044     | 3469012.450686     | 154580.940736 |






    




+-----------+----------+--------------+--------------------+---------------+






    




SUCCESS: Optimal solution found.



In [59]:

    
# evaluate the new model and old model
print "Model 2 :", clf_mine.evaluate(test_data)
print "Model 3 :",clf_adv.evaluate(test_data)









    



Model 2 : {'max_error': 3486584.509381705, 'rmse': 179542.4333126903}
Model 3 : {'max_error': 3556849.413858208, 'rmse': 156831.1168021901}



In [63]:

    
# Selection and summary statistics: We found the zip code with the highest average house price.
# What is the average house price of that zip code?

# zipcode is compared as a string; "98039" is the zip code referred to above.
in_top_zip = data["zipcode"] == "98039"
top_zip_prices = data[in_top_zip]["price"]
top_zip_prices.mean()









    Out[63]:





2160606.5999999996



In [72]:

    
# Filtering data: What fraction of the houses have living space between 2000 sq.ft. and 4000 sq.ft.?
# Filter in two steps: lower bound first, then upper bound (both bounds inclusive).
data1 = data[data["sqft_living"]>=2000]
data2 = data1[data1["sqft_living"]<=4000]
# End on the number of matching rows so the cell actually displays a value;
# a cell ending in an assignment produces no output.
data2.num_rows()









    Out[72]:





9221



In [74]:

    
# Total number of rows in the full dataset.
data.num_rows()









    Out[74]:





21613



In [75]:

    
# Number of rows left after the sqft_living range filter.
data2.num_rows()









    Out[75]:





9221



In [76]:

    
# Fraction of houses whose living space falls in the filtered range, computed
# from the frames themselves rather than from hand-copied row counts, so it
# stays correct if the data or the filter changes.
# float() forces true division; the print statements in this notebook indicate
# Python 2, where int / int would truncate.
float(data2.num_rows()) / data.num_rows()









    Out[76]:





0.4266413732475825



In [ ]:

id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors
7129300520	2014-10-13 00:00:00+00:00	221900	3	1	1180	5650	1
6414100192	2014-12-09 00:00:00+00:00	538000	3	2.25	2570	7242	2
5631500400	2015-02-25 00:00:00+00:00	180000	2	1	770	10000	1
2487200875	2014-12-09 00:00:00+00:00	604000	4	3	1960	5000	1
1954400510	2015-02-18 00:00:00+00:00	510000	3	2	1680	8080	1
7237550310	2014-05-12 00:00:00+00:00	1225000	4	4.5	5420	101930	1
1321400060	2014-06-27 00:00:00+00:00	257500	3	2.25	1715	6819	2
2008000270	2015-01-15 00:00:00+00:00	291850	3	1.5	1060	9711	1
2414600126	2015-04-15 00:00:00+00:00	229500	3	1	1780	7470	1
3793500160	2015-03-12 00:00:00+00:00	323000	3	2.5	1890	6560	2

condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat
3	7	1180	0	1955	0	98178	47.51123398
3	7	2170	400	1951	1991	98125	47.72102274
3	6	770	0	1933	0	98028	47.73792661
5	7	1050	910	1965	0	98136	47.52082
3	8	1680	0	1987	0	98074	47.61681228
3	11	3890	1530	2001	0	98053	47.65611835
3	7	1715	0	1995	0	98003	47.30972002
3	7	1060	0	1963	0	98198	47.40949984
3	7	1050	730	1960	0	98146	47.51229381
3	7	1890	0	2003	0	98038	47.36840673

long	sqft_living15	sqft_lot15
-122.25677536	1340.0	5650.0
-122.3188624	1690.0	7639.0
-122.23319601	2720.0	8062.0
-122.39318505	1360.0	5000.0
-122.04490059	1800.0	7503.0
-122.00528655	4760.0	101930.0
-122.32704857	2238.0	6819.0
-122.31457273	1650.0	9711.0
-122.33659507	1780.0	8113.0
-122.0308176	2390.0	7570.0

id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors
0114101516	2014-05-28 00:00:00+00:00	310000	3	1	1430	19901	1.5
9297300055	2015-01-24 00:00:00+00:00	650000	4	3	2950	5000	2
1202000200	2014-11-03 00:00:00+00:00	233000	3	2	1710	4697	1.5
8562750320	2014-11-10 00:00:00+00:00	580500	3	2.5	2320	3980	2

view	condition	grade	sqft_above	sqft_basement	yr_built	zipcode	lat
0	4	7	1430	0	1927	98028	47.75584254
3	3	9	1980	970	1979	98126	47.57136955
0	5	6	1710	0	1941	98002	47.30482931
0	3	8	2320	0	2003	98027	47.5391103

long	sqft_living15	sqft_lot15
-122.22874498	1780.0	12697.0
-122.37541218	2140.0	4000.0
-122.21774909	1030.0	4705.0
-122.06971484	2580.0	3980.0

name	index	value	stderr
(intercept)	None	-49178.3910177	5063.19949956
sqft_living	None	283.179503941	2.22580563108