In [2]:
import graphlab
In [5]:
# Load the SFrame stored in the 'home_data.gl/' directory (path is relative to
# the notebook's working directory); every later cell reads from `sales`.
sales = graphlab.SFrame('home_data.gl/')
In [31]:
# Keep only the rows whose 'zipcode' equals '98039'.
# NOTE(review): the comparison is against a string, so this assumes the
# 'zipcode' column holds strings rather than integers -- confirm.
houses_1 = sales[sales['zipcode'] == '98039']
In [32]:
# Mean of the 'price' column over the rows kept in houses_1.
houses_1['price'].mean()
Out[32]:
In [33]:
# Rows whose 'sqft_living' lies strictly between 2000 and 4000
# (both bounds are excluded by the > and < comparisons).
houses_2 = sales[
    (sales['sqft_living'] > 2000)
    & (sales['sqft_living'] < 4000)
]
In [36]:
# Fraction of all rows in `sales` that are also in houses_2.
# float() keeps this a true division even where `/` on two ints truncates.
float(len(houses_2)) / len(sales)
Out[36]:
The RMSE is about \$255,170!
The RMSE goes down from \$255,170 to \$179,508 with more features.
In [8]:
# Randomly split `sales` into two SFrames. The fixed seed makes the split
# repeatable; 0.8 is the fraction argument (presumably ~80% of rows go to
# train_data and the rest to test_data -- confirm against random_split docs).
train_data, test_data = sales.random_split(0.8, seed=0)
In [9]:
# Column names passed as `features` when fitting advanced_features_model.
advanced_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    'condition',
    'grade',
    'waterfront',
    'view',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]
In [10]:
# Fit a linear regression on train_data only, with 'price' as the target and
# the columns listed in advanced_features as predictors.
advanced_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
)
In [11]:
# Evaluate the fitted model on the held-out test_data and print the result.
# The parenthesised form prints the same thing as the Python 2 print statement
# for a single argument and is also valid Python 3 syntax.
print(advanced_features_model.evaluate(test_data))