In [2]:
import graphlab
In [3]:
# Load the sales data from the SFrame stored at 'home_data.gl/'.
sales = graphlab.SFrame('home_data.gl/')
In [4]:
# Bare expression: the notebook shows the SFrame as this cell's output.
sales
Out[4]:
In [5]:
# Point GraphLab Canvas at the 'ipynb' target so .show() output appears in the notebook.
graphlab.canvas.set_target('ipynb')
# Scatter plot of price (y) against sqft_living (x) over the full dataset.
sales.show(view='Scatter Plot', x='sqft_living', y='price')
In [6]:
# Randomly split the data into two SFrames; seed=0 pins the random split so it
# is repeatable. Presumably 0.8 is the fraction of rows sent to train_data —
# confirm against the SFrame.random_split documentation.
train_data, test_data = sales.random_split(0.8, seed=0)
In [7]:
# Single-feature baseline: linear regression of price on sqft_living.
# validation_set=None disables GraphLab Create's default ('auto') behaviour of
# sampling a random validation subset out of the training data, which would make
# the fitted coefficients (and every number derived from them) change from run
# to run. This also matches the later my_features_model / advanced_features_model
# cells, which already pass validation_set=None.
sqft_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=['sqft_living'],
    validation_set=None)
In [8]:
print test_data['price'].mean()
In [9]:
print sqft_model.evaluate(test_data)
In [10]:
import matplotlib.pyplot as plt
%matplotlib inline
In [11]:
# Two series in one plt.plot call (so the cell's output value is unchanged):
#   '.'  test-set price against sqft_living
#   '-'  sqft_model predictions over the same sqft_living values
plt.plot(
    test_data['sqft_living'], test_data['price'], '.',
    test_data['sqft_living'], sqft_model.predict(test_data), '-'
)
Out[11]:
In [12]:
# Fetch the 'coefficients' field of the fitted single-feature model; as the
# cell's last expression it is rendered as the cell output.
sqft_model.get('coefficients')
Out[12]:
In [13]:
# Input columns for the multi-feature price model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]
In [14]:
# Canvas view restricted to the columns listed in my_features.
sales[my_features].show()
In [15]:
# Box-and-whisker plot of price (y) for each zipcode (x).
sales.show(view='BoxWhisker Plot', x='zipcode', y='price')
In [16]:
# Multi-feature model: linear regression of price on the my_features columns.
# validation_set=None disables GraphLab Create's default ('auto') behaviour of
# sampling a random validation subset out of the training data, so the fit uses
# all of train_data and is repeatable. It also keeps this call consistent with
# the later retraining cell, which already passes validation_set=None.
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None)
In [17]:
# Test-set evaluation of both models, printed one after the other for comparison:
# first the single-feature model, then the my_features model.
print(sqft_model.evaluate(test_data))
print(my_features_model.evaluate(test_data))
In [18]:
house1 = sales[sales['id']=='5309101200']
In [19]:
house1
Out[19]:
In [20]:
print house1['price']
In [21]:
print sqft_model.predict(house1)
In [22]:
print my_features_model.predict(house1)
In [23]:
house2 = sales[sales['id']=='1925069082']
In [24]:
house2
Out[24]:
In [25]:
print house2['price']
In [26]:
print sqft_model.predict(house2)
In [27]:
print my_features_model.predict(house2)
In [28]:
# Keep only the rows whose 'zipcode' equals the string '98039'.
houses_98039 = sales[sales['zipcode']=='98039']
In [30]:
# Mean of the 'price' column within that subset (rendered as the cell output).
houses_98039['price'].mean()
Out[30]:
In [76]:
# Houses whose sqft_living is strictly above 2000 and at most 4000,
# selected in one pass with a combined element-wise boolean mask.
houses_2000_4000 = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] <= 4000)]
In [79]:
# a: number of rows in the sqft-filtered subset; b: number of rows in the full dataset.
a = len(houses_2000_4000)
b = len(sales)
print('%d %d' % (a, b))
# Fraction of all houses in the subset; the float() conversion keeps Python 2
# from doing integer division.
print(float(a) / b)
In [46]:
# Restates the same six-column my_features list assigned earlier in the notebook,
# so the cells that follow can be run from here.
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
In [47]:
# Feature list for the larger model, assembled from themed groups.
# The resulting order is: the six basic columns first, then the extras.
advanced_features = (
    # basic columns (same six as my_features)
    ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
    # condition of house, construction-quality grade, waterfront flag, type of view
    + ['condition', 'grade', 'waterfront', 'view']
    # square feet above ground / in basement
    + ['sqft_above', 'sqft_basement']
    # year built / year renovated
    + ['yr_built', 'yr_renovated']
    # lat-long of the parcel
    + ['lat', 'long']
    # average living sq.ft. and lot size of the 15 nearest neighbors
    + ['sqft_living15', 'sqft_lot15']
)
In [63]:
train_data, test_data = sales.random_split(0.8, seed=0)
In [64]:
my_features_model = graphlab.linear_regression.create(train_data, target='price', features=my_features, validation_set=None)
In [65]:
advanced_features_model = graphlab.linear_regression.create(train_data, target='price', features=advanced_features, validation_set=None)
In [66]:
# Test-set evaluation of the basic-feature model, then of the advanced-feature model.
print(my_features_model.evaluate(test_data))
print(advanced_features_model.evaluate(test_data))
In [67]:
# Difference in test-set RMSE between the basic and the advanced feature models
# (basic minus advanced). Computed from the models directly instead of
# subtracting two float literals pasted from an earlier run's printed output,
# which would silently go stale whenever the data, split or models change.
# NOTE(review): the replaced literals were presumably the 'rmse' entries of the
# two evaluate() results — confirm.
print(my_features_model.evaluate(test_data)['rmse']
      - advanced_features_model.evaluate(test_data)['rmse'])
In [ ]: