In [1]:
import graphlab
In [2]:
# Load the home sales data stored under 'home_data.gl/' into an SFrame.
sales = graphlab.SFrame('home_data.gl/')
In [3]:
# Show the loaded SFrame as this cell's output.
sales
Out[3]:
In [4]:
# Point GraphLab Canvas at the 'ipynb' target so later .show() calls render in the notebook.
graphlab.canvas.set_target('ipynb')
In [5]:
# Scatter plot of the 'sqft_living' column (x) against 'price' (y).
sales.show(
    view="Scatter Plot",
    x="sqft_living",
    y="price",
)
In [7]:
# Randomly split the rows into a training set and a test set (0.8 fraction for the
# first), using a fixed seed so the split is the same on every run.
trainData, testData = sales.random_split(0.8, seed=0)
In [8]:
# Fit a linear regression that predicts 'price' from 'sqft_living' alone.
# NOTE(review): no validation_set argument is passed, so the library default applies;
# confirm whether that default holds out a random slice of trainData, which would
# make the fit vary slightly between runs.
sqftPredModel = graphlab.linear_regression.create(
    trainData,
    target='price',
    features=['sqft_living'],
)
In [9]:
# Average sale price over the test split.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(testData['price'].mean())
In [10]:
# Evaluate the sqft-only model on the held-out test split and print the metrics.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(sqftPredModel.evaluate(testData))
In [12]:
import matplotlib.pyplot as plt
In [13]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [14]:
# Compare the sqft-only model against held-out data: dots are the actual test-set
# prices, the line is the model's prediction for the same rows.
plt.plot(testData['sqft_living'], testData['price'], '.', label='actual price')
plt.plot(testData['sqft_living'], sqftPredModel.predict(testData), '-', label='predicted price')
# Label the axes and both series so the figure can be read on its own.
plt.xlabel('sqft_living')
plt.ylabel('price')
plt.legend()
Out[14]:
In [15]:
# Inspect the fitted coefficients of the sqft-only model.
sqftPredModel.get('coefficients')
Out[15]:
In [16]:
# Column names used as inputs for the multi-feature price model.
features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]
In [17]:
# Echo the feature list as the cell output.
features
Out[17]:
In [19]:
# Visualize only the columns named in `features`.
sales[features].show()
In [63]:
# Box-and-whisker view of 'price' grouped by 'zipcode'.
sales.show(
    view='BoxWhisker Plot',
    x='zipcode',
    y='price',
)
In [22]:
# Echo the feature list again before fitting the multi-feature model.
features
Out[22]:
In [23]:
# Fit a linear regression that predicts 'price' from every column in `features`.
multiFeaturesModel = graphlab.linear_regression.create(
    trainData,
    target='price',
    features=features,
)
In [24]:
# Print test-set metrics for both models, one after the other, for comparison:
# first the sqft-only model, then the multi-feature model.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(sqftPredModel.evaluate(testData))
print(multiFeaturesModel.evaluate(testData))
In [25]:
# Select the house whose 'id' equals '5309101200' (the id is compared as a string).
is_h1 = sales['id'] == '5309101200'
h1 = sales[is_h1]
In [26]:
# Show the selected house record.
h1
Out[26]:
In [27]:
# Recorded price of this house, for comparison with the model predictions.
h1['price']
Out[27]:
In [28]:
# Price predicted for h1 by the sqft-only model.
sqftPredModel.predict(h1)
Out[28]:
In [29]:
# Price predicted for h1 by the multi-feature model.
multiFeaturesModel.predict(h1)
Out[29]:
In [30]:
# Select the house whose 'id' equals '1925069082' (the id is compared as a string).
is_h2 = sales['id'] == '1925069082'
h2 = sales[is_h2]
In [31]:
# Show the second selected house record.
h2
Out[31]:
In [34]:
# Price predicted for h2 by the sqft-only model.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(sqftPredModel.predict(h2))
In [35]:
# Price predicted for h2 by the multi-feature model.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(multiFeaturesModel.predict(h2))
In [36]:
# Hand-specified house: each key is a column name mapped to a one-element list,
# so the dict can be wrapped in an SFrame with a single row.
# 'zipcode' is given as a string, matching how zipcodes are compared elsewhere
# in this notebook.
h3 = {
    # size and layout
    'bedrooms': [8],
    'bathrooms': [25],
    'sqft_living': [50000],
    'sqft_lot': [225000],
    'floors': [4],
    'sqft_above': [37500],
    'sqft_basement': [12500],
    # location
    'zipcode': ['98039'],
    'lat': [47.627606],
    'long': [-122.242054],
    # quality and surroundings
    'condition': [10],
    'grade': [10],
    'waterfront': [1],
    'view': [4],
    # age
    'yr_built': [1994],
    'yr_renovated': [2010],
    # neighbouring properties
    'sqft_living15': [5000],
    'sqft_lot15': [40000],
}
In [37]:
# Echo the hand-built house dictionary.
h3
Out[37]:
In [40]:
# Wrap the h3 dict in an SFrame and print the sqft-only model's predicted price.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(sqftPredModel.predict(graphlab.SFrame(h3)))
In [41]:
# Wrap the h3 dict in an SFrame and print the multi-feature model's predicted price.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(multiFeaturesModel.predict(graphlab.SFrame(h3)))
In [64]:
# Show all sales whose 'zipcode' equals '98039' (compared as a string).
sales[sales['zipcode']=='98039']
Out[64]:
In [65]:
# Average sale price among houses in zipcode '98039'.
prices_98039 = sales[sales['zipcode'] == '98039']['price']
prices_98039.mean()
Out[65]:
In [46]:
# Keep houses with living area strictly above 2000 and at most 4000
# (the lower bound is exclusive, the upper bound is inclusive).
living_area = sales['sqft_living']
mediumHouses = sales[(living_area > 2000) & (living_area <= 4000)]
In [47]:
# Show the filtered subset of houses.
mediumHouses
Out[47]:
In [48]:
# Total number of rows in the full dataset.
len(sales)
Out[48]:
In [49]:
# Number of rows that fall in the 2000 < sqft_living <= 4000 range.
len(mediumHouses)
Out[49]:
In [52]:
# Fraction of all houses that are in mediumHouses. The numerator is converted
# to float first so that Python 2 does not perform integer division.
medium_count = len(mediumHouses)
total_count = len(sales)
float(medium_count) / total_count
Out[52]:
In [66]:
# Input columns for the smaller of the two models compared below.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]
In [68]:
# Fit the price model on `my_features`; this rebinds the multiFeaturesModel
# name that was assigned earlier in the notebook.
multiFeaturesModel = graphlab.linear_regression.create(
    trainData,
    target='price',
    features=my_features,
)
In [69]:
# Print test-set metrics for the model trained on my_features.
# Written as print(...) so the cell parses on both Python 2 and Python 3.
print(multiFeaturesModel.evaluate(testData))
In [56]:
# Extended feature set: the six basic columns followed by twelve additional ones.
advanced_features = [
    # basic columns
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
] + [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # latitude of the parcel
    'long',           # longitude of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]
In [57]:
# Echo the extended feature list.
advanced_features
Out[57]:
In [70]:
# Fit a linear regression that predicts 'price' from the extended feature set.
advancedFeaturesModel = graphlab.linear_regression.create(
    trainData,
    target='price',
    features=advanced_features,
)
In [71]:
# Test-set metrics for the model trained on the extended feature set.
advancedFeaturesModel.evaluate(testData)
Out[71]:
In [73]:
# Difference in test-set RMSE: smaller model minus extended model.
# A positive value means the extended model has the lower error.
rmse_basic = multiFeaturesModel.evaluate(testData)['rmse']
rmse_advanced = advancedFeaturesModel.evaluate(testData)['rmse']
rmse_basic - rmse_advanced
Out[73]:
In [ ]: