In [21]:
# import
import graphlab as gl
from matplotlib import pyplot as plt
In [11]:
# Plotting setup: render matplotlib figures inline in the notebook
%matplotlib inline
# GraphLab Canvas is pointed at a browser tab here, not inline
gl.canvas.set_target("browser", port=None) # use "ipynb" for inline visualization
In [12]:
# Load the house data into an SFrame from the "home_data.gl/" directory
# (path is relative to the notebook's working directory)
data = gl.SFrame("home_data.gl/")
In [13]:
# Display the loaded SFrame as the cell output
data
Out[13]:
In [14]:
# Open the SFrame in GraphLab Canvas (target was set to "browser" above)
data.show()
In [15]:
# Switch GraphLab Canvas to the "ipynb" target for the views that follow
gl.canvas.set_target("ipynb")
# Scatter plot view: living area (x) against sale price (y)
data.show(view="Scatter Plot", x="sqft_living", y="price")
In [16]:
# Splitting the data into train and test sets.
# 0.8 is the split fraction (presumably the share going to train_data — confirm
# against the SFrame docs); seed=0 fixes the random split.
train_data, test_data = data.random_split(0.8, seed=0)
In [17]:
# Baseline model: linear regression predicting "price" from "sqft_living" only.
# validation_set=None is passed explicitly, as for the other models in this
# notebook, so no part of train_data is held out for validation and all models
# are fitted on the same rows (makes the later comparisons like-for-like).
clf = gl.linear_regression.create(train_data, target="price", features=["sqft_living"], validation_set=None)
In [18]:
# Evaluate the simple model — start by looking at the first four test rows
test_data[:4]
Out[18]:
In [19]:
# Mean sale price over the test set
test_data['price'].mean()
Out[19]:
In [20]:
# Evaluate the single-feature model on the held-out test set
clf.evaluate(test_data)
Out[20]:
In [25]:
# Plot the test set: actual prices as dots and the model's predicted prices
# as a line, both against living area
living_area = test_data["sqft_living"]
actual_price = test_data["price"]
predicted_price = clf.predict(test_data)
plt.plot(living_area, actual_price, ".",
         living_area, predicted_price, '-')
Out[25]:
In [26]:
# Show the fitted coefficients of the single-feature model
clf.get("coefficients")
Out[26]:
In [43]:
# Feature columns explored next and used for the second model
my_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "zipcode",
]
In [44]:
# Visualize only the columns listed in my_features
data[my_features].show()
In [45]:
# Box-and-whisker view of price grouped by zipcode
data.show(view="BoxWhisker Plot", x="zipcode", y="price")
In [54]:
# Second model: linear regression on "price" using the my_features columns.
# validation_set=None is passed explicitly, so no validation set is supplied.
clf_mine = gl.linear_regression.create(train_data, target="price", features=my_features, validation_set=None)
In [55]:
# evaluate the new model and old model
print "Model 1 :", clf.evaluate(test_data)
print "Model 2 :",clf_mine.evaluate(test_data)
In [56]:
# Pick one house by its id (ids are compared as strings here) to predict its price
is_house1 = data['id'] == '5309101200'
house1 = data[is_house1]
house1
Out[56]:
In [57]:
print clf_mine.predict(house1)
print clf.predict(house1)
In [40]:
# Extended feature set: the six basic columns plus structural, age and
# location attributes
advanced_features = [
    # basic features
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    # house condition and construction quality
    'condition',
    'grade',
    # waterfront property flag and type of view
    'waterfront',
    'view',
    # square feet above ground / in the basement
    'sqft_above',
    'sqft_basement',
    # year built / year renovated
    'yr_built',
    'yr_renovated',
    # lat-long of the parcel
    'lat',
    'long',
    # averages over the 15 nearest neighbors: living sq.ft. and lot size
    'sqft_living15',
    'sqft_lot15',
]
In [58]:
# Third model: linear regression on "price" using the advanced_features columns,
# again with validation_set=None
clf_adv = gl.linear_regression.create(train_data, target="price", features=advanced_features, validation_set=None)
In [59]:
# evaluate the new model and old model
print "Model 2 :", clf_mine.evaluate(test_data)
print "Model 3 :",clf_adv.evaluate(test_data)
In [63]:
# Selection and summary statistics: we found the zip code with the highest
# average house price. What is the average house price of that zip code?
in_top_zipcode = data["zipcode"] == "98039"
top_zipcode_prices = data[in_top_zipcode]["price"]
top_zipcode_prices.mean()
Out[63]:
In [72]:
# Filtering data: what fraction of the houses have living space between
# 2000 sq.ft. and 4000 sq.ft. (both bounds inclusive)?
# Step 1: keep houses with at least 2000 sq.ft.
at_least_2000 = data["sqft_living"] >= 2000
data1 = data[at_least_2000]
# Step 2: of those, keep houses with at most 4000 sq.ft.
at_most_4000 = data1["sqft_living"] <= 4000
data2 = data1[at_most_4000]
Out[72]:
In [74]:
# Total number of houses in the full dataset
data.num_rows()
Out[74]:
In [75]:
# Number of houses left after the 2000-4000 sq.ft. filter
data2.num_rows()
Out[75]:
In [76]:
# Fraction of houses in the 2000-4000 sq.ft. range, computed from the actual
# row counts rather than hand-copied numbers so it stays correct if the data
# or the filter changes. float() forces true division under Python 2.
float(data2.num_rows()) / data.num_rows()
Out[76]:
In [ ]: