notebook.community

Edit and run



In [1]:

    
import graphlab



In [2]:

    
sales = graphlab.SFrame('../home_data.gl/')









    



[INFO] This non-commercial license of GraphLab Create is assigned to rajasekar87.ra@gmail.comand will expire on September 23, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-5351 - Server binary: /hdd/anaconda/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443387437.log
[INFO] GraphLab Server Version: 1.6.1



In [ ]:

    
sales



In [3]:

    
graphlab.canvas.set_target('ipynb')
sales.show(view="Scatter Plot", x="sqft_living", y="price")



In [4]:

    
train_data, test_data = sales.random_split(.8, seed = 0)



In [ ]:

    
sqft_model = graphlab.linear_regression.create(train_data, target="price", features=['sqft_living'])



In [ ]:

    
print test_data['price'].mean()



In [ ]:

    
print sqft_model.evaluate(test_data)



In [ ]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [ ]:

    
plt.plot(test_data['sqft_living'], test_data['price'], '.', test_data['sqft_living'], sqft_model.predict(test_data),'-')



In [5]:

    
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']



In [ ]:

    
sales[my_features].show()



In [ ]:

    
sales.show(view = 'BoxWhisker Plot', x='zipcode', y='price')



In [6]:

    
my_features_model = graphlab.linear_regression.create(train_data, target='price', features = my_features)









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16501
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 1.030242     | 3735785.483509     | 1213984.588209       | 183227.304384 | 156277.311261   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+



In [ ]:

    
plt.plot(test_data['sqft_living'],test_data['price'],'.',
        test_data['sqft_living'],my_features_model.predict(test_data),'-')



In [ ]:

    
print my_features



In [ ]:

    
my_model = graphlab.regression.create(train_data, target='price', features=my_features)



In [ ]:

    
print my_model.evaluate(test_data)



In [ ]:

    
sales



In [ ]:

    
sales.show()



In [ ]:

    
cost_zip_code = ['price', 'zipcode']



In [ ]:

    
sales[cost_zip_code].show()



In [ ]:

    
sales.show()



In [ ]:

    
price_avg = sales.groupby(['zipcode'], {'price_avg' : graphlab.aggregate.AVG('price')})



In [ ]:

    
price_avg.sort('price_avg', ascending = False)



In [ ]:

    
sales



In [ ]:

    
len(sales[(sales['sqft_living'] > 2000 ) & (sales['sqft_living'] < 4000)])



In [ ]:

    
len(sales)



In [7]:

    
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house				
'grade', # measure of quality of construction				
'waterfront', # waterfront property				
'view', # type of view				
'sqft_above', # square feet above ground				
'sqft_basement', # square feet in basement				
'yr_built', # the year built				
'yr_renovated', # the year renovated				
'lat', 'long', # the lat-long of the parcel				
'sqft_living15', # average sq.ft. of 15 nearest neighbors 				
'sqft_lot15', # average lot size of 15 nearest neighbors 
]



In [ ]:

    
print advanced_features



In [8]:

    
my_advanced_features_model = graphlab.linear_regression.create(train_data, target='price', features = advanced_features)









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16509
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.023834     | 3501263.132139     | 2231071.249070       | 152905.845214 | 195746.528153   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+



In [9]:

    
print my_features_model.evaluate(test_data)
print my_advanced_features_model.evaluate(test_data)









    



{'max_error': 3472806.4514070554, 'rmse': 179636.27816359687}
{'max_error': 3515905.390161382, 'rmse': 156171.8856532509}



In [10]:

    
179636.27816359687 - 156791.70994956387









    Out[10]:





22844.568214033003



In [ ]: