In [1]:
import graphlab

In [2]:
sales = graphlab.SFrame('../home_data.gl/')


[INFO] This non-commercial license of GraphLab Create is assigned to rajasekar87.ra@gmail.comand will expire on September 23, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-5351 - Server binary: /hdd/anaconda/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1443387437.log
[INFO] GraphLab Server Version: 1.6.1

In [ ]:
sales

In [3]:
graphlab.canvas.set_target('ipynb')
sales.show(view="Scatter Plot", x="sqft_living", y="price")



In [4]:
train_data, test_data = sales.random_split(.8, seed = 0)

In [ ]:
sqft_model = graphlab.linear_regression.create(train_data, target="price", features=['sqft_living'])

In [ ]:
print test_data['price'].mean()

In [ ]:
print sqft_model.evaluate(test_data)

In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
plt.plot(test_data['sqft_living'], test_data['price'], '.', test_data['sqft_living'], sqft_model.predict(test_data),'-')

In [5]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [ ]:
sales[my_features].show()

In [ ]:
sales.show(view = 'BoxWhisker Plot', x='zipcode', y='price')

In [6]:
my_features_model = graphlab.linear_regression.create(train_data, target='price', features = my_features)


PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16501
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 1.030242     | 3735785.483509     | 1213984.588209       | 183227.304384 | 156277.311261   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+

In [ ]:
plt.plot(test_data['sqft_living'],test_data['price'],'.',
        test_data['sqft_living'],my_features_model.predict(test_data),'-')

In [ ]:
print my_features

In [ ]:
my_model = graphlab.regression.create(train_data, target='price', features=my_features)

In [ ]:
print my_model.evaluate(test_data)

In [ ]:
sales

In [ ]:
sales.show()

In [ ]:
cost_zip_code = ['price', 'zipcode']

In [ ]:
sales[cost_zip_code].show()

In [ ]:
sales.show()

In [ ]:
price_avg = sales.groupby(['zipcode'], {'price_avg' : graphlab.aggregate.AVG('price')})

In [ ]:
price_avg.sort('price_avg', ascending = False)

In [ ]:
sales

In [ ]:
len(sales[(sales['sqft_living'] > 2000 ) & (sales['sqft_living'] < 4000)])

In [ ]:
len(sales)

In [7]:
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house				
'grade', # measure of quality of construction				
'waterfront', # waterfront property				
'view', # type of view				
'sqft_above', # square feet above ground				
'sqft_basement', # square feet in basement				
'yr_built', # the year built				
'yr_renovated', # the year renovated				
'lat', 'long', # the lat-long of the parcel				
'sqft_living15', # average sq.ft. of 15 nearest neighbors 				
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

In [ ]:
print advanced_features

In [8]:
my_advanced_features_model = graphlab.linear_regression.create(train_data, target='price', features = advanced_features)


PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16509
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.023834     | 3501263.132139     | 2231071.249070       | 152905.845214 | 195746.528153   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+

In [9]:
print my_features_model.evaluate(test_data)
print my_advanced_features_model.evaluate(test_data)


{'max_error': 3472806.4514070554, 'rmse': 179636.27816359687}
{'max_error': 3515905.390161382, 'rmse': 156171.8856532509}

In [10]:
179636.27816359687 - 156791.70994956387


Out[10]:
22844.568214033003

In [ ]: