In [1]:
# Import GraphLab Create and call its dependency helper.
# NOTE(review): presumably this downloads/installs GraphLab's runtime dependencies and is a
# one-time setup step — confirm whether it needs to run on every kernel start.
import graphlab
graphlab.get_dependencies()
In [1]:
# import
import graphlab as gl
import matplotlib.pyplot as plt
import numpy as np
In [108]:
# Point GraphLab Canvas at the 'ipynb' target and have matplotlib render figures inline.
gl.canvas.set_target('ipynb')
%matplotlib inline
In [109]:
# Load the house-sales SFrame from disk, ordered by sqft_living and then by price,
# so that later line plots over sqft_living are drawn left to right.
sales = gl.SFrame('data/kc_house_data.gl/').sort(['sqft_living','price'])
sales.head(2)
Out[109]:
In [110]:
def polynomial_sframe(feature, degree):
    """Build an SFrame holding the powers 1..degree of `feature`.

    Parameters:
        feature: the column to expand; it must support `feature ** k`.
        degree: highest power to generate. Callers are expected to pass
            degree >= 1; for smaller values only 'power_1' is produced.

    Returns:
        A new gl.SFrame with columns 'power_1', ..., 'power_<degree>', where
        'power_k' holds feature**k.
    """
    powers = gl.SFrame()
    powers['power_1'] = feature
    # range(2, degree + 1) is empty when degree <= 1, so no explicit guard is needed.
    for exponent in range(2, degree + 1):
        powers['power_' + str(exponent)] = feature ** exponent
    return powers
In [117]:
# Degree-1 feature frame for sqft_living, with the price target attached.
poly1_data = polynomial_sframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price']
# Captured after 'price' was added, so this list contains 'price' as well as 'power_1'.
poly1_data_names = poly1_data.column_names()
In [118]:
# Fit price against power_1 on the full data set, with no validation split.
model1 = gl.linear_regression.create(
    poly1_data,
    target='price',
    features=['power_1'],
    validation_set=None,
)
In [119]:
# Observed prices as dots and the degree-1 fit as a line, both plotted against power_1.
plt.plot(poly1_data['power_1'], poly1_data['price'], '.',
poly1_data['power_1'], model1.predict(poly1_data), '-', linewidth=2)
plt.grid(True)
In [121]:
# Degree-2 polynomial in sqft_living.
poly2_data = polynomial_sframe(sales['sqft_living'], 2)
# Feature names are captured before 'price' is attached, so the target is not a feature.
poly2_data_names = poly2_data.column_names()
poly2_data['price'] = sales['price']
model2 = gl.linear_regression.create(
    poly2_data,
    target='price',
    features=poly2_data_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_2.
plt.plot(
    poly2_data['power_2'], poly2_data['price'], '.',
    poly2_data['power_2'], model2.predict(poly2_data), '-',
    linewidth=2,
)
plt.grid(True)
In [122]:
# Degree-3 polynomial in sqft_living.
poly3_data = polynomial_sframe(sales['sqft_living'], 3)
# Feature names are captured before 'price' is attached, so the target is not a feature.
poly3_data_names = poly3_data.column_names()
poly3_data['price'] = sales['price']
model3 = gl.linear_regression.create(
    poly3_data,
    target='price',
    features=poly3_data_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_3.
plt.plot(
    poly3_data['power_3'], poly3_data['price'], '.',
    poly3_data['power_3'], model3.predict(poly3_data), '-',
    linewidth=2,
)
plt.grid(True)
In [123]:
# Degree-15 polynomial in sqft_living, fitted on the full data set.
poly15_data = polynomial_sframe(sales['sqft_living'], 15)
# Feature names are captured before 'price' is attached, so the target is not a feature.
poly15_data_names = poly15_data.column_names()
poly15_data['price'] = sales['price']
model15 = gl.linear_regression.create(
    poly15_data,
    target='price',
    features=poly15_data_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_15.
# NOTE(review): the x-axis is sqft_living**15 rather than sqft_living itself — confirm
# this is the intended view of the fit.
plt.plot(
    poly15_data['power_15'], poly15_data['price'], '.',
    poly15_data['power_15'], model15.predict(poly15_data), '-',
    linewidth=2,
)
plt.grid(True)
First, split `sales` into 2 subsets with `.random_split(0.5)`, using `seed=0`.
Next, split each of these into 2 more subsets (4 in total) using `.random_split(0.5)`, again with `seed=0`.
You should end up with 4 subsets of (approximately) equal size; call them `set_1`, `set_2`, `set_3`, and `set_4`.
In [124]:
# Two rounds of 50/50 random splits (seed 0 each time) turn sales into four subsets.
sales1, sales2 = sales.random_split(.5, seed = 0)
set_1, set_2 = sales1.random_split(.5, seed = 0)
set_3, set_4 = sales2.random_split(.5, seed = 0)
In [125]:
# Degree-15 polynomial fitted on set_1 only; the cell ends by showing the coefficients.
poly15_set_1 = polynomial_sframe(set_1['sqft_living'], 15)
# Feature names are captured before 'price' is attached, so the target is not a feature.
poly15_set_1_names = poly15_set_1.column_names()
poly15_set_1['price'] = set_1['price']
model15_set_1 = gl.linear_regression.create(
    poly15_set_1,
    target='price',
    features=poly15_set_1_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_15.
plt.plot(
    poly15_set_1['power_15'], poly15_set_1['price'], '.',
    poly15_set_1['power_15'], model15_set_1.predict(poly15_set_1), '-',
    linewidth=2,
)
plt.grid(True)
model15_set_1.get('coefficients')
Out[125]:
In [ ]:
# Degree-15 polynomial fitted on set_2 only; the cell ends by showing the coefficients.
poly15_set_2 = polynomial_sframe(set_2['sqft_living'], 15)
# Capture the 15 power columns before 'price' is attached. All of them are used as
# features (as in the set_1 fit), so the coefficient tables are comparable across
# subsets; the previous features=['power_15'] fitted a single-term model instead.
poly15_set_2_names = poly15_set_2.column_names()
poly15_set_2['price'] = set_2['price']
model15_set_2 = gl.linear_regression.create(
    poly15_set_2,
    target='price',
    features=poly15_set_2_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_15.
plt.plot(
    poly15_set_2['power_15'], poly15_set_2['price'], '.',
    poly15_set_2['power_15'], model15_set_2.predict(poly15_set_2), '-',
    linewidth=2,
)
plt.grid(True)
model15_set_2.get('coefficients')
In [ ]:
# Degree-15 polynomial fitted on set_3 only; the cell ends by showing the coefficients.
poly15_set_3 = polynomial_sframe(set_3['sqft_living'], 15)
# Capture the 15 power columns before 'price' is attached. All of them are used as
# features (as in the set_1 fit), so the coefficient tables are comparable across
# subsets; the previous features=['power_15'] fitted a single-term model instead.
poly15_set_3_names = poly15_set_3.column_names()
poly15_set_3['price'] = set_3['price']
model15_set_3 = gl.linear_regression.create(
    poly15_set_3,
    target='price',
    features=poly15_set_3_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_15.
plt.plot(
    poly15_set_3['power_15'], poly15_set_3['price'], '.',
    poly15_set_3['power_15'], model15_set_3.predict(poly15_set_3), '-',
    linewidth=2,
)
plt.grid(True)
model15_set_3.get('coefficients')
In [ ]:
# Degree-15 polynomial fitted on set_4 only; the cell ends by showing the coefficients.
poly15_set_4 = polynomial_sframe(set_4['sqft_living'], 15)
# Capture the 15 power columns before 'price' is attached. All of them are used as
# features (as in the set_1 fit), so the coefficient tables are comparable across
# subsets; the previous features=['power_15'] fitted a single-term model instead.
poly15_set_4_names = poly15_set_4.column_names()
poly15_set_4['price'] = set_4['price']
model15_set_4 = gl.linear_regression.create(
    poly15_set_4,
    target='price',
    features=poly15_set_4_names,
    validation_set=None,
    verbose=False,
)
# Observed prices as dots, fitted values as a line, both against power_15.
plt.plot(
    poly15_set_4['power_15'], poly15_set_4['price'], '.',
    poly15_set_4['power_15'], model15_set_4.predict(poly15_set_4), '-',
    linewidth=2,
)
plt.grid(True)
model15_set_4.get('coefficients')
In [ ]:
# First split (fraction 0.9, seed 1): the first piece is kept for training + validation,
# the second piece becomes the testing set.
training_and_validation, testing = sales.random_split(0.9, seed=1)
# Second split (fraction 0.5, seed 1): divide the kept piece into training and validation.
training, validation = training_and_validation.random_split(0.5, seed=1)
In [ ]:
# Model selection over polynomial degree: for each degree 1..15, fit on `training` and
# record the residual sum of squares (RSS) on `validation` and on `testing`.
RSS_validation = {}
RSS_testing = {}
for degree in range(1, 16):
    # Expand every split with the same polynomial features. Previously the validation
    # and testing frames stored raw sqft_living under the name 'power_<degree>', so the
    # model was evaluated on inputs that did not match what it had been trained on.
    training_dataset = polynomial_sframe(training['sqft_living'], degree)
    # Captured before 'price' is attached: power_1 .. power_<degree>. Using all of them
    # fits the full degree-`degree` polynomial rather than the single top power.
    feature_names = training_dataset.column_names()
    training_dataset['price'] = training['price']

    validation_dataset = polynomial_sframe(validation['sqft_living'], degree)
    validation_dataset['price'] = validation['price']

    testing_dataset = polynomial_sframe(testing['sqft_living'], degree)
    testing_dataset['price'] = testing['price']

    model = gl.linear_regression.create(
        training_dataset,
        target='price',
        features=feature_names,
        validation_set=None,
        verbose=False,
    )

    # RSS = sum of squared differences between observed and predicted price.
    validation_residuals = validation_dataset['price'] - model.predict(validation_dataset)
    testing_residuals = testing_dataset['price'] - model.predict(testing_dataset)
    RSS_validation[degree] = np.sum(np.square(validation_residuals))
    RSS_testing[degree] = np.sum(np.square(testing_residuals))
In [ ]:
# sorting the dict
for k in sorted(RSS_validation, key=RSS_validation.get):
print k,'\t',RSS_validation[k]
In [ ]:
# Display the full degree -> validation RSS mapping.
RSS_validation
In [ ]:
# Scratch check: a three-element SArray to try element-wise operations on.
x = gl.SArray((1,2,3))
In [ ]:
# Square each element via apply (the lambda-based alternative to `feature**power`
# that is commented out in polynomial_sframe).
y = x.apply(lambda p: p**2)
In [ ]:
# Display the result of the apply() experiment.
y
In [ ]:
# NOTE(review): `features` is not assigned anywhere in the visible notebook, so on a fresh
# kernel this cell would raise NameError — confirm whether it is a leftover and can be removed.
features
In [ ]:
In [ ]: