In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn import linear_model
import matplotlib.pyplot as plt


/Users/viktorp/anaconda/lib/python3.5/site-packages/sklearn/utils/fixes.py:64: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  if 'order' in inspect.getargspec(np.copy)[0]:

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pd.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature
    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # first we'll give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power; use apply(*)
            poly_dataframe[name] = feature**power
    poly_dataframe['constant'] = 1

    return poly_dataframe

In [4]:
tmp = pd.Series([1., 2., 3.])
print(polynomial_dataframe(tmp, 3))


   power_1  power_2  power_3  constant
0        1        1        1         1
1        2        4        8         1
2        3        9       27         1

Prediction with 1 degree polynomial


In [5]:
sales = pd.read_csv('kc_house_data.csv', dtype = dtype_dict)
sales = sales.sort_values(by=['sqft_living', 'price'])

In [6]:
poly1_data = polynomial_dataframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price']

In [7]:
model1 = linear_model.LinearRegression(fit_intercept=False)
model1.fit(poly1_data[['constant','power_1']], poly1_data['price'])


/Users/viktorp/anaconda/lib/python3.5/site-packages/sklearn/base.py:175: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args, varargs, kw, default = inspect.getargspec(init)
Out[7]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [8]:
model1.coef_


Out[8]:
array([-43580.74309449,    280.6235679 ])

In [9]:
plt.plot(poly1_data['power_1'], poly1_data['price'], '.', 
        poly1_data['power_1'], model1.predict(poly1_data[['constant','power_1']]), '-')


Out[9]:
[<matplotlib.lines.Line2D at 0x109a2a588>,
 <matplotlib.lines.Line2D at 0x109a2a940>]

Prediction with 2-nd and 3-rd degree polynomial


In [10]:
poly3_data = polynomial_dataframe(sales['sqft_living'], 3)
features = ['constant'] + poly3_data.columns.values.tolist()
features_2 = features[:3]
print(features_2)
features_3 = features[:4]
print(features_3)


['constant', 'power_1', 'power_2']
['constant', 'power_1', 'power_2', 'power_3']

In [11]:
poly3_data['price'] = sales['price']
model2 = linear_model.LinearRegression(fit_intercept=False)
model2.fit(poly3_data[features_2], poly3_data['price'])
model3 = linear_model.LinearRegression(fit_intercept=False)
model3.fit(poly3_data[features_3], poly3_data['price'])


/Users/viktorp/anaconda/lib/python3.5/site-packages/sklearn/base.py:175: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args, varargs, kw, default = inspect.getargspec(init)
Out[11]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [12]:
plt.plot(poly3_data['power_1'], poly3_data['price'], '.', 
        poly3_data['power_1'], model2.predict(poly3_data[features_2]), '-',
        poly3_data['power_1'], model3.predict(poly3_data[features_3]), '-'
        )


Out[12]:
[<matplotlib.lines.Line2D at 0x10a44cef0>,
 <matplotlib.lines.Line2D at 0x10a4522b0>,
 <matplotlib.lines.Line2D at 0x10a452ac8>]

Predictiction with polinom of 15 degree


In [13]:
poly15_data = polynomial_dataframe(sales['sqft_living'], 15)
features_15 = ['constant'] + poly15_data.columns.values[:15].tolist()
print(features_15)
poly15_data['price'] = sales['price']
model15 = linear_model.LinearRegression(fit_intercept=False)
model15.fit(poly15_data[features_15], poly15_data['price'])


['constant', 'power_1', 'power_2', 'power_3', 'power_4', 'power_5', 'power_6', 'power_7', 'power_8', 'power_9', 'power_10', 'power_11', 'power_12', 'power_13', 'power_14', 'power_15']
/Users/viktorp/anaconda/lib/python3.5/site-packages/sklearn/base.py:175: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args, varargs, kw, default = inspect.getargspec(init)
Out[13]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [14]:
plt.plot(poly15_data['power_1'], poly15_data['price'], '.',
        poly15_data['power_1'], model15.predict(poly15_data[features_15]), '-')


Out[14]:
[<matplotlib.lines.Line2D at 0x10a9bfb70>,
 <matplotlib.lines.Line2D at 0x10a9bfef0>]

Estimating 15th degree polynomial for sales subsets


In [15]:
set_1 = pd.read_csv('wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv('wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv('wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv('wk3_kc_house_set_4_data.csv', dtype=dtype_dict)
subsets = [set_1, set_2, set_3, set_4]

In [16]:
subset_models = []
for (i, subset) in enumerate(subsets):
    subset_poly15 = polynomial_dataframe(subset['sqft_living'], 15)
    features_15 = ['constant'] + subset_poly15.columns.values[:15].tolist()
    subset_poly15['price'] = subset['price']
    subset_model = linear_model.LinearRegression(fit_intercept=False)
    subset_model.fit(subset_poly15[features_15], subset_poly15['price'])
    subset_models.append(subset_model)

Is the sign (positive or negative) for power_15 the same in all four models?


In [17]:
for (i, model) in enumerate(subset_models):
    print(model.coef_[15])


2.70204342076e-52
-3.31987748947e-49
4.56200509375e-51
-1.00221075723e-48

In [18]:
for (model, subset) in zip(subset_models, subsets):
    poly_set = polynomial_dataframe(subset['sqft_living'], 15)
    features_15 = ['constant'] + poly_set.columns.values[:15].tolist()
    plt.plot(poly_set['power_1'], subset['price'], '.',
            poly_set['power_1'], model.predict(poly_set[features_15]), '-')


Select best polynomial feature


In [22]:
training_data = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation_data = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [30]:
validation_poly_RSS = []
poly_models = []
for degree in range(1, 16):
    training_poly = polynomial_dataframe(training_data['sqft_living'], degree)
    degree_features = ['constant'] + training_poly.columns.values[:degree].tolist()
    training_poly['price'] = training_data['price']
    regr = linear_model.LinearRegression(fit_intercept=False)
    regr.fit(training_poly[degree_features], training_poly['price'])
    poly_models.append(regr)
    validation_data_poly = polynomial_dataframe(validation_data['sqft_living'], degree)
    error = validation_data['price'] - regr.predict(validation_data_poly[degree_features])
    RSS = error.T.dot(error)
    validation_poly_RSS.append(RSS)

In [31]:
validation_poly_RSS


Out[31]:
[629097886299586.0,
 623955062706519.5,
 625820280268345.88,
 629987726205413.88,
 635093812629578.0,
 1777694369209873.5,
 10074410325410116.0,
 48507681731651368.0,
 2.1836761447419587e+17,
 9.4610553671480205e+17,
 3.8638461185265987e+18,
 1.4769334120626762e+19,
 1.4203156003611302e+18,
 4.0264955123220577e+18,
 1.165299858107777e+19]

In [32]:
best_polynom_index = validation_poly_RSS.index(min(validation_poly_RSS))
best_polynom_index


Out[32]:
1

In [33]:
min(validation_poly_RSS)


Out[33]:
623955062706519.5

In [38]:
test_poly = polynomial_dataframe(test_data['sqft_living'], best_polynom_index + 1)
test_features = ['constant'] + test_poly.columns.values[:best_polynom_index + 1].tolist()
print(test_features)


['constant', 'power_1', 'power_2']

In [39]:
test_error = test_data['price'] - poly_models[best_polynom_index].predict(test_poly[test_features])

In [40]:
test_RSS = error.T.dot(error)

In [41]:
test_RSS


Out[41]:
1.165299858107777e+19

In [ ]: