In [42]:
import os

import numpy as np
import pandas as pa
import matplotlib.pyplot as plt
from sklearn import linear_model
In [6]:
# Explicit column dtypes handed to read_csv so all three CSV files are parsed
# the same way instead of relying on pandas' type inference.
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int,
'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float,
'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int,
'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

# Root folder of the course material. The default is the original author's
# local checkout; set the REGRESSION_DIR environment variable to run the
# notebook on another machine without editing this cell.
regressionDir = os.environ.get(
    'REGRESSION_DIR',
    '/home/weenkus/workspace/Machine Learning - University of Washington/Regression')
datasetsDir = os.path.join(regressionDir, 'datasets')

# One frame per CSV file: the complete data, the test split and the train split
# (as indicated by the file names).
house = pa.read_csv(os.path.join(datasetsDir, 'kc_house_data.csv'), dtype=dtype_dict)
house_test = pa.read_csv(os.path.join(datasetsDir, 'kc_house_test_data.csv'), dtype=dtype_dict)
house_train = pa.read_csv(os.path.join(datasetsDir, 'kc_house_train_data.csv'), dtype=dtype_dict)
In [7]:
# Preview the first five rows of the full dataset to sanity-check the load.
house.head(n=5)
Out[7]:
In [8]:
# Show plots in jupyter
%matplotlib inline
plt.scatter(house.sqft_living, house.price, alpha=0.5)
plt.ylabel('')
plt.xlabel('price')
plt.show()
In [63]:
# Add the same four derived columns to the full, test and train frames.
# Column-wise (vectorized) arithmetic yields the same values as the former
# row-by-row `apply` calls while avoiding a Python-level call per row.
for frame in (house, house_test, house_train):
    # bedrooms * bedrooms
    frame['bedrooms_squared'] = frame['bedrooms'] * frame['bedrooms']
    # Interaction term: bedrooms * bathrooms
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # Natural logarithm of the living area
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    # Plain sum of latitude and longitude
    frame['lat_plus_long'] = frame['lat'] + frame['long']

# Display the first rows so the new columns can be inspected.
house.head()
Out[63]:
In [64]:
# Mean of each engineered feature on the test frame, rounded to two decimals.
print ('Bedrooms_squared mean: ', np.round(np.mean(house_test['bedrooms_squared']),2))
# Label corrected: this line reports the 'bed_bath_rooms' column (it used to say 'Bed_sqft_living').
print ('Bed_bath_rooms mean: ', np.round(np.mean(house_test['bed_bath_rooms']),2))
print ('Log_sqft_living mean: ', np.round(np.mean(house_test['log_sqft_living']),2))
print ('Lat_plus_long mean: ', np.round(np.mean(house_test['lat_plus_long']),2))
In [60]:
# Feature column lists: each model extends the previous one's columns.
model1_columns = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model2_columns = model1_columns + ['bed_bath_rooms']
model3_columns = model2_columns + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

# Regression target used to fit every model.
train_price = house_train['price']

# Model 1: base features only.
model1_features = house_train[model1_columns]
model1_features_test = house_test[model1_columns]
model1 = linear_model.LinearRegression()
model1.fit(model1_features, train_price)

# Model 2: base features plus the bedrooms*bathrooms interaction.
model2_features = house_train[model2_columns]
model2_features_test = house_test[model2_columns]
model2 = linear_model.LinearRegression()
model2.fit(model2_features, train_price)

# Model 3: all base and engineered features.
model3_features = house_train[model3_columns]
model3_features_test = house_test[model3_columns]
model3 = linear_model.LinearRegression()
model3.fit(model3_features, train_price)
Out[60]:
In [48]:
# Coefficients fitted for model1, in the column order of model1_features.
model1_weights = model1.coef_
print('Model1: ', model1_weights)
In [50]:
# Coefficients fitted for model2, in the column order of model2_features.
model2_weights = model2.coef_
print('Model2: ', model2_weights)
In [58]:
# Residual sum of squares of each model on the data it was fitted on (house_train).
train_price = house_train['price']

model1_train_residuals = model1.predict(model1_features) - train_price
model2_train_residuals = model2.predict(model2_features) - train_price
model3_train_residuals = model3.predict(model3_features) - train_price

print("Model1 RSS: %.2f" % (model1_train_residuals ** 2).sum())
print("Model2 RSS: %.2f" % (model2_train_residuals ** 2).sum())
print("Model3 RSS: %.2f" % (model3_train_residuals ** 2).sum())
In [62]:
# Residual sum of squares of each model on the test frame (house_test).
test_price = house_test['price']

model1_test_residuals = model1.predict(model1_features_test) - test_price
model2_test_residuals = model2.predict(model2_features_test) - test_price
model3_test_residuals = model3.predict(model3_features_test) - test_price

print("Model1 RSS: %.2f" % (model1_test_residuals ** 2).sum())
print("Model2 RSS: %.2f" % (model2_test_residuals ** 2).sum())
print("Model3 RSS: %.2f" % (model3_test_residuals ** 2).sum())
In [ ]: