In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
In [268]:
# Explicit per-column dtypes so pandas does not have to infer them; identifier-like
# columns (id, zipcode, date) and `floors` are deliberately read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Three CSVs, all parsed with the same schema: one full file plus a train file
# and a test file.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
In [269]:
# Preview the first rows through the notebook's rich table display rather than
# printing the whole frame as plain text.
train_data.head()
In [270]:
# Mean of the `price` column over the training data.
train_data.loc[:, 'price'].mean()
Out[270]:
In [5]:
# Mean of the `sqft_living` column over the training data.
train_data.loc[:, 'sqft_living'].mean()
Out[5]:
In [6]:
# Variance of `sqft_living` over the training data, using the pandas default
# for `Series.var` (no explicit ddof is passed).
train_data.loc[:, 'sqft_living'].var()
Out[6]:
In [8]:
from sklearn import linear_model
In [53]:
# The design matrix X used with this estimator carries its own leading column
# of ones, so scikit-learn is told not to add a second intercept term.
regr = linear_model.LinearRegression(fit_intercept=False)
In [169]:
# Design matrix: a constant column of ones (the intercept term) followed by the
# three raw features, in this order. The target is the `price` column.
X = np.column_stack([
    np.ones(len(train_data)),
    train_data[['sqft_living', 'bedrooms', 'bathrooms']].values,
])
y = train_data['price']
In [131]:
# Show the design matrix: column 0 is the constant 1, columns 1-3 hold
# sqft_living, bedrooms and bathrooms.
X
Out[131]:
In [170]:
# Fit price on the four-column design matrix; `fit` returns the estimator,
# which is what the cell echoes.
regr.fit(X=X, y=y)
Out[170]:
In [171]:
# Fitted weights, one per design-matrix column:
# [constant column, sqft_living, bedrooms, bathrooms].
regr.coef_
Out[171]:
In [279]:
def add_new_features(dataset):
    """Append four engineered columns to ``dataset`` in place.

    Columns written, in this order:

    * ``bedrooms_squared`` -- ``bedrooms`` multiplied by itself
    * ``bed_bath_rooms``   -- ``bedrooms`` times ``bathrooms``
    * ``log_sqft_living``  -- ``np.log`` of ``sqft_living``
    * ``lat_plus_long``    -- ``lat`` plus ``long``

    Parameters
    ----------
    dataset
        Column-indexable table (a pandas DataFrame in this notebook) that
        already holds ``bedrooms``, ``bathrooms``, ``sqft_living``, ``lat``
        and ``long``.

    Returns
    -------
    None
        The caller's object is modified directly.
    """
    # (new column name, recipe) pairs; each recipe reads only source columns.
    recipes = (
        ('bedrooms_squared', lambda table: table['bedrooms'] * table['bedrooms']),
        ('bed_bath_rooms', lambda table: table['bedrooms'] * table['bathrooms']),
        ('log_sqft_living', lambda table: np.log(table['sqft_living'])),
        ('lat_plus_long', lambda table: table['lat'] + table['long']),
    )
    # Evaluate and store one column at a time so the write order (and therefore
    # the resulting column order) is exactly the order listed above.
    for column_name, build in recipes:
        dataset[column_name] = build(dataset)
In [280]:
# add_new_features writes its columns into the frame it is given and returns
# nothing, so both splits are updated in place with the same derived columns.
add_new_features(train_data)
add_new_features(test_data)
In [281]:
# Column names of the training frame as a NumPy array (to confirm the derived
# columns are present).
np.asarray(train_data.columns)
Out[281]:
In [282]:
# Mean of `bedrooms_squared` on the test data.
test_data['bedrooms_squared'].mean()
Out[282]:
In [283]:
# Mean of `bedrooms_squared` on the training data, for comparison with the
# test-data value.
train_data['bedrooms_squared'].mean()
Out[283]:
In [284]:
# Mean of `bed_bath_rooms` (bedrooms * bathrooms) on the test data.
test_data['bed_bath_rooms'].mean()
Out[284]:
In [285]:
# Mean of `log_sqft_living` (natural log of sqft_living) on the test data.
test_data['log_sqft_living'].mean()
Out[285]:
In [286]:
# Mean of `lat_plus_long` (lat + long) on the test data.
test_data['lat_plus_long'].mean()
Out[286]:
In [287]:
# Nested feature sets: each one extends the previous, so Model 1 < Model 2 < Model 3
# in terms of the columns they see.
feature_set1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
feature_set2 = [*feature_set1, 'bed_bath_rooms']
feature_set3 = [
    *feature_set2,
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]
In [288]:
def train_linear_regression(dataset, feature_set):
    """Fit a linear regression of ``price`` on the columns in ``feature_set``.

    The intercept is represented by an explicit constant column at position 0
    of the design matrix (scikit-learn's own intercept is switched off), so in
    the returned model ``coef_[0]`` belongs to the constant column and
    ``coef_[i + 1]`` to ``feature_set[i]``.

    Parameters
    ----------
    dataset
        Table holding a ``price`` column and every column named in
        ``feature_set`` (a pandas DataFrame in this notebook).
    feature_set : list of str
        Column names to use as predictors, in design-matrix order.

    Returns
    -------
    linear_model.LinearRegression
        The fitted estimator.
    """
    target = dataset['price']

    # One row per observation; one column for the constant plus one per feature.
    n_rows = len(dataset)
    n_columns = 1 + len(feature_set)
    design = np.empty((n_rows, n_columns))
    design[:, 0] = 1.0
    design[:, 1:] = dataset[feature_set]

    estimator = linear_model.LinearRegression(fit_intercept=False)
    estimator.fit(design, target)
    return estimator
In [289]:
# Model 1: the five raw features in feature_set1.
model1 = train_linear_regression(dataset=train_data, feature_set=feature_set1)
In [290]:
# Model 2: feature_set2, i.e. Model 1's features plus `bed_bath_rooms`.
model2 = train_linear_regression(dataset=train_data, feature_set=feature_set2)
In [291]:
# Model 3: feature_set3, i.e. Model 2's features plus the three remaining
# derived columns.
model3 = train_linear_regression(dataset=train_data, feature_set=feature_set3)
In [292]:
# Weight on 'bathrooms' in Model 1. coef_[0] belongs to the constant column of
# the design matrix, hence the +1 offset into the feature list.
model1.coef_[1 + feature_set1.index('bathrooms')]
Out[292]:
What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?
In [293]:
# Weight on 'bathrooms' in Model 2. coef_[0] belongs to the constant column of
# the design matrix, hence the +1 offset into the feature list.
model2.coef_[1 + feature_set2.index('bathrooms')]
Out[293]:
In [294]:
def predict(model, dataset, feature_set):
    """Return ``model``'s predictions for every row of ``dataset``.

    Builds the design matrix (a constant column of ones at position 0,
    followed by the ``feature_set`` columns in the given order) and hands it
    to ``model.predict``.

    Parameters
    ----------
    model
        Object exposing ``predict(matrix)``.
    dataset
        Table holding every column named in ``feature_set`` (a pandas
        DataFrame in this notebook).
    feature_set : list of str
        Predictor column names, in design-matrix order.

    Returns
    -------
    Whatever ``model.predict`` returns for the assembled matrix.
    """
    n_rows = len(dataset)
    n_columns = 1 + len(feature_set)
    design = np.empty((n_rows, n_columns))
    design[:, 0] = 1.0                    # constant (intercept) column
    design[:, 1:] = dataset[feature_set]  # predictors, cast to float
    return model.predict(design)
In [295]:
def compute_rss(model, dataset, feature_set):
    """Residual sum of squares of ``model`` on ``dataset``.

    Parameters
    ----------
    model
        Fitted model accepted by ``predict``.
    dataset
        Table holding ``price`` and every column named in ``feature_set``.
    feature_set : list of str
        Predictor column names the model was trained with.

    Returns
    -------
    The inner product of the residual vector (``price`` minus prediction)
    with itself, i.e. the sum of squared residuals.
    """
    predictions = predict(model, dataset, feature_set)
    residuals = dataset['price'] - predictions
    # residuals^T . residuals == sum of squared errors
    return residuals.T.dot(residuals)
In [300]:
# Training RSS for Models 1-3, in that order; each model is scored with the
# feature set it was trained on.
train_rss = [
    compute_rss(model, train_data, features)
    for model, features in zip(
        (model1, model2, model3),
        (feature_set1, feature_set2, feature_set3),
    )
]
train_rss
Out[300]:
In [304]:
# Zero-based position of the smallest training RSS (0 -> Model 1, 1 -> Model 2,
# 2 -> Model 3); the first one wins on ties.
min(enumerate(train_rss), key=lambda pair: pair[1])[0]
Out[304]:
In [301]:
# Test RSS for Models 1-3, in that order; each model is scored with the
# feature set it was trained on.
test_rss = [
    compute_rss(model, test_data, features)
    for model, features in zip(
        (model1, model2, model3),
        (feature_set1, feature_set2, feature_set3),
    )
]
test_rss
Out[301]:
In [299]:
# Zero-based position of the smallest test RSS (0 -> Model 1, 1 -> Model 2,
# 2 -> Model 3); the first one wins on ties.
min(enumerate(test_rss), key=lambda pair: pair[1])[0]
Out[299]:
In [303]:
# Per-model gap between test and training RSS (test minus train).
# NOTE(review): RSS is a sum over rows, so this gap also reflects how many rows
# each split has, not only how well each model generalises.
np.subtract(test_rss, train_rss)
Out[303]:
In [ ]: