In [1]:
#Imports
from math import *
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats     #So that sp.stats is available below
import scipy.optimize  #So that sp.optimize is available below
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.metrics  #So that sklearn.metrics is available below
from sklearn.model_selection import train_test_split  #sklearn.cross_validation was removed in newer scikit-learn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
sns.set(font_scale=1.5)
sns.set_style("ticks")
sns.set_palette(palette='deep')
sns.set_color_codes(palette='deep')
mpl.rcParams.update({'font.family': 'serif', 'font.serif':'DejaVu Serif'})
%matplotlib notebook
In [4]:
House_DataFrame = pd.read_csv('kc_house_data.csv').drop(['id','date'],axis=1)
House_DataFrame = House_DataFrame.drop(15870,axis=0) #Outlier at row 15870 confirmed to be incorrect data
First, let's get a sense of what the data looks like and which features influence the price.
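A quick first look at the table itself (a sketch using standard pandas calls):
In [ ]:
print(House_DataFrame.shape)
print(House_DataFrame.head())
print(House_DataFrame.describe())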
Location is typically considered a strong predictor of house price. Let's see what locations tend to be more or less expensive.
In [5]:
latitude = House_DataFrame['lat'].values
longitude = House_DataFrame['long'].values
price = House_DataFrame['price'].values
In [6]:
#Histogram of latitude and longitude
NumBins = 30
denominator, xedges, yedges = np.histogram2d(longitude,latitude,bins=NumBins)
numerator, _, _ = np.histogram2d(longitude,latitude,bins=[xedges, yedges], weights=price)
Histogram = numerator/denominator
Histogram[Histogram==0] = np.nan
HistogramMasked = np.ma.masked_invalid(Histogram)
In [7]:
#Plot
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(1, 1, 1)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
#im = ax.pcolormesh(xedges,yedges,HistogramMasked.T*10**-6, cmap='viridis')
im = ax.imshow(np.fliplr(HistogramMasked*10**-6).T, extent=extent, interpolation='None',cmap='viridis')
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('Mean price $\\times 10^6$ (dollars)')
ax.set_aspect('auto')
ax.set_xlim(xedges[0],xedges[-1])
ax.set_ylim(yedges[0],yedges[-1])
ax.set_xlabel('Longitude (degrees)')
ax.set_ylabel('Latitude (degrees)')
plt.tight_layout()
There seems to be a region of expensive homes near (-122.25, 47.6) with a gradual drop off further from this region. Perhaps K-nearest neighbors would be a good model for predicting prices based on location. This analysis could be extended by looking at the zipcode as a categorical variable.
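As a rough first pass at the zipcode idea (a sketch; 'zipcode' is a column in the original CSV and was not dropped above), we can compare mean prices by zipcode:
In [ ]:
#Mean price per zipcode, most expensive first
print(House_DataFrame.groupby('zipcode')['price'].mean().sort_values(ascending=False).head(10))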
House size (i.e. square footage) is also considered a strong price predictor. This data set has multiple features associated with square footage. The features labeled with '15' correspond to the average square footage of the 15 nearest homes rather than the home itself.
In [8]:
fig,ax = plt.subplots(2,2,sharey=True,figsize=(12, 9))
opacity = 0.5
ax[0,0].plot(House_DataFrame['sqft_living'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[0,0].set_xlabel('Sq. ft. $\\times 10^3$ (living)')
ax[0,0].set_ylabel('Price (Millions)')
ax[0,1].plot(House_DataFrame['sqft_lot'].values/10**5,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[0,1].set_xlabel('Sq. ft. $\\times 10^5$ (lot)')
ax[1,0].plot(House_DataFrame['sqft_above'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[1,0].set_xlabel('Sq. ft. $\\times 10^3$ (above)')
ax[1,0].set_ylabel('Price (Millions)')
ax[1,1].plot(House_DataFrame['sqft_basement'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[1,1].set_xlabel('Sq. ft. $\\times 10^3$ (basement)')
plt.tight_layout()
There appears to be a strong linear relationship between the square footage of living space and price. For basement square footage, many data points have a value of zero, presumably indicating homes with no basement.
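To put numbers on these impressions, here is a quick check (a sketch; it assumes the '15' columns are named sqft_living15 and sqft_lot15) of the fraction of homes with no basement and of each square-footage feature's correlation with price:
In [ ]:
print('Fraction with no basement:', (House_DataFrame['sqft_basement'] == 0).mean())
for col in ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']:
    print(col, sp.stats.pearsonr(House_DataFrame[col].values, price)[0])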
The features 'grade' and 'condition' presumably relate to the condition of the house and would be expected to strongly influence price.
In [9]:
fig,ax = plt.subplots(2,figsize=(10, 14))
#Condition
sns.violinplot(x='condition', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[0])
ax[0].set_ylim(0,2*10**6)
#Grade
sns.boxplot(x='grade', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[1])
ax[1].set_ylim(0,6*10**6)
plt.tight_layout()
In [10]:
r_condition_price = sp.stats.pearsonr(House_DataFrame['condition'].values, House_DataFrame['price'].values)[0]
r_grade_price = sp.stats.pearsonr(House_DataFrame['grade'].values, House_DataFrame['price'].values)[0]
print(r_condition_price,r_grade_price)
The 'condition' feature seems to be a much worse predictor than expected, with almost no correlation. 'Grade', however, appears to be a good predictor, with an apparent power-law dependence. We can do a quick fit on the means to see what the power is:
In [11]:
#Calculate mean price for each grade
grade = House_DataFrame['grade'].values
grade_range = np.unique(grade) #All unique values of 'grade'
mean_price_grade = np.zeros(grade_range.size)
for idx,grade_value in enumerate(grade_range):
    mean_price_grade[idx] = np.mean(price[grade==grade_value])
#Power law fit
def PowerLaw(x,A,n,y0):
    return A*x**n + y0
p0 = [1, 3, 0]
popt, pcov = sp.optimize.curve_fit(PowerLaw, grade_range, mean_price_grade, p0)
perr = np.sqrt(np.diag(pcov))
print(popt)
print(perr)
In [12]:
grade_power = popt[1]
In [13]:
#Plot
plt.figure()
plt.plot(grade_range,mean_price_grade*10**-6, linestyle='None',marker='o')
plt.plot(grade_range,PowerLaw(grade_range,popt[0],popt[1],popt[2])*10**-6,
         label='$Ax^n + y_0$, n={power:.2f}'.format(power=grade_power))
plt.xlabel('grade')
plt.ylabel('Mean price $\\times 10^6$ (dollars)')
plt.legend(loc=2)
plt.tight_layout()
Next, let's look at the price dependence on the number of bedrooms and bathrooms.
In [14]:
fig,ax = plt.subplots(2,figsize=(12, 18))
#Bedrooms
sns.violinplot(x='bedrooms', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[0])
ax[0].set_ylim(0,2*10**6)
#Bathrooms
sns.boxplot(x='bathrooms', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[1])
ax[1].set_ylim(0,7*10**6)
ax[1].tick_params(axis='x',labelsize=12)
plt.tight_layout()
In [15]:
r_bed_price = sp.stats.pearsonr(House_DataFrame['bedrooms'].values, House_DataFrame['price'].values)[0]
r_bath_price = sp.stats.pearsonr(House_DataFrame['bathrooms'].values, House_DataFrame['price'].values)[0]
print(r_bed_price,r_bath_price)
The number of bedrooms doesn't appear to be as good a predictor as the number of bathrooms. It's worth keeping in mind that a house with many bedrooms/bathrooms would also likely have a large square footage. Let's take a quick look at this.
In [16]:
plt.figure()
sns.violinplot(x='bedrooms',y='sqft_living', data=House_DataFrame, orient='v', showmeans=True)
plt.ylim(0,10000)
plt.tight_layout()
In [17]:
r = sp.stats.pearsonr(House_DataFrame['sqft_living'].values, House_DataFrame['bedrooms'].values)[0]
print(r)
There is a modest correlation. This could be confounding the true relationship between price and the number of bedrooms (and likewise bathrooms).
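One rough way to check (a sketch, not part of the original analysis) is to remove the linear effect of sqft_living from price and see how much correlation with the number of bedrooms remains in the residuals:
In [ ]:
sqft = House_DataFrame['sqft_living'].values
beds = House_DataFrame['bedrooms'].values
slope, intercept = np.polyfit(sqft, price, 1)  #Simple linear fit of price on sqft_living
residual = price - (slope*sqft + intercept)    #Price with the sqft_living trend removed
print(sp.stats.pearsonr(beds, residual)[0])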
Lastly, let's look at the correlation between all pairs of features using seaborn's pairplot function.
In [ ]:
sns.set(font_scale=1.0)
sns.pairplot(House_DataFrame)
First, let's define a function to split the data, fit to the training data, predict the prices of the testing data, and output the R^2 and RMSE values (testing error).
In [180]:
def OLS_Regression(data,target,train_size=0.7,seed=123):
    #Split data into training and testing sets
    train_data, test_data, train_target, test_target = train_test_split(data, target, train_size=train_size,
                                                                         random_state=seed)
    linreg = LinearRegression()
    linreg.fit(train_data, train_target)
    train_predict = linreg.predict(train_data) #Predicted values of the training data
    test_predict = linreg.predict(test_data) #Predicted values of the testing data
    R2 = linreg.score(test_data, test_target)
    RMSE = np.sqrt(sklearn.metrics.mean_squared_error(test_target,test_predict))
    output = {'train_data':train_data, 'test_data':test_data, 'train_target':train_target,
              'test_target':test_target, 'train_predict':train_predict, 'test_predict':test_predict,
              'R2':R2, 'RMSE':RMSE}
    return output
From our analysis above, the sqft_living and grade features should work well with linear regression. Let's try using just one feature, sqft_living, for now.
In [181]:
#Select data
sqft_living = House_DataFrame['sqft_living'].values #Select a single feature
sqft_living = sqft_living.reshape(sqft_living.size,1)
target = House_DataFrame['price'].values
In [182]:
lin_sqft_output = OLS_Regression(sqft_living,target)
In [183]:
print('R^2 =',lin_sqft_output['R2'])
print('RMSE =',lin_sqft_output['RMSE'])
fig,ax = plt.subplots(1,1,figsize=(6, 5))
opacity = 0.5
ax.plot(lin_sqft_output['test_data'][:,0]/10**3, lin_sqft_output['test_target']/10**6,
marker='.',linestyle='None', alpha=opacity)
ax.plot(lin_sqft_output['test_data'][:,0]/10**3,lin_sqft_output['test_predict']/10**6,
marker='.',linestyle='None', alpha=opacity)
ax.set_xlabel('Sq. ft. $\\times 10^3$ (living)')
ax.set_ylabel('Price $\\times 10^6$ (Dollars)')
plt.tight_layout()
Next, let's try the grade feature, raised to the power found above.
In [236]:
#Select data
House_DataFrame['grade^5.6'] = House_DataFrame['grade'].values**grade_power #Add grade^5.6 to dataframe
grade_powered = House_DataFrame['grade^5.6'].values
grade_powered = grade_powered.reshape(grade_powered.size,1)
In [237]:
lin_grade_output = OLS_Regression(grade_powered,target)
In [238]:
print('R^2 =',lin_grade_output['R2'])
print('RMSE =',lin_grade_output['RMSE'])
Both the sqft_living and grade features perform similarly well. We can also try multiple regression using both features.
In [239]:
data = np.hstack((sqft_living,grade_powered))
In [240]:
lin_sqft_and_grade_output = OLS_Regression(data,target)
In [241]:
print('R^2 =',lin_sqft_and_grade_output['R2'])
print('RMSE =',lin_sqft_and_grade_output['RMSE'])
Using both features has improved the estimate. Further improvements can likely be made by using additional features.
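For example, here is a quick sketch using a few more columns with the same OLS_Regression function (this particular feature set is just an illustrative choice):
In [ ]:
more_data = House_DataFrame[['sqft_living','grade^5.6','bathrooms','sqft_above']].values
lin_more_output = OLS_Regression(more_data,target)
print('R^2 =',lin_more_output['R2'])
print('RMSE =',lin_more_output['RMSE'])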
Let's define a similar function for k-nearest neighbors regression, as before.
In [199]:
def knn_Regression(data, target, n_neighbors, train_size=0.7, seed=123):
    #Split data into training and testing sets
    train_data, test_data, train_target, test_target = train_test_split(data, target, train_size=train_size,
                                                                         random_state=seed)
    #We can weight the nearest neighbors by their distance from the point of interest
    knnreg = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
    knnreg.fit(train_data, train_target)
    train_predict = knnreg.predict(train_data) #Predicted values of the training data
    test_predict = knnreg.predict(test_data) #Predicted values of the testing data
    R2 = knnreg.score(test_data, test_target)
    RMSE = np.sqrt(sklearn.metrics.mean_squared_error(test_target,test_predict))
    output = {'train_data':train_data, 'test_data':test_data, 'train_target':train_target,
              'test_target':test_target, 'train_predict':train_predict, 'test_predict':test_predict,
              'R2':R2, 'RMSE':RMSE}
    return output
For this model, let's try using latitude and longitude. We can also sweep over a range of values for the number of nearest neighbors.
In [228]:
data = House_DataFrame[['lat','long']].values
target = House_DataFrame['price'].values
In [218]:
Num_n_neighbors = 20
n_neighbors = np.arange(1,Num_n_neighbors+1)
knn_R2 = np.zeros(Num_n_neighbors)
knn_RMSE = np.zeros(Num_n_neighbors)
for idx,n in enumerate(n_neighbors):
    knn_latlong_output = knn_Regression(data,target,n_neighbors=n)
    knn_R2[idx] = knn_latlong_output['R2']
    knn_RMSE[idx] = knn_latlong_output['RMSE']
In [219]:
print('Optimal number of nearest neighbors =', n_neighbors[knn_RMSE == np.min(knn_RMSE)][0])
fig, ax1 = plt.subplots(figsize=(8,5.5))
#Plot R^2 on the left axis
ax1.plot(n_neighbors, knn_R2, color='b')
ax1.set_xlabel('n neighbors')
ax1.set_ylabel('$R^2$', color="b")
for label in ax1.get_yticklabels():
    label.set_color("b")
#Plot RMSE on the right axis
ax2 = ax1.twinx()
ax2.plot(n_neighbors, knn_RMSE, color='r')
ax2.set_ylabel('RMSE (Dollars)', color="r")
for label in ax2.get_yticklabels():
    label.set_color("r")
plt.tight_layout()
Again, we can choose other features to try to improve our results. We already had some success using the grade feature in the OLS method, so let's try using it with k-nearest neighbors as well.
In [248]:
data = House_DataFrame[['lat','long','grade^5.6']].values
target = House_DataFrame['price'].values
In [252]:
knn_latlong_and_grade_output = knn_Regression(data,target,n_neighbors=n_neighbors[knn_RMSE == np.min(knn_RMSE)][0])
In [253]:
print('R2 =', knn_latlong_and_grade_output['R2'])
print('RMSE =', knn_latlong_and_grade_output['RMSE'])
As expected, we see some reasonable improvement.
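One caveat: grade^5.6 is numerically much larger than latitude and longitude, so it dominates the Euclidean distances used by k-nearest neighbors. Standardizing the features first (a sketch using scikit-learn's StandardScaler; not part of the original analysis) might be worth trying:
In [ ]:
from sklearn.preprocessing import StandardScaler
scaled_data = StandardScaler().fit_transform(data)  #Zero mean, unit variance for each feature
knn_scaled_output = knn_Regression(scaled_data, target, n_neighbors=n_neighbors[np.argmin(knn_RMSE)])
print('R2 =', knn_scaled_output['R2'])
print('RMSE =', knn_scaled_output['RMSE'])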