In [1]:
#Imports
from math import *
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats    #Needed for sp.stats.pearsonr below
import scipy.optimize #Needed for sp.optimize.curve_fit below
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.metrics #Needed for sklearn.metrics.mean_squared_error below
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

sns.set(font_scale=1.5)
sns.set_style("ticks")
sns.set_palette(palette='deep')
sns.set_color_codes(palette='deep')
mpl.rcParams.update({'font.family': 'serif', 'font.serif':'DejaVu Serif'})

%matplotlib notebook

Load data


In [4]:
House_DataFrame = pd.read_csv('kc_house_data.csv').drop(['id','date'],axis=1)
House_DataFrame = House_DataFrame.drop(15870,axis=0) #Outlier at row 15870 confirmed to be incorrect data
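
The outlier was identified separately; a minimal sanity check along these lines (a sketch, not the exact procedure used) can flag suspicious rows:


In [ ]:
#Sketch of a sanity check: reload the raw data and inspect the rows with the most extreme values
raw = pd.read_csv('kc_house_data.csv')
print(raw.describe())
raw.nlargest(5, 'bedrooms') #Rows with implausible values stand out here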

Select visualizations

These plots give some sense of what the data look like and which features influence the price.

Price dependence on location

Location is typically considered a strong predictor of house price. Let's see what locations tend to be more or less expensive.


In [5]:
latitude = House_DataFrame['lat'].values
longitude = House_DataFrame['long'].values
price = House_DataFrame['price'].values

In [6]:
#Histogram of latitude and longitude
NumBins = 30
denominator, xedges, yedges = np.histogram2d(longitude,latitude,bins=NumBins)
numerator, _, _ = np.histogram2d(longitude,latitude,bins=[xedges, yedges], weights=price)
Histogram = numerator/denominator
Histogram[Histogram==0] = np.nan
HistogramMasked = np.ma.masked_invalid(Histogram)

In [7]:
#Plot
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(1, 1, 1)

extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
#im = ax.pcolormesh(xedges,yedges,HistogramMasked.T*10**-6, cmap='viridis')
im = ax.imshow(np.fliplr(HistogramMasked*10**-6).T, extent=extent, interpolation='None',cmap='viridis')
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('Mean price $\\times 10^6$ (dollars)')
ax.set_aspect('auto')
ax.set_xlim(xedges[0],xedges[-1])
ax.set_ylim(yedges[0],yedges[-1])
ax.set_xlabel('Longitude (degrees)')
ax.set_ylabel('Latitude (degrees)')
plt.tight_layout()


There seems to be a region of expensive homes near (-122.25, 47.6), with a gradual drop-off farther from this region. Perhaps K-nearest neighbors would be a good model for predicting prices based on location. This analysis could be extended by looking at the zipcode as a categorical variable, as sketched below.
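
As a sketch of that extension (not pursued further here), the 'zipcode' column could be one-hot encoded and used alongside the other features:


In [ ]:
#Sketch: encode zipcode as a categorical variable via one-hot (dummy) columns
zip_dummies = pd.get_dummies(House_DataFrame['zipcode'], prefix='zip')
zip_data = zip_dummies.values #One column per zipcode, suitable for a linear model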

Square footage

House size (i.e. square footage) is also considered a strong price predictor. This data set has multiple features associated with square footage. The features labeled with '15' correspond to the average square footage of the 15 nearest homes rather than to the home itself.


In [8]:
fig,ax = plt.subplots(2,2,sharey=True,figsize=(12, 9))

opacity = 0.5

ax[0,0].plot(House_DataFrame['sqft_living'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[0,0].set_xlabel('Sq. ft. $\\times 10^3$ (living)')
ax[0,0].set_ylabel('Price (Millions)')
ax[0,1].plot(House_DataFrame['sqft_lot'].values/10**5,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[0,1].set_xlabel('Sq. ft. $\\times 10^5$ (lot)')
ax[1,0].plot(House_DataFrame['sqft_above'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[1,0].set_xlabel('Sq. ft. $\\times 10^3$ (above)')
ax[1,0].set_ylabel('Price (Millions)')
ax[1,1].plot(House_DataFrame['sqft_basement'].values/10**3,price/10**6,marker='.',linestyle='None',alpha=opacity)
ax[1,1].set_xlabel('Sq. ft. $\\times 10^3$ (basement)')
plt.tight_layout()


There appears to be a strong linear relationship between living-space square footage and price. For basement square footage, many data points have a value of zero, presumably indicating homes with no basement.
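
As a quick numerical check, mirroring the correlation checks used below (a sketch; the values are not reported here), we could compute the Pearson correlation of each square-footage feature with price:


In [ ]:
#Pearson correlation of each square-footage feature with price
sqft_features = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
for feature in sqft_features:
    r = sp.stats.pearsonr(House_DataFrame[feature].values, price)[0]
    print(feature, r)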

Grade and condition

The features 'grade' and 'condition' presumably relate to the condition of the house and would be expected to strongly influence price.


In [9]:
fig,ax = plt.subplots(2,figsize=(10, 14))

#Condition
sns.violinplot(x='condition', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[0])
ax[0].set_ylim(0,2*10**6)

#Grade
sns.boxplot(x='grade', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[1])
ax[1].set_ylim(0,6*10**6)

plt.tight_layout()



In [10]:
r_condition_price = sp.stats.pearsonr(House_DataFrame['condition'].values, House_DataFrame['price'].values)[0]
r_grade_price = sp.stats.pearsonr(House_DataFrame['grade'].values, House_DataFrame['price'].values)[0]
print(r_condition_price,r_grade_price)


0.0363360942064 0.667447259299

The 'condition' feature is a much worse predictor than expected, with almost no correlation. 'Grade', however, appears to be a good predictor, with an apparent power-law dependence. We can do a quick fit on the means to see what the power is:


In [11]:
#Calculate mean price for each grade
grade = House_DataFrame['grade'].values
grade_range = np.unique(grade) #All unique values of 'grade'
mean_price_grade = np.zeros(grade_range.size)

for idx,grade_value in enumerate(grade_range):
    mean_price_grade[idx] = np.mean(price[grade==grade_value])
    
#Power law fit
def PowerLaw(x,A,n,y0):
    return A*x**n + y0

p0 = [1, 3, 0]
popt, pcov = sp.optimize.curve_fit(PowerLaw, grade_range, mean_price_grade, p0)
perr = np.sqrt(np.diag(pcov))
print(popt)
print(perr)


[  1.54586445e+00   5.68901953e+00   2.47252736e+05]
[  1.44281091e+00   3.66251038e-01   4.26555417e+04]

In [12]:
grade_power = popt[1]

In [13]:
#Plot
plt.figure()
plt.plot(grade_range,mean_price_grade*10**-6, linestyle='None',marker='o')
plt.plot(grade_range,PowerLaw(grade_range,popt[0],popt[1],popt[2])*10**-6, 
         label='$Ax^n$, n={power:.2f}'.format(power=grade_power))
plt.xlabel('grade')
plt.ylabel('Mean price $\\times 10^6$ (dollars)')
plt.legend(loc=2)
plt.tight_layout()


Number of bedrooms and bathrooms

Next, let's look at the price dependence on the number of bedrooms and bathrooms.


In [14]:
fig,ax = plt.subplots(2,figsize=(12, 18))

#Bedrooms
sns.violinplot(x='bedrooms', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[0])
ax[0].set_ylim(0,2*10**6)

#Bathrooms
sns.boxplot(x='bathrooms', y='price', data=House_DataFrame, orient='v', showmeans=True, ax=ax[1])
ax[1].set_ylim(0,7*10**6)
ax[1].tick_params(axis='x',labelsize=12)

plt.tight_layout()



In [15]:
r_bed_price = sp.stats.pearsonr(House_DataFrame['bedrooms'].values, House_DataFrame['price'].values)[0]
r_bath_price = sp.stats.pearsonr(House_DataFrame['bathrooms'].values, House_DataFrame['price'].values)[0]
print(r_bed_price,r_bath_price)


0.315444864864 0.525147094736

The number of bedrooms doesn't appear to be as good a predictor as the number of bathrooms. It's worth keeping in mind that a house with many bedrooms/bathrooms would also likely have a large square footage. Let's take a quick look at this.


In [16]:
plt.figure()
sns.violinplot(x='bedrooms',y='sqft_living', data=House_DataFrame, orient='v', showmeans=True)
plt.ylim(0,10000)
plt.tight_layout()



In [17]:
r = sp.stats.pearsonr(House_DataFrame['sqft_living'].values, House_DataFrame['bedrooms'].values)[0]
print(r)


0.591467222122

There is a modest correlation. This could be confounding the true relationship between price and the number of bedrooms and bathrooms.
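
One way to partially disentangle this (a sketch, not part of the original analysis) is a partial correlation: regress sqft_living out of both price and the bedroom count, then correlate the residuals:


In [ ]:
#Sketch: partial correlation of price and bedrooms, controlling for sqft_living
sqft = House_DataFrame['sqft_living'].values
bedrooms = House_DataFrame['bedrooms'].values

#Remove the linear dependence on sqft_living from both variables
price_resid = price - np.polyval(np.polyfit(sqft, price, 1), sqft)
bed_resid = bedrooms - np.polyval(np.polyfit(sqft, bedrooms, 1), sqft)

print(sp.stats.pearsonr(price_resid, bed_resid)[0]) #Correlation remaining after controlling for size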

Scatterplots of all pairs of features (slow)

Lastly, let's look at the correlation between all pairs of features using seaborn's pairplot function.


In [ ]:
sns.set(font_scale=1.0)
sns.pairplot(House_DataFrame)

Regression

Ordinary least squares

First, let's define a function to split the data, fit to the training data, predict the prices of the testing data, and output the R^2 and RMSE values (testing error).


In [180]:
def OLS_Regression(data,target,train_size=0.7,seed=123):
    
    #Split data into training and testing sets
    train_data, test_data, train_target, test_target = train_test_split(data, target, train_size=train_size, 
                                                                        random_state=seed)
    linreg = LinearRegression()
    linreg.fit(train_data, train_target)
    
    train_predict = linreg.predict(train_data) #Predicted values of the training data
    test_predict = linreg.predict(test_data) #Predicted values of the testing data
    
    R2 = linreg.score(test_data, test_target)
    RMSE = np.sqrt(sklearn.metrics.mean_squared_error(test_target,test_predict))
    
    output = {'train_data':train_data, 'test_data':test_data, 'train_target':train_target, 
              'test_target':test_target, 'train_predict':train_predict, 'test_predict':test_predict, 
              'R2':R2, 'RMSE':RMSE}
    
    return output

From our analysis above, the sqft_living and grade features should work well with linear regression. Let's try using just one feature, sqft_living, for now.


In [181]:
#Select data
sqft_living = House_DataFrame['sqft_living'].values #Select a single feature
sqft_living = sqft_living.reshape(sqft_living.size,1)

target = House_DataFrame['price'].values

In [182]:
lin_sqft_output = OLS_Regression(sqft_living,target)

In [183]:
print('R^2 =',lin_sqft_output['R2'])
print('RMSE =',lin_sqft_output['RMSE'])

fig,ax = plt.subplots(1,1,figsize=(6, 5))
opacity = 0.5
ax.plot(lin_sqft_output['test_data'][:,0]/10**3, lin_sqft_output['test_target']/10**6,
        marker='.',linestyle='None', alpha=opacity)
ax.plot(lin_sqft_output['test_data'][:,0]/10**3,lin_sqft_output['test_predict']/10**6,
        marker='.',linestyle='None', alpha=opacity)
ax.set_xlabel('Sq. ft. $\\times 10^3$ (living)')
ax.set_ylabel('Price $\\times 10^6$ (Dollars)')
plt.tight_layout()


R^2 = 0.505630805256
RMSE = 248521.725875

Next, let's try the grade feature.


In [236]:
#Select data
House_DataFrame['grade^5.6'] = House_DataFrame['grade'].values**grade_power #Add grade^5.6 column (used later with k-NN)
grade_powered = House_DataFrame['grade^5.6'].values #Not used in the OLS fit below, which uses the raw grade values
grade = grade.reshape(grade.size,1) #Reshape the raw grade values into a column vector for the regressor

In [237]:
lin_grade_output = OLS_Regression(grade,target)

In [238]:
print('R^2 =',lin_grade_output['R2'])
print('RMSE =',lin_grade_output['RMSE'])


R^2 = 0.500305154468
RMSE = 249856.754952

Both the sqft_living and grade features perform similarly well. We can also try multiple regression using both features.


In [239]:
data = np.hstack((sqft_living,grade))

In [240]:
lin_sqft_and_grade_output = OLS_Regression(data,target)

In [241]:
print('R^2 =',lin_sqft_and_grade_output['R2'])
print('RMSE =',lin_sqft_and_grade_output['RMSE'])


R^2 = 0.578005448189
RMSE = 229610.67876

Using both features has improved the estimate. Further improvements can likely be made by using additional features.
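
As a sketch of such an extension (the feature choice here is an assumption, not the original analysis), additional numeric columns can be passed to the same function:


In [ ]:
#Sketch: OLS with a larger, hand-picked feature set
more_features = ['sqft_living', 'grade', 'bathrooms', 'sqft_above', 'lat']
more_data = House_DataFrame[more_features].values
lin_more_output = OLS_Regression(more_data, target)
print('R^2 =', lin_more_output['R2'])
print('RMSE =', lin_more_output['RMSE'])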

K-nearest neighbors

Let's define a function as before.


In [199]:
def knn_Regression(data, target, n_neighbors, train_size=0.7, seed=123):
    
    #Split data into training and testing sets
    train_data, test_data, train_target, test_target = train_test_split(data, target, train_size=train_size, 
                                                                        random_state=seed)
    
    #We can weight the nearest neighbors by their distance from the point of interest
    knnreg = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
    knnreg.fit(train_data, train_target)
    
    train_predict = knnreg.predict(train_data) #Predicted values of the training data
    test_predict = knnreg.predict(test_data) #Predicted values of the testing data
    
    R2 = knnreg.score(test_data, test_target)
    RMSE = np.sqrt(sklearn.metrics.mean_squared_error(test_target,test_predict))
    
    output = {'train_data':train_data, 'test_data':test_data, 'train_target':train_target, 
              'test_target':test_target, 'train_predict':train_predict, 'test_predict':test_predict, 
              'R2':R2, 'RMSE':RMSE}
    
    return output

For this model, let's try using latitude and longitude. We can also try a range of values for the number of nearest neighbors.


In [228]:
data = House_DataFrame[['lat','long']].values
target = House_DataFrame['price'].values

In [218]:
Num_n_neighbors = 20
n_neighbors = np.arange(1,Num_n_neighbors+1)

knn_R2 = np.zeros(Num_n_neighbors)
knn_RMSE = np.zeros(Num_n_neighbors)

for idx,n in enumerate(n_neighbors):
    
    knn_latlong_output = knn_Regression(data,target,n_neighbors=n)
    
    knn_R2[idx] = knn_latlong_output['R2']
    knn_RMSE[idx] = knn_latlong_output['RMSE']

In [219]:
print('Optimal number of nearest neighbors =', n_neighbors[knn_RMSE == np.min(knn_RMSE)][0])

fig, ax1 = plt.subplots(figsize=(8,5.5))

#Plot R^2 on the left axis
ax1.plot(n_neighbors, knn_R2, color='b')
ax1.set_xlabel('n neighbors')
ax1.set_ylabel('$R^2$', color="b")
for label in ax1.get_yticklabels():
    label.set_color("b")

#Plot RMSE on the right axis
ax2 = ax1.twinx() 
ax2.plot(n_neighbors, knn_RMSE, color='r')
ax2.set_ylabel('RMSE (Dollars)', color="r")
for label in ax2.get_yticklabels():
    label.set_color("r")

plt.tight_layout()


Optimal number of nearest neighbors = 16

Again, we can choose other features to try to improve our results. We already had some success using the grade feature in the OLS method, so let's try using that with k-nearest neighbors as well.


In [248]:
data = House_DataFrame[['lat','long','grade^5.6']].values
target = House_DataFrame['price'].values

In [252]:
knn_latlong_and_grade_output = knn_Regression(data,target,n_neighbors=n_neighbors[knn_RMSE == np.min(knn_RMSE)][0])

In [253]:
print('R2 =', knn_latlong_and_grade_output['R2'])
print('RMSE =', knn_latlong_and_grade_output['RMSE'])


R2 = 0.769784031044
RMSE = 169592.398768

As expected, we see some reasonable improvement.
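
Since k-nearest neighbors is distance-based and latitude, longitude, and grade^5.6 live on very different scales, one further refinement (a sketch, not part of the analysis above) would be to standardize the features first:


In [ ]:
#Sketch: standardize the features so no single one dominates the distance metric
#(Strictly, the scaler should be fit on the training split only; this is a simplification.)
from sklearn.preprocessing import StandardScaler

scaled_data = StandardScaler().fit_transform(data)
knn_scaled_output = knn_Regression(scaled_data, target, n_neighbors=16)
print('R2 =', knn_scaled_output['R2'])
print('RMSE =', knn_scaled_output['RMSE'])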