Tutorial on Hedonic Regression

This material uses Python to demonstrate some aspects of hedonic regression, but the objective is not to learn to program; it is to understand the hedonic regression methodology. We begin with an example in which we generate synthetic data from a set of coefficients and a mathematical model, then recover those coefficients using a statistical method called multiple regression.


In [ ]:
# Startup steps
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
from scipy.stats import norm, pearsonr, ttest_rel
%matplotlib inline

Generate data using a model we define:
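Concretely, the model below is y = beta[0] + beta[1]*x + e, with beta = [0, 2] (a zero intercept and a slope of 2) and e drawn from a standard normal distribution. The regression should recover estimates close to these coefficients from the noisy observations alone.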


In [ ]:
nsample = 100
x = np.linspace(0, 10, nsample)
beta = np.array([0, 2])
e = np.random.normal(size=nsample)
X = sm.add_constant(x)
y = np.dot(X, beta) + e

Plot the data and the model. Note that the intercept is zero, so the true line y = 2x passes through the origin.


In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([0, 10], [0, 20])
plt.scatter(x, y, marker=0, s=10, c='g')
plt.axis([0, 10, 0, 20])
plt.show();

In [ ]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

In [ ]:
print('Parameters: ', results.params)
print('R2: ', results.rsquared)

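Since we generated the data ourselves, we can compare the estimates against the truth; a quick sanity check reusing the beta array defined above:


In [ ]:
# The estimates should be close to the true coefficients [0, 2]
print('True parameters:     ', beta)
print('Estimated parameters:', results.params)
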
In [ ]:
nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x**2))
beta = np.array([1, 2, .5])
e = np.random.normal(size=nsample)

In [ ]:
X = sm.add_constant(X)
y = np.dot(X, beta) + e

In [ ]:
X[:10]

In [ ]:
plt.scatter(x,y, s=5)

In [ ]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

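A quick visual check that the fitted quadratic tracks the data (x, X, and results come from the cells above):


In [ ]:
# Overlay the fitted curve on the scatter of observations
plt.scatter(x, y, s=5)
plt.plot(x, np.dot(X, results.params), c='r');

Next we move from synthetic data to real data: home sales in San Francisco downloaded from Redfin.
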
In [ ]:
sf = pd.read_csv('data/redfin_2017-03-05-17-45-34-san-francisco-county-1-month.csv')
sf.columns

In [ ]:
sf1 = sf.rename(index=str, columns={'SALE TYPE': 'saletype',
    'SOLD DATE': 'solddate', 'PROPERTY TYPE': 'proptype', 'ADDRESS': 'address',
    'CITY': 'city', 'STATE': 'state', 'ZIP': 'zip', 'PRICE': 'price', 'BEDS': 'beds',
    'BATHS': 'baths', 'LOCATION': 'location', 'SQUARE FEET': 'sqft', 'LOT SIZE': 'lotsize',
    'YEAR BUILT': 'yrbuilt', 'DAYS ON MARKET': 'daysonmkt', '$/SQUARE FEET': 'pricesqft',
    'LATITUDE': 'latitude', 'LONGITUDE': 'longitude', 'HOA/MONTH': 'hoamonth',
    'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)': 'url',
    'STATUS': 'status', 'NEXT OPEN HOUSE START TIME': 'nextopenstart', 'NEXT OPEN HOUSE END TIME': 'nextopenend',
    'SOURCE': 'source', 'MLS#': 'mls', 'FAVORITE': 'favorite', 'INTERESTED': 'interested'
    })

sf1.head()

In [ ]:
sf1.describe()

In [ ]:
plt.figure(1, figsize=(10,8), )
plt.scatter(sf1['sqft'], sf1['price'], marker=0, s=10, c='g')
#plt.axis([12, 16, 12, 16])
plt.show();

In [ ]:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(price) ~ np.log(sqft) + C(baths)', 
                 data=sf1, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

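In a log-log specification like this, the coefficient on np.log(sqft) reads directly as an elasticity: the percent change in price associated with a one percent change in square footage. A small sketch to pull it out of the fitted results:


In [ ]:
# Elasticity of price with respect to sqft in a log-log model
print('Elasticity of price w.r.t. sqft:', res.params['np.log(sqft)'])
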
In [ ]:
plt.hist(residuals, bins=25, density=True, alpha=.5)
mu = residuals.mean()
sigma = residuals.std()
x = np.linspace(-3, 3, 100)
plt.plot(x, norm.pdf(x, mu, sigma));

In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([12, 16], [0, 0], c='b')
plt.scatter(predicted, residuals, marker=0, s=10, c='g');
plt.axis([12, 16, -0.8, 0.8])
plt.show();

In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([12, 16], [12, 16])
plt.scatter(observed, predicted, marker=0, s=10, c='g')
plt.axis([12, 16, 12, 16])
plt.show();

In [ ]:
vars = pd.read_csv('data/ba_block_variables.csv', dtype={'block_id':object})

In [ ]:
vars.shape

In [ ]:
smallvars = vars[['block_id','block_groups_sum_persons','block_groups_sum_acres','block_groups_total_jobs',
                 'block_groups_median_children', 'block_groups_median_persons' , 'block_groups_median_income',
                 'block_groups_prop_tenure_2', 'nodes_low_income_hh_1500m', 'nodes_high_income_hh_1500m',
                 'nodes_jobs_5000m', 'nodes_jobs_30km', 'nodes_population_400m', 'nodes_population_800m',
                 'nodes_jobs_800m', 'prop_race_of_head_1', 'prop_race_of_head_2', 'pumas_density_households',
                 'nodes_jobs_3000m_agg1', 'nodes_jobs_3000m_agg2', 'pumas_density_jobs', 'nodes_jobs_40km',
                 'nodes_jobs_3000m_agg3', 'nodes_jobs_3000m_agg4', 'nodes_jobs_3000m_agg5',
                 'block_groups_prop_persons_1', 'block_groups_prop_persons_2', 'block_groups_median_age_of_head',
                 'puma10_id_is_0607501', 'puma10_id_is_0607502', 'puma10_id_is_0607503', 'puma10_id_is_0607504']]
sv = smallvars.rename(index=str, columns={'block_groups_sum_persons': 'bgpop',
                                     'block_groups_sum_acres': 'bgacres',
                                    'block_groups_total_jobs': 'bgjobs',
                                    'block_groups_median_children': 'bgmedkids',
                                    'block_groups_median_persons': 'bgmedhhs',
                                    'block_groups_median_income': 'bgmedinc',
                                    'block_groups_prop_tenure_2': 'proprent',
                                    'nodes_low_income_hh_1500m': 'lowinc1500m',
                                    'nodes_high_income_hh_1500m': 'highinc1500m',
                                    'nodes_jobs_5000m': 'lnjobs5000m',
                                    'nodes_jobs_30km': 'lnjobs30km',
                                    'nodes_jobs_40km': 'lnjobs40km',
                                    'nodes_population_400m': 'lnpop400m',
                                    'nodes_population_800m': 'lnpop800m',
                                    'nodes_jobs_800m': 'lnjobs800m',
                                    'prop_race_of_head_1': 'propwhite',
                                    'prop_race_of_head_2': 'propblack',
                                    'pumas_density_households': 'pumahhden',
                                    'pumas_density_jobs': 'pumajobden',
                                    'nodes_jobs_3000m_agg1': 'lnbasic3000m',
                                    'nodes_jobs_3000m_agg2': 'lntcpuw3000m',
                                    'nodes_jobs_3000m_agg3': 'lnret3000m',
                                    'nodes_jobs_3000m_agg4': 'lnfire3000m',
                                    'nodes_jobs_3000m_agg5': 'lnserv3000m',
                                    'block_groups_prop_persons_1': 'prop1per',
                                    'block_groups_prop_persons_2': 'prop2per',
                                    'block_groups_median_age_of_head': 'bgmedagehd',
                                    'puma10_id_is_0607501': 'puma1',
                                    'puma10_id_is_0607502': 'puma2',
                                    'puma10_id_is_0607503': 'puma3',
                                    'puma10_id_is_0607504': 'puma4'
                                         })
sv.head()

Recoded detailed race codes: 1 = White alone; 2 = Black or African American alone; 3 = American Indian alone; 4 = Alaska Native alone; 5 = American Indian and Alaska Native tribes specified, or American Indian or Alaska Native not specified and no other races; 6 = Asian alone; 7 = Native Hawaiian and Other Pacific Islander alone; 8 = Some other race alone; 9 = Two or more major race groups.

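These are PUMS "recoded detailed race" categories; the propwhite and propblack columns above correspond to codes 1 and 2. A small lookup table, should labels be needed (this dict simply restates the definitions and is not used below):


In [ ]:
# PUMS recoded detailed race codes
race_labels = {
    1: 'White alone',
    2: 'Black or African American alone',
    3: 'American Indian alone',
    4: 'Alaska Native alone',
    5: 'American Indian and Alaska Native tribes specified, or not specified and no other races',
    6: 'Asian alone',
    7: 'Native Hawaiian and Other Pacific Islander alone',
    8: 'Some other race alone',
    9: 'Two or more major race groups',
}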

In [ ]:
with open('pg_engine_redfin.txt') as f:
    pg_engine = f.read().strip()
from sqlalchemy import create_engine
engine = create_engine(pg_engine)

In [ ]:
%%time
import pandas as pd
df = pd.read_sql_query('select * from "rental_listings"',con=engine)

In [ ]:
print(df.dtypes)
df.describe()

In [ ]:
# convert the date column to yyyy-mm-dd date format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [ ]:
# load the 2014 census data set of MSAs
census = pd.read_csv('data/census_pop_income.csv', encoding='ISO-8859-1')
census['median_income'] = census['2014_median_income'].str.replace(',','').astype(int)
census['population'] = census['2014_pop_est'].str.replace(',','').astype(int)
census = census.drop(labels='notes', axis=1, inplace=False)
census = census.set_index('region')
census.head()

In [ ]:
# these are the 15 most populous metros, by census bureau 2014 population estimates
# (sort the integer population column, not the comma-formatted string column)
most_populous_regions = census['population'].sort_values(ascending=False)
print(most_populous_regions.head(15))

In [ ]:
df = df.merge(census, left_on='region', right_index=True)
df.head()

In [ ]:
# Create an HDF5 file if desired, in the data directory
#df.to_hdf('data/rents.h5','rents',append=False)

In [ ]:
# Load from HDF5 if desired
#df = pd.read_hdf('data/rents.h5', 'rents')
#df.head()

In [ ]:
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(df) * upper_percentile)
lower = int(len(df) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = df['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = df['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = df['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])

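Note that pandas can compute these trim bounds directly; a more concise equivalent sketch (up to interpolation differences at the quantile boundaries):


In [ ]:
# Equivalent trim bounds via the quantile method
lower_rent_sqft, upper_rent_sqft = df['rent_sqft'].quantile([lower_percentile, upper_percentile])
lower_rent, upper_rent = df['rent'].quantile([lower_percentile, upper_percentile])
lower_sqft, upper_sqft = df['sqft'].quantile([lower_percentile, upper_percentile])
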
In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (df['rent_sqft'] > lower_rent_sqft) & (df['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (df['rent'] > lower_rent) & (df['rent'] < upper_rent)
sqft_mask = (df['sqft'] > lower_sqft) & (df['sqft'] < upper_sqft)

# filter the listings according to these masks
filtered_listings = pd.DataFrame(df[rent_sqft_mask & rent_mask & sqft_mask])
len(filtered_listings)

In [ ]:
filtered_listings.describe()

In [ ]:
#sfbay = filtered_listings[filtered_listings['region']=='sfbay']
sfbay = df[df['region']=='sfbay']
sfbay.describe()

In [ ]:
#sfbay['rent_sqft'].quantile(.01)

In [ ]:
#sfbay['sqft'].quantile(.01)

In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft and sqft in Bay Area under 1 percentile
#sfbay_rent_sqft_mask = (sfbay['rent_sqft'] > sfbay['rent_sqft'].quantile(.01) )

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
#sfbay_sqft_mask = (sfbay['sqft'] > sfbay['sqft'].quantile(.01) )

# filter the thorough listings according to these masks
#sfbay_filtered = pd.DataFrame(sfbay[sfbay_rent_sqft_mask & sfbay_sqft_mask])
#len(sfbay_filtered)

In [ ]:
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(sfbay) * upper_percentile)
lower = int(len(sfbay) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = sfbay['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = sfbay['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = sfbay['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])

In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (sfbay['rent_sqft'] > lower_rent_sqft) & (sfbay['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (sfbay['rent'] > lower_rent) & (sfbay['rent'] < upper_rent)
sqft_mask = (sfbay['sqft'] > lower_sqft) & (sfbay['sqft'] < upper_sqft)

# filter the Bay Area listings according to these masks
sfbay_filtered = pd.DataFrame(sfbay[rent_sqft_mask & rent_mask & sqft_mask])
len(sfbay_filtered)

In [ ]:
sfbay_filtered.describe()

In [ ]:
sf = sfbay_filtered.merge(sv, left_on='fips_block', right_on='block_id')
sf['northsf'] = sf['puma1']+sf['puma2']+sf['puma3']+sf['puma4']
# piecewise sqft terms, split at 1,500 sqft (note the parentheses: the
# comparison must apply to sqft itself, not to sqft squared)
sf['sqft1'] = sf['sqft']*(sf['sqft']<1500)
sf['sqft2'] = sf['sqft']*(sf['sqft']>1499)
sf['pct1per'] = sf['prop1per']*100
sf['pct2per'] = sf['prop2per']*100
sf['pctrent'] = sf['proprent']*100
sf['pctblack'] = sf['propblack']*100
sf.head()

In [ ]:
sf.describe()

In [ ]:
sf['bgpopden'] = sf['bgpop']/sf['bgacres']
sf['bgjobden'] = sf['bgjobs']/sf['bgacres']
sf['highlowinc1500m'] = sf['highinc1500m']/sf['lowinc1500m']

In [ ]:
sf.dtypes

In [ ]:
# Estimate a baseline hedonic model on sf, the Bay Area listings merged with block-level variables

import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
# Add neighborhood and accessibility variables to the baseline hedonic model

import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms + '
                 'np.log(bgmedinc) + lnjobs5000m + lnjobs40km + pctblack + '
                 'pumahhden + pctrent + pct1per + pct2per + bgmedagehd',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
plt.hist(residuals, bins=25, density=True, alpha=.5)
mu = residuals.mean()
sigma = residuals.std()
x = np.linspace(-3, 3, 100)
plt.plot(x, norm.pdf(x, mu, sigma));

In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([7, 9], [0, 0], c='b')
plt.scatter(predicted, residuals, marker=0, s=2, c='g');
plt.axis([7.25, 9, -1.5, 1.5])
plt.show();

In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([6, 9.5], [6, 9.5])
plt.scatter(observed, predicted, marker=0, s=2, c='g')
plt.axis([6.5, 9.5, 6.5, 9.5])
plt.show();

In [ ]:
print(residuals.mean())
print(residuals.std())

In [ ]:
# If we want to use WLS, we need a useful set of weights; the default weights of 1 reproduce the OLS results
mod = sm.WLS(y, X, weights=1.)
res = mod.fit()
print(res.summary())

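To make WLS do real work, the weights should reflect the error variance of each observation. A purely illustrative sketch, assuming (without justification) that rents of larger units are noisier, so observations are down-weighted by log square footage:


In [ ]:
# Illustrative only: weights inversely proportional to log(sqft);
# a substantive weighting scheme would need empirical justification
w = 1.0 / X['np.log(sqft)']
mod = sm.WLS(y, X, weights=w)
res = mod.fit()
print(res.summary())
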
In [ ]:
%%time
# Warning: Bayesian sampling is computationally intensive and will take a while
# (the %%time cell magic must be the first line of the cell)
from pymc3 import Model, NUTS, sample
from pymc3.glm import glm

with Model() as model_glm:
    glm('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms', sfbay_filtered)
    trace = sample(5000)

In [ ]:
from pymc3 import traceplot
%matplotlib inline
traceplot(trace);

In [ ]:
from scipy import optimize
from pymc3 import find_MAP
with model_glm:

    # obtain starting values via MAP
    start = find_MAP(fmin=optimize.fmin_powell)

    # draw 5000 posterior samples
    trace = sample(5000, start=start)

In [ ]:
traceplot(trace);

In [ ]:
import matplotlib.pyplot as plt
import theano
import pymc3 as pm

In [ ]:
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel='log sqft', ylabel='log rent', title='Observed data')
ax.plot(np.log(sfbay_filtered['sqft']), np.log(sfbay_filtered['rent']), 'o', markersize=.5, color='blue', label='observed data')
#pm.glm.plot_posterior_predictive(trace, samples=100,
#                                 label='posterior predictive regression lines')
plt.legend(loc=0);
