This material uses Python to demonstrate some aspects of hedonic regression; the objective is not to learn to program but to understand the hedonic regression methodology. We begin with an example in which we generate synthetic data from a set of coefficients and a mathematical model, and then learn those coefficients back using a statistical method called multiple regression.
Simple linear regression is an approach for predicting a quantitative response using a single feature (or "predictor" or "input variable"). It takes the following form:
$y = \beta_0 + \beta_1x$
What does each term represent?
Together, $\beta_0$ and $\beta_1$ are called the model coefficients. To create the model, we must "learn" the values of these coefficients. Once we have learned them, we can use the model to predict values of $y$ for new values of $x$.
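As a minimal sketch (with made-up coefficient values, purely for illustration), prediction is just plugging a new value of $x$ into the fitted equation:

# hypothetical learned coefficients, for illustration only
beta0, beta1 = 1.0, 2.0
x_new = 3.5
y_pred = beta0 + beta1 * x_new   # predicted y for the new x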
Generally speaking, coefficients are estimated using the least squares criterion, which means we find the line (mathematically) that minimizes the sum of squared residuals (or "sum of squared errors"):
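In symbols, the least squares estimates are the values of $\beta_0$ and $\beta_1$ that minimize
$\sum_{i=1}^{n} \big( y_i - (\beta_0 + \beta_1 x_i) \big)^2$
where $y_i - (\beta_0 + \beta_1 x_i)$ is the residual for observation $i$.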
What elements are present in the diagram?
How do the model coefficients relate to the least squares line?
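For simple linear regression the coefficients have a closed-form solution: $\hat{\beta}_1 = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}$ and $\hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x}$, so the least squares line has slope $\hat{\beta}_1$, intercept $\hat{\beta}_0$, and always passes through the point $(\bar{x}, \bar{y})$.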
Here is a graphical depiction of those calculations:
In [18]:
from IPython.display import display, Image
display(Image("regression.png", width=100))
In [20]:
# Startup steps
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
from scipy.stats import norm, pearsonr, ttest_rel
%matplotlib inline
Generate data using a model we define:
In [21]:
nsample = 100
x = np.linspace(0, 10, nsample)        # 100 evenly spaced values of the predictor
beta = np.array([0, 2])                # true coefficients: intercept 0, slope 2
e = np.random.normal(size=nsample)     # standard normal noise
X = sm.add_constant(x)                 # design matrix with a column of ones for the intercept
y = np.dot(X, beta) + e                # generate y from the model plus noise
Plot the data and the model. Note that the intercept is set to zero in this first example.
In [22]:
plt.figure(1, figsize=(10,8))
plt.plot([0, 10], [0, 20])                 # the true model line: y = 2x
plt.scatter(x, y, marker=0, s=10, c='g')   # the simulated observations
plt.axis([0, 10, 0, 20])
plt.show();
In [23]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
In [24]:
print('Parameters: ', results.params)
print('R2: ', results.rsquared)
In [26]:
results.params
Out[26]:
In [25]:
results.conf_int()
Out[25]:
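The same approach extends to multiple regression with more than one predictor. In the next cells we generate data from a quadratic model, $y = 1 + 2x + 0.5x^2 + \varepsilon$ (the coefficient vector [1, 2, 0.5] below, once the constant column is added), and again use OLS to recover the coefficients.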
In [21]:
nsample = 100
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))         # two predictors: x and x squared
beta = np.array([1, 2, .5])            # true coefficients: intercept 1, slope 2, quadratic 0.5
e = np.random.normal(size=nsample)     # standard normal noise
In [22]:
X = sm.add_constant(X)                 # prepend the intercept column
y = np.dot(X, beta) + e                # y = 1 + 2x + 0.5x^2 + noise
In [23]:
X[:10]
Out[23]:
In [24]:
plt.scatter(x,y, s=5)
Out[24]:
In [15]:
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
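As a quick visual check (a short sketch using the objects already defined above), we can overlay the fitted values from this quadratic model on the simulated data:

# sketch: compare the OLS fit to the simulated data
plt.scatter(x, y, s=5, label='data')
plt.plot(x, results.fittedvalues, c='r', label='OLS fit')
plt.legend();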
In [25]:
sf = pd.read_csv('data/redfin_2017-03-05-17-45-34-san-francisco-county-1-month.csv')
sf.columns
Out[25]:
In [26]:
sf1 = sf.rename(index=str, columns={'SALE TYPE': 'saletype',
'SOLD DATE': 'solddate', 'PROPERTY TYPE': 'proptype', 'ADDRESS': 'address',
'CITY': 'city', 'STATE': 'state', 'ZIP': 'zip', 'PRICE': 'price', 'BEDS': 'beds',
'BATHS': 'baths', 'LOCATION': 'location', 'SQUARE FEET': 'sqft', 'LOT SIZE': 'lotsize',
'YEAR BUILT': 'yrbuilt', 'DAYS ON MARKET': 'daysonmkt', '$/SQUARE FEET': 'pricesqft',
'LATITUDE': 'latitude', 'LONGITUDE': 'longitude', 'HOA/MONTH': 'hoamonth',
'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)': 'url',
'STATUS': 'status', 'NEXT OPEN HOUSE START TIME': 'nextopenstart', 'NEXT OPEN HOUSE END TIME': 'nextopenend',
'SOURCE': 'source', 'MLS#': 'mls', 'FAVORITE': 'favorite', 'INTERESTED': 'interested'
})
sf1.head()
Out[26]:
In [27]:
sf1.describe()
Out[27]:
In [28]:
plt.figure(1, figsize=(10,8), )
plt.scatter(sf1['sqft'], sf1['price'], marker=0, s=10, c='g')
#plt.axis([12, 16, 12, 16])
plt.show();
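Next we estimate a simple hedonic regression on these listings: log sale price as a function of log square footage, with the number of baths entering as a set of categorical dummies (this is what C(baths) does in the patsy formula below). Roughly,
$\ln(\mathrm{price}) = \beta_0 + \beta_1 \ln(\mathrm{sqft}) + \gamma_{\mathrm{baths}} + \varepsilon$
where $\gamma_{\mathrm{baths}}$ is a separate intercept shift for each bath count.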
In [20]:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(price) ~ np.log(sqft) + C(baths)',
data=sf1, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
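Because both price and square footage enter in logs, the coefficient on the term labeled np.log(sqft) in the summary above can be read, approximately, as an elasticity: the percent change in price associated with a one percent change in square footage. A quick way to pull it out of the fitted results:

# extract the elasticity estimate from the fitted model
print(res.params['np.log(sqft)'])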
In [21]:
plt.hist(residuals, bins=25, density=True, alpha=.5)   # histogram of residuals, scaled to a density
mu = residuals.mean()
variance = residuals.var()
sigma = residuals.std()
x = np.linspace(-3, 3, 100)
plt.plot(x, norm.pdf(x, mu, sigma));                   # overlay a normal curve with the same mean and std
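A complementary diagnostic (optional, using the statsmodels qqplot helper) is a normal Q-Q plot of the residuals, which should hug the 45-degree line if they are approximately normal:

# normal Q-Q plot of the residuals
sm.qqplot(residuals, line='45', fit=True);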
In [22]:
plt.figure(1, figsize=(10,8), )
plt.plot([12, 16], [0, 0], c='b')
plt.scatter(predicted, residuals, marker=0, s=10, c='g');
plt.axis([12, 16, -0.8, 0.8])
plt.show();
In [24]:
plt.figure(1, figsize=(10,8), )
plt.plot([12, 16], [12, 16])
plt.scatter(observed, predicted, marker=0, s=10, c='g')
plt.axis([12, 16, 12, 16])
plt.show();
In [25]:
vars = pd.read_csv('data/ba_block_variables.csv', dtype={'block_id':object})
In [26]:
vars.shape
Out[26]:
In [27]:
smallvars = vars[['block_id','block_groups_sum_persons','block_groups_sum_acres','block_groups_total_jobs',
'block_groups_median_children', 'block_groups_median_persons' , 'block_groups_median_income',
'block_groups_prop_tenure_2', 'nodes_low_income_hh_1500m', 'nodes_high_income_hh_1500m',
'nodes_jobs_5000m', 'nodes_jobs_30km', 'nodes_population_400m', 'nodes_population_800m',
'nodes_jobs_800m', 'prop_race_of_head_1', 'prop_race_of_head_2', 'pumas_density_households',
'nodes_jobs_3000m_agg1', 'nodes_jobs_3000m_agg2', 'pumas_density_jobs', 'nodes_jobs_40km',
'nodes_jobs_3000m_agg3', 'nodes_jobs_3000m_agg4', 'nodes_jobs_3000m_agg5',
'block_groups_prop_persons_1', 'block_groups_prop_persons_2', 'block_groups_median_age_of_head',
'puma10_id_is_0607501', 'puma10_id_is_0607502', 'puma10_id_is_0607503', 'puma10_id_is_0607504']]
sv = smallvars.rename(index=str, columns={'block_groups_sum_persons': 'bgpop',
'block_groups_sum_acres': 'bgacres',
'block_groups_total_jobs': 'bgjobs',
'block_groups_median_children': 'bgmedkids',
'block_groups_median_persons': 'bgmedhhs',
'block_groups_median_income': 'bgmedinc',
'block_groups_prop_tenure_2': 'proprent',
'nodes_low_income_hh_1500m': 'lowinc1500m',
'nodes_high_income_hh_1500m': 'highinc1500m',
'nodes_jobs_5000m': 'lnjobs5000m',
'nodes_jobs_30km': 'lnjobs30km',
'nodes_jobs_40km': 'lnjobs40km',
'nodes_population_400m': 'lnpop400m',
'nodes_population_800m': 'lnpop800m',
'nodes_jobs_800m': 'lnjobs800m',
'prop_race_of_head_1': 'propwhite',
'prop_race_of_head_2': 'propblack',
'pumas_density_households': 'pumahhden',
'pumas_density_jobs': 'pumajobden',
'nodes_jobs_3000m_agg1': 'lnbasic3000m',
'nodes_jobs_3000m_agg2': 'lntcpuw3000m',
'nodes_jobs_3000m_agg3': 'lnret3000m',
'nodes_jobs_3000m_agg4': 'lnfire3000m',
'nodes_jobs_3000m_agg5': 'lnserv3000m',
'block_groups_prop_persons_1': 'prop1per',
'block_groups_prop_persons_2': 'prop2per',
'block_groups_median_age_of_head': 'bgmedagehd',
'puma10_id_is_0607501': 'puma1',
'puma10_id_is_0607502': 'puma2',
'puma10_id_is_0607503': 'puma3',
'puma10_id_is_0607504': 'puma4'
})
sv.head()
Out[27]:
Recoded detailed race code:
1. White alone
2. Black or African American alone
3. American Indian alone
4. Alaska Native alone
5. American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6. Asian alone
7. Native Hawaiian and Other Pacific Islander alone
8. Some other race alone
9. Two or more major race groups
In [29]:
with open('pg_engine.txt') as f:
    pg_engine = f.readlines()
from sqlalchemy import create_engine
engine = create_engine(pg_engine[0])
In [30]:
%%time
import pandas as pd
df = pd.read_sql_query('select * from "rental_listings"',con=engine)
In [31]:
print(df.dtypes)
df.describe()
Out[31]:
In [32]:
# parse the date column (yyyy-mm-dd) into pandas datetime values
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
In [33]:
# load the 2014 census data set of MSAs
census = pd.read_csv('data/census_pop_income.csv', encoding='ISO-8859-1')
census['median_income'] = census['2014_median_income'].str.replace(',','').astype(int)
census['population'] = census['2014_pop_est'].str.replace(',','').astype(int)
census = census.drop(labels='notes', axis=1, inplace=False)
census = census.set_index('region')
census.head()
Out[33]:
In [39]:
# the 15 most populous metros, by census bureau 2014 population estimates
most_populous_regions = census['population'].sort_values(ascending=False)
print(most_populous_regions.head(15))
In [35]:
df = df.merge(census, left_on='region', right_index=True)
df.head()
Out[35]:
In [ ]:
# Create an HDF5 file if desired, in the data directory
#df.to_hdf('data/rents.h5','rents',append=False)
In [ ]:
# Load from HDF5 if desired
#df = pd.HDFStore('data/rents.h5')
#df
In [36]:
upper_percentile = 0.998
lower_percentile = 0.002
# how many rows would be within the upper and lower percentiles?
upper = int(len(df) * upper_percentile)
lower = int(len(df) * lower_percentile)
# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = df['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
# get the rent values at the upper and lower percentiles
rent_sorted = df['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]
# get the sqft values at the upper and lower percentiles
sqft_sorted = df['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]
print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [37]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (df['rent_sqft'] > lower_rent_sqft) & (df['rent_sqft'] < upper_rent_sqft)
# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (df['rent'] > lower_rent) & (df['rent'] < upper_rent)
sqft_mask = (df['sqft'] > lower_sqft) & (df['sqft'] < upper_sqft)
# filter the thorough listings according to these masks
filtered_listings = pd.DataFrame(df[rent_sqft_mask & rent_mask & sqft_mask])
len(filtered_listings)
Out[37]:
In [ ]:
filtered_listings.describe()
In [ ]:
#sfbay = filtered_listings[filtered_listings['region']=='sfbay']
sfbay = df[df['region']=='sfbay']
sfbay.describe()
In [ ]:
#sfbay['rent_sqft'].quantile(.01)
In [ ]:
#sfbay['sqft'].quantile(.01)
In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft and sqft in Bay Area under 1 percentile
#sfbay_rent_sqft_mask = (sfbay['rent_sqft'] > sfbay['rent_sqft'].quantile(.01) )
# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
#sfbay_sqft_mask = (sfbay['sqft'] > sfbay['sqft'].quantile(.01) )
# filter the thorough listings according to these masks
#sfbay_filtered = pd.DataFrame(sfbay[sfbay_rent_sqft_mask & sfbay_sqft_mask])
#len(sfbay_filtered)
In [ ]:
upper_percentile = 0.998
lower_percentile = 0.002
# how many rows would be within the upper and lower percentiles?
upper = int(len(sfbay) * upper_percentile)
lower = int(len(sfbay) * lower_percentile)
# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = sfbay['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
# get the rent values at the upper and lower percentiles
rent_sorted = sfbay['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]
# get the sqft values at the upper and lower percentiles
sqft_sorted = sfbay['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]
print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (sfbay['rent_sqft'] > lower_rent_sqft) & (sfbay['rent_sqft'] < upper_rent_sqft)
# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (sfbay['rent'] > lower_rent) & (sfbay['rent'] < upper_rent)
sqft_mask = (sfbay['sqft'] > lower_sqft) & (sfbay['sqft'] < upper_sqft)
# filter the thorough listings according to these masks
sfbay_filtered = pd.DataFrame(sfbay[rent_sqft_mask & rent_mask & sqft_mask])
len(sfbay_filtered)
In [ ]:
sfbay_filtered.describe()
In [ ]:
sf = sfbay_filtered.merge(sv, left_on='fips_block', right_on='block_id')
sf['northsf'] = sf['puma1']+sf['puma2']+sf['puma3']+sf['puma4']
sf['sqft1'] = sf['sqft'] * (sf['sqft'] < 1500)   # sqft for units under 1,500 sqft, zero otherwise
sf['sqft2'] = sf['sqft'] * (sf['sqft'] > 1499)   # sqft for units of 1,500 sqft and over, zero otherwise
sf['pct1per'] = sf['prop1per']*100
sf['pct2per'] = sf['prop2per']*100
sf['pctrent'] = sf['proprent']*100
sf['pctblack'] = sf['propblack']*100
sf.head()
In [ ]:
sf.describe()
In [ ]:
sf['bgpopden'] = sf['bgpop']/sf['bgacres']
sf['bgjobden'] = sf['bgjobs']/sf['bgacres']
sf['highlowinc1500m'] = sf['highinc1500m']/sf['lowinc1500m']
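Because these are ratios, blocks with zero acres or zero low-income households will produce infinite or missing values. A quick check (a sketch, assuming the three columns created above):

# count non-finite values produced by the ratio variables
for col in ['bgpopden', 'bgjobden', 'highlowinc1500m']:
    print(col, (~np.isfinite(sf[col])).sum())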
In [ ]:
sf.dtypes
In [ ]:
# Use either filtered_listings for the national dataset or sfbay_filtered for the Bay Area subset
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
In [ ]:
# Use either filtered_listings for the national dataset or sfbay_filtered for the Bay Area subset
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms + '
                 'np.log(bgmedinc) + lnjobs5000m + lnjobs40km + pctblack + '
                 'pumahhden + pctrent + pct1per + pct2per + bgmedagehd',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
In [ ]:
plt.hist(residuals, bins=25, density=True, alpha=.5)   # histogram of residuals, scaled to a density
mu = residuals.mean()
variance = residuals.var()
sigma = residuals.std()
x = np.linspace(-3, 3, 100)
plt.plot(x, norm.pdf(x, mu, sigma));                   # overlay a normal curve with the same mean and std
In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([7, 9], [0, 0], c='b')
plt.scatter(predicted, residuals, marker=0, s=2, c='g');
plt.axis([7.25, 9, -1.5, 1.5])
plt.show();
In [ ]:
plt.figure(1, figsize=(10,8), )
plt.plot([6, 9.5], [6, 9.5])
plt.scatter(observed, predicted, marker=0, s=2, c='g')
plt.axis([6.5, 9.5, 6.5, 9.5])
plt.show();
In [ ]:
print(residuals.mean())
print(residuals.std())
In [ ]:
# If we want to use WLS we need a useful set of weights. The default produces the same results as OLS
mod = sm.WLS(y, X, weights=1.)
res = mod.fit()
print(res.summary())
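The weights argument is where any substantive weighting scheme would go. As one illustrative possibility (not part of the original analysis), observations with larger squared OLS residuals could be downweighted, a rough feasible-GLS-style idea:

# sketch only: purely illustrative weighting scheme
w = 1.0 / (residuals**2 + residuals.var())
res_wls = sm.WLS(y, X, weights=w).fit()
print(res_wls.params)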
In [ ]:
%%time
# Warning: this sampling run is computationally intensive and will take a while
from pymc3 import Model, NUTS, sample
from pymc3.glm import glm
with Model() as model_glm:
    glm('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms', sfbay_filtered)
    trace = sample(5000)
In [ ]:
from pymc3 import traceplot
%matplotlib inline
traceplot(trace);
In [ ]:
from scipy import optimize
from pymc3 import find_MAP
with model_glm:
    # obtain starting values via MAP
    start = find_MAP(fmin=optimize.fmin_powell)
    # draw 5000 posterior samples from those starting values
    trace = sample(5000, start=start)
In [ ]:
traceplot(trace);
In [ ]:
import matplotlib.pyplot as plt
import theano
import pymc3 as pm
In [ ]:
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel='x', ylabel='y', title='Generated data and underlying model')
ax.plot(np.log(sfbay['sqft']), np.log(sfbay['rent']), 'o', markersize=.5, color='blue', label='sampled data')
#ax.plot(x, true_regression_line, label='true regression line', lw=2.)
#pm.glm.plot_posterior_predictive(trace, samples=100,
# label='posterior predictive regression lines')
plt.legend(loc=0);