In [ ]:
# import necessary modules and display matplotlib plots inline within the notebook
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
from scipy.stats import norm, pearsonr, ttest_rel
%matplotlib inline

In [ ]:
vars = pd.read_csv('data/ba_block_variables.csv', dtype={'block_id':object})

In [ ]:
# Optionally, store in HDF5 to reload faster
store = pd.HDFStore('data/blockdata.h5', complevel=9, complib='blosc')
store['vars'] = vars
store

In [ ]:
# Reload from HDF5 as needed
#vars = store['vars']
#vars.shape

In [ ]:
smallvars = vars[['block_id','block_groups_sum_persons','block_groups_sum_acres','block_groups_total_jobs',
                 'block_groups_median_children', 'block_groups_median_persons' , 'block_groups_median_income',
                 'block_groups_prop_tenure_2', 'nodes_low_income_hh_1500m', 'nodes_high_income_hh_1500m',
                 'nodes_jobs_5000m', 'nodes_jobs_30km', 'nodes_population_400m', 'nodes_population_800m',
                 'nodes_jobs_800m', 'prop_race_of_head_1', 'prop_race_of_head_2', 'prop_race_of_head_6', 
                 'pumas_density_households',
                 'nodes_jobs_3000m_agg1', 'nodes_jobs_3000m_agg2', 'pumas_density_jobs', 'nodes_jobs_40km',
                 'nodes_jobs_3000m_agg3', 'nodes_jobs_3000m_agg4', 'nodes_jobs_3000m_agg5',
                 'block_groups_prop_persons_1', 'block_groups_prop_persons_2', 'block_groups_median_age_of_head',
                 'puma10_id_is_0607501', 'puma10_id_is_0607502', 'puma10_id_is_0607503', 'puma10_id_is_0607504']]
sv = smallvars.rename(index=str, columns={'block_groups_sum_persons': 'bgpop',
                                     'block_groups_sum_acres': 'bgacres',
                                    'block_groups_total_jobs': 'bgjobs',
                                    'block_groups_median_children': 'bgmedkids',
                                    'block_groups_median_persons': 'bgmedhhs',
                                    'block_groups_median_income': 'bgmedinc',
                                    'block_groups_prop_tenure_2': 'proprent',
                                    'nodes_low_income_hh_1500m': 'lowinc1500m',
                                    'nodes_high_income_hh_1500m': 'highinc1500m',
                                    'nodes_jobs_5000m': 'lnjobs5000m',
                                    'nodes_jobs_30km': 'lnjobs30km',
                                    'nodes_jobs_40km': 'lnjobs40km',
                                    'nodes_population_400m': 'lnpop400m',
                                    'nodes_population_800m': 'lnpop800m',
                                    'nodes_jobs_800m': 'lnjobs800m',
                                    'prop_race_of_head_1': 'propwhite',
                                    'prop_race_of_head_2': 'propblack',
                                    'prop_race_of_head_6': 'propasian',
                                    'pumas_density_households': 'pumahhden',
                                    'pumas_density_jobs': 'pumajobden',
                                    'nodes_jobs_3000m_agg1': 'lnbasic3000m',
                                    'nodes_jobs_3000m_agg2': 'lntcpuw3000m',
                                    'nodes_jobs_3000m_agg3': 'lnret3000m',
                                    'nodes_jobs_3000m_agg4': 'lnfire3000m',
                                    'nodes_jobs_3000m_agg5': 'lnserv3000m',
                                    'block_groups_prop_persons_1': 'prop1per',
                                    'block_groups_prop_persons_2': 'prop2per',
                                    'block_groups_median_age_of_head': 'bgmedagehd',
                                    'puma10_id_is_0607501': 'puma1',
                                    'puma10_id_is_0607502': 'puma2',
                                    'puma10_id_is_0607503': 'puma3',
                                    'puma10_id_is_0607504': 'puma4'
                                         })
sv.head()

Recoded detailed race code:

1. White alone
2. Black or African American alone
3. American Indian alone
4. Alaska Native alone
5. American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
6. Asian alone
7. Native Hawaiian and Other Pacific Islander alone
8. Some other race alone
9. Two or more major race groups
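
For quick reference in code, the codebook above can be kept as a lookup dict (a convenience added here; codes 1, 2, and 6 correspond to the propwhite, propblack, and propasian shares renamed earlier):

In [ ]:
# race_of_head codes, transcribed from the codebook above
race_labels = {1: 'White alone',
               2: 'Black or African American alone',
               3: 'American Indian alone',
               4: 'Alaska Native alone',
               5: 'American Indian and Alaska Native tribes specified; or not specified and no other races',
               6: 'Asian alone',
               7: 'Native Hawaiian and Other Pacific Islander alone',
               8: 'Some other race alone',
               9: 'Two or more major race groups'}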


In [ ]:
with open('pg_engine.txt') as f:
    pg_engine = f.readline().strip()
from sqlalchemy import create_engine
engine = create_engine(pg_engine)

In [ ]:
%%time
df = pd.read_sql_query('select * from "rental_listings"', con=engine)

In [ ]:
print(df.dtypes)
df.describe()

In [ ]:
# parse the date column into datetime objects, then extract week, month, and year
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['week'] = df['date'].dt.week
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
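
Note: in pandas 1.1 and later, Series.dt.week is deprecated; a sketch of the equivalent, assuming a recent pandas version:

In [ ]:
# isocalendar() returns a DataFrame with ISO year/week/day; week replaces dt.week
df['week'] = df['date'].dt.isocalendar().week.astype(int)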

In [ ]:
# load the 2014 census data set of MSAs
census = pd.read_csv('data/census_pop_income.csv', encoding='ISO-8859-1')
census['median_income'] = census['2014_median_income'].str.replace(',','').astype(int)
census['population'] = census['2014_pop_est'].str.replace(',','').astype(int)
census = census.drop(labels='notes', axis=1, inplace=False)
census = census.set_index('region')
census.head()

In [ ]:
# the 15 most populous metros, per census bureau 2014 population estimates
most_populous_regions = census['population'].sort_values(ascending=False, inplace=False)
most_populous_regions.head(15)

In [ ]:
df = df.merge(census, left_on='region', right_index=True)
df.head()

In [ ]:
# Create an HDF5 file if desired, in the data directory
df.to_hdf('data/rents.h5','rents',append=False, complevel=9, complib='blosc')

In [ ]:
# Load from HDF5 if desired
#df = pd.read_hdf('data/rents.h5', 'rents')
#df.head()

In [ ]:
upper_percentile = 0.998
lower_percentile = 0.002

# index positions of the values at the upper and lower percentiles
upper = int(len(df) * upper_percentile)
lower = int(len(df) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = df['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = df['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = df['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])

In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (df['rent_sqft'] > lower_rent_sqft) & (df['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (df['rent'] > lower_rent) & (df['rent'] < upper_rent)
sqft_mask = (df['sqft'] > lower_sqft) & (df['sqft'] < upper_sqft)

# filter the listings according to these masks
filtered_listings = pd.DataFrame(df[rent_sqft_mask & rent_mask & sqft_mask])
len(filtered_listings)
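
The same two-sided trimming can be expressed once as a reusable helper built on quantile(); a sketch (boundary handling may differ from the iloc approach above by a row or two):

In [ ]:
def trim_outliers(data, cols, lower=0.002, upper=0.998):
    # keep only rows strictly inside the lower/upper quantiles of every column
    mask = pd.Series(True, index=data.index)
    for col in cols:
        lo, hi = data[col].quantile([lower, upper])
        mask &= (data[col] > lo) & (data[col] < hi)
    return data[mask].copy()

len(trim_outliers(df, ['rent_sqft', 'rent', 'sqft']))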

In [ ]:
filtered_listings.describe()

In [ ]:
#filtered_listings.to_hdf('data/rents.h5','rents',append=False, complevel=9, complib='blosc')
store = pd.HDFStore('filtered.h5', complevel=9, complib='blosc')
store['rentals'] = filtered_listings
store

In [ ]:
test = store['rentals']
test

In [ ]:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms',
                 data=filtered_listings, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())
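
Listing data like this is often heteroskedastic, so it can be worth re-checking the coefficients with robust standard errors; statsmodels exposes this via the cov_type argument to fit():

In [ ]:
# same model, heteroskedasticity-robust (HC3) standard errors
res_robust = sm.OLS(y, X).fit(cov_type='HC3')
print(res_robust.summary())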

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms + '
                 'np.log(median_income) + np.log(population)',
                 data=filtered_listings, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
#sfbay = filtered_listings[filtered_listings['region']=='sfbay']
sfbay = df[df['region']=='sfbay']
sfbay.describe()

In [ ]:
#sfbay['rent_sqft'].quantile(.01)

In [ ]:
#sfbay['sqft'].quantile(.01)

In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft and sqft in Bay Area under 1 percentile
#sfbay_rent_sqft_mask = (sfbay['rent_sqft'] > sfbay['rent_sqft'].quantile(.01) )

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
#sfbay_sqft_mask = (sfbay['sqft'] > sfbay['sqft'].quantile(.01) )

# filter the thorough listings according to these masks
#sfbay_filtered = pd.DataFrame(sfbay[sfbay_rent_sqft_mask & sfbay_sqft_mask])
#len(sfbay_filtered)

In [ ]:
upper_percentile = 0.998
lower_percentile = 0.002

# index positions of the values at the upper and lower percentiles
upper = int(len(sfbay) * upper_percentile)
lower = int(len(sfbay) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = sfbay['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = sfbay['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = sfbay['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])

In [ ]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (sfbay['rent_sqft'] > lower_rent_sqft) & (sfbay['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (sfbay['rent'] > lower_rent) & (sfbay['rent'] < upper_rent)
sqft_mask = (sfbay['sqft'] > lower_sqft) & (sfbay['sqft'] < upper_sqft)

# filter the listings according to these masks
sfbay_filtered = pd.DataFrame(sfbay[rent_sqft_mask & rent_mask & sqft_mask])
len(sfbay_filtered)
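
(The trim_outliers helper sketched earlier would give a near-identical result here: trim_outliers(sfbay, ['rent_sqft', 'rent', 'sqft']).)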

In [ ]:
sfbay_filtered.describe()

In [ ]:
sf = sfbay_filtered.merge(sv, left_on='fips_block', right_on='block_id')
sf['northsf'] = sf['puma1'] + sf['puma2'] + sf['puma3'] + sf['puma4']
# piecewise floor-area terms: sqft when below 1500, sqft when 1500 or above
sf['sqft1'] = sf['sqft'] * (sf['sqft'] < 1500)
sf['sqft2'] = sf['sqft'] * (sf['sqft'] > 1499)
# convert proportions to percentages
sf['pct1per'] = sf['prop1per']*100
sf['pct2per'] = sf['prop2per']*100
sf['pctrent'] = sf['proprent']*100
sf['pctblack'] = sf['propblack']*100
sf['pctwhite'] = sf['propwhite']*100
sf['pctasian'] = sf['propasian']*100
# month dummies (the y17 prefix assumes the listings span early 2017)
sf['y17jan'] = sf['month']==1
sf['y17feb'] = sf['month']==2
sf['y17mar'] = sf['month']==3
sf.head()

In [ ]:
sf.describe()

In [ ]:
sf.columns

In [ ]:
sf['bgpopden'] = sf['bgpop']/sf['bgacres']
sf['bgjobden'] = sf['bgjobs']/sf['bgacres']
sf['highlowinc1500m'] = sf['highinc1500m']/sf['lowinc1500m']
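
If a node has zero low-income households within 1500m, the ratio above becomes inf; a defensive sketch that converts those to NaN so describe() and the regressions skip them:

In [ ]:
# replace infinite ratios (from division by zero) with NaN
sf['highlowinc1500m'] = sf['highlowinc1500m'].replace([np.inf, -np.inf], np.nan)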

In [ ]:
sf.dtypes

In [ ]:
sfstore = pd.HDFStore('data/sf_filtered.h5', complevel=9, complib='blosc')
sfstore['rentals'] = sf
sfstore

In [ ]:
test = sfstore['rentals']
test

In [ ]:
sf.to_csv('data/sf_filtered.csv')

In [ ]:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms + '
                 'y17jan + y17feb + y17mar',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + np.log(bgmedinc)',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + np.log(bgmedinc) + '
                 'lnjobs5000m + lnjobs40km',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + np.log(bgmedinc) + '
                 'lnjobs5000m + lnjobs40km + highinc1500m + lowinc1500m',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + np.log(bgmedinc) + '
                 'lnjobs5000m + lnjobs40km + highinc1500m + lowinc1500m + '
                 'pctrent + pct1per + pct2per + bgmedagehd + pumahhden',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + np.log(bgmedinc) + '
                 'lnjobs5000m + lnjobs40km + highinc1500m + lowinc1500m + '
                 'pctrent + pct1per + pct2per + bgmedagehd + pumahhden + '
                 'y17jan + y17feb + y17mar',
                 data=sf, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
sf['neighborhood'].value_counts()

In [ ]:
soma = sf[sf['neighborhood']=='SOMA / south beach']
soma.describe()

In [ ]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms + np.log(bgmedinc) + '
                 'lnjobs5000m + lnjobs40km + highinc1500m + lowinc1500m + '
                 'pctrent + pct1per + pct2per + bgmedagehd + pumahhden + '
                 'y17jan + y17feb + y17mar',
                 data=soma, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())

In [ ]:
census.dtypes

In [ ]:
# histogram of residuals with a fitted normal curve overlaid
plt.hist(residuals, bins=25, density=True, alpha=.5)
mu = residuals.mean()
sigma = residuals.std()
x = np.linspace(-3, 3, 100)
plt.plot(x, norm.pdf(x, mu, sigma));

In [ ]:
# residuals vs fitted values, with a zero reference line
plt.figure(figsize=(10, 8))
plt.plot([7, 9], [0, 0], c='b')
plt.scatter(predicted, residuals, marker=0, s=2, c='g')
plt.axis([7.25, 9, -1.5, 1.5])
plt.show()

In [ ]:
# observed vs predicted, with a 45-degree reference line
plt.figure(figsize=(10, 8))
plt.plot([6, 9.5], [6, 9.5])
plt.scatter(observed, predicted, marker=0, s=2, c='g')
plt.axis([6.5, 9.5, 6.5, 9.5])
plt.show()

In [ ]:
print(residuals.mean())
print(residuals.std())

In [ ]:
# If we want to use WLS we need a useful set of weights. The default produces the same results as OLS 
mod = sm.WLS(y, X, weights=1.)
res = mod.fit()
print(res.summary())
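
For a non-trivial illustration, one crude two-step (FGLS-style) choice is to downweight observations with large OLS residuals; this is purely a sketch of the mechanics, not a recommended weighting scheme:

In [ ]:
# illustrative only: weight each observation by the inverse of its squared OLS residual
w = 1.0 / (residuals**2 + 1e-6)
res_wls = sm.WLS(y, X, weights=w).fit()
print(res_wls.summary())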

In [ ]:
%%time
# Warning: MCMC sampling is computationally intensive and will take a while
from pymc3 import Model, NUTS, sample
from pymc3.glm import glm

with Model() as model_glm:
    glm('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms', sfbay_filtered)
    trace = sample(5000)
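
In more recent PyMC3 releases (3.1 and later) the formula interface lives on pm.GLM; a hedged equivalent of the cell above:

In [ ]:
# newer-API equivalent; re-runs the (slow) sampling under separate names
import pymc3 as pm
with pm.Model() as model_glm2:
    pm.GLM.from_formula('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms', sfbay_filtered)
    trace2 = pm.sample(5000)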

In [ ]:
from pymc3 import traceplot
%matplotlib inline
traceplot(trace);

In [ ]:
from scipy import optimize
from pymc3 import find_MAP
with model_glm:

    # obtain starting values via MAP
    start = find_MAP(fmin=optimize.fmin_powell)

    # draw 5000 posterior samples
    trace = sample(5000, start=start)

In [ ]:
traceplot(trace);

In [ ]:
import matplotlib.pyplot as plt
import theano
import pymc3 as pm

In [ ]:
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel='log sqft', ylabel='log rent', title='Bay Area listings: log rent vs log sqft')
ax.plot(np.log(sfbay['sqft']), np.log(sfbay['rent']), 'o', markersize=.5, color='blue', label='sampled data')
#ax.plot(x, true_regression_line, label='true regression line', lw=2.)
#pm.glm.plot_posterior_predictive(trace, samples=100,
#                                 label='posterior predictive regression lines')
plt.legend(loc=0);
