In [1]:
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
import matplotlib.mlab as mlab
from patsy import dmatrices
from scipy.stats import pearsonr, ttest_rel
%matplotlib inline
In [4]:
# Open the HDF5 store read-only and close it as soon as the table has been read,
# so the file handle is not held open for the rest of the kernel session.
# `store` stays bound afterwards, but refers to a closed store.
with pd.HDFStore('data/filtered_listings.h5', mode='r') as store:
    rents = store['rents']
In [53]:
# add one boolean indicator column per month value (1, 2, 3) found in `month`
for indicator, month_number in (('y17jan', 1), ('y17feb', 2), ('y17mar', 3)):
    rents[indicator] = rents['month'].eq(month_number)
In [54]:
# summary statistics for the full listings table (no region filter applied yet)
rents.describe()
Out[54]:
In [57]:
# keep only the listings whose region is 'sfbay', then show summary statistics
in_sfbay = rents['region'].eq('sfbay')
sfbay = rents.loc[in_sfbay]
sfbay.describe()
Out[57]:
In [58]:
# work on the sfbay subset; the masking cell that follows reads `dset` and the bounds set here
dset = sfbay
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [59]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
sfbay_filtered = pd.DataFrame(dset[within_bounds])
len(sfbay_filtered)
Out[59]:
In [60]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered sfbay listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=sfbay_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [61]:
# keep only the listings whose region is 'detroit', then show summary statistics
in_detroit = rents['region'].eq('detroit')
detroit = rents.loc[in_detroit]
detroit.describe()
Out[61]:
In [62]:
# work on the detroit subset; the masking cell that follows reads `dset` and the bounds set here
dset = detroit
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [63]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
detroit_filtered = pd.DataFrame(dset[within_bounds])
len(detroit_filtered)
Out[63]:
In [64]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered detroit listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=detroit_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [65]:
# keep only the listings whose region is 'denver', then show summary statistics
in_denver = rents['region'].eq('denver')
denver = rents.loc[in_denver]
denver.describe()
Out[65]:
In [66]:
# work on the denver subset; the masking cell that follows reads `dset` and the bounds set here
dset = denver
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [67]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
denver_filtered = pd.DataFrame(dset[within_bounds])
len(denver_filtered)
Out[67]:
In [68]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered denver listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=denver_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [69]:
# keep only the listings whose region is 'newyork', then show summary statistics
in_newyork = rents['region'].eq('newyork')
newyork = rents.loc[in_newyork]
newyork.describe()
Out[69]:
In [70]:
# work on the newyork subset; the masking cell that follows reads `dset` and the bounds set here
dset = newyork
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [71]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
newyork_filtered = pd.DataFrame(dset[within_bounds])
len(newyork_filtered)
Out[71]:
In [72]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered newyork listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=newyork_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [73]:
# keep only the listings whose region is 'houston', then show summary statistics
in_houston = rents['region'].eq('houston')
houston = rents.loc[in_houston]
houston.describe()
Out[73]:
In [74]:
# work on the houston subset; the masking cell that follows reads `dset` and the bounds set here
dset = houston
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [75]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
houston_filtered = pd.DataFrame(dset[within_bounds])
len(houston_filtered)
Out[75]:
In [76]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered houston listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=houston_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [77]:
# keep only the listings whose region is 'losangeles', then show summary statistics
in_losangeles = rents['region'].eq('losangeles')
losangeles = rents.loc[in_losangeles]
losangeles.describe()
Out[77]:
In [78]:
# work on the losangeles subset; the masking cell that follows reads `dset` and the bounds set here
dset = losangeles
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [79]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
losangeles_filtered = pd.DataFrame(dset[within_bounds])
len(losangeles_filtered)
Out[79]:
In [80]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered losangeles listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=losangeles_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [81]:
# keep only the listings whose region is 'seattle', then show summary statistics
in_seattle = rents['region'].eq('seattle')
seattle = rents.loc[in_seattle]
seattle.describe()
Out[81]:
In [82]:
# work on the seattle subset; the masking cell that follows reads `dset` and the bounds set here
dset = seattle
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [83]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
seattle_filtered = pd.DataFrame(dset[within_bounds])
len(seattle_filtered)
Out[83]:
In [84]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered seattle listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=seattle_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [85]:
# keep only the listings whose region is 'chicago', then show summary statistics
in_chicago = rents['region'].eq('chicago')
chicago = rents.loc[in_chicago]
chicago.describe()
Out[85]:
In [86]:
# work on the chicago subset; the masking cell that follows reads `dset` and the bounds set here
dset = chicago
lower_percentile, upper_percentile = 0.002, 0.998

# positions in an ascending sort that correspond to the two cut-off percentiles
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# sort each column of interest in ascending order (the sort_values defaults)
rent_sqft_sorted, rent_sorted, sqft_sorted = (
    dset[column].sort_values() for column in ('rent_sqft', 'rent', 'sqft')
)

# read off the values sitting at the lower and upper cut-off positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])
In [87]:
# a row passes a column's test only when its value lies strictly between that column's bounds
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# keep the rows that pass all three tests
within_bounds = rent_sqft_mask & rent_mask & sqft_mask
chicago_filtered = pd.DataFrame(dset[within_bounds])
len(chicago_filtered)
Out[87]:
In [88]:
# Regress log(rent) on log(sqft), bedrooms and bathrooms for the filtered chicago listings.
# patsy builds the response (y) and design matrix (X) as DataFrames from the formula.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms '
y, X = dmatrices(formula, data=chicago_filtered, return_type='dataframe')

# fit ordinary least squares
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

# expose the observed response, fitted values and residuals under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())
In [ ]: