In [1]:
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
import matplotlib.mlab as mlab
from scipy.stats import pearsonr, ttest_rel
%matplotlib inline

In [4]:
store = pd.HDFStore('data/filtered_listings.h5')
rents = store['rents']

In [53]:
rents['y17jan'] = rents['month']==1
rents['y17feb'] = rents['month']==2
rents['y17mar'] = rents['month']==3

In [54]:
rents.describe()


Out[54]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 2.908942e+06 2.908942e+06 2.908942e+06 2.908942e+06 2.908942e+06 2.908942e+06 1.726125e+06 2.908942e+06 2.908942e+06 2.908942e+06 2.908942e+06 2.908942e+06
mean 1.493998e+03 1.754201e+00 1.004015e+03 1.595298e+00 -9.722085e+01 3.671698e+01 1.473283e+00 2.580334e+01 6.328678e+00 2.016533e+03 6.144500e+04 4.517404e+06
std 8.623287e+02 9.353790e-01 4.319104e+02 8.620291e-01 1.728631e+01 5.212311e+00 5.973946e-01 2.187013e+01 5.013247e+00 4.989424e-01 1.145150e+04 3.539324e+06
min 7.800000e+01 0.000000e+00 2.160000e+02 6.198347e-02 -1.596241e+02 -8.450782e+01 0.000000e+00 1.000000e+00 1.000000e+00 2.016000e+03 4.342300e+04 3.757510e+05
25% 9.200000e+02 1.000000e+00 7.250000e+02 9.961538e-01 -1.153218e+02 3.325834e+01 1.000000e+00 5.000000e+00 1.000000e+00 2.016000e+03 5.268900e+04 2.069681e+06
50% 1.291000e+03 2.000000e+00 9.250000e+02 1.352273e+00 -9.556070e+01 3.626040e+01 1.000000e+00 1.000000e+01 3.000000e+00 2.017000e+03 6.007200e+04 3.263431e+06
75% 1.795000e+03 2.000000e+00 1.158000e+03 1.950000e+00 -8.128679e+01 3.989770e+01 2.000000e+00 4.900000e+01 1.200000e+01 2.017000e+03 6.687000e+04 6.033737e+06
max 9.999000e+03 9.000000e+00 4.498000e+03 8.431373e+00 2.317761e+00 1.653830e+02 8.000000e+00 5.200000e+01 1.200000e+01 2.017000e+03 9.119300e+04 2.009288e+07

In [57]:
sfbay = rents[rents['region']=='sfbay']
sfbay.describe()


Out[57]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 128379.000000 128379.000000 128379.000000 128379.000000 128379.000000 128379.000000 66682.000000 128379.000000 128379.000000 128379.000000 128379.0 128379.0
mean 2862.479152 1.763474 1011.005881 3.050018 -122.129394 37.636303 1.489487 28.248787 6.897577 2016.489208 80600.0 8607423.0
std 1120.002811 0.972110 451.229335 1.033937 0.863566 0.334485 0.613397 21.131186 4.877076 0.499885 0.0 0.0
min 100.000000 0.000000 216.000000 0.075000 -124.031700 23.152315 0.000000 1.000000 1.000000 2016.000000 80600.0 8607423.0
25% 2145.000000 1.000000 715.000000 2.400000 -122.328508 37.376501 1.000000 8.000000 2.000000 2016.000000 80600.0 8607423.0
50% 2600.000000 2.000000 922.000000 2.880435 -122.072385 37.627998 1.000000 46.000000 11.000000 2016.000000 80600.0 8607423.0
75% 3321.000000 2.000000 1175.000000 3.548896 -121.959339 37.800253 2.000000 49.000000 12.000000 2017.000000 80600.0 8607423.0
max 9995.000000 8.000000 4440.000000 8.348018 2.317761 48.887051 6.000000 52.000000 12.000000 2017.000000 80600.0 8607423.0

In [58]:
dset = sfbay
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.51446945337620598, 7.2709163346613499]
valid rent range: [740.0, 8750.0]
valid sqft range: [266.0, 3500.0]

In [59]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
sfbay_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(sfbay_filtered)


Out[59]:
127046

In [60]:
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=sfbay_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                 1.178e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:18:33   Log-Likelihood:                -8872.7
No. Observations:               65988   AIC:                         1.775e+04
Df Residuals:                   65984   BIC:                         1.779e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.2813      0.034    124.822      0.000         4.214     4.349
np.log(sqft)     0.5255      0.006     93.750      0.000         0.515     0.537
bedrooms        -0.0407      0.002    -18.905      0.000        -0.045    -0.036
bathrooms        0.0692      0.003     24.993      0.000         0.064     0.075
==============================================================================
Omnibus:                     2153.014   Durbin-Watson:                   1.886
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5136.880
Skew:                          -0.163   Prob(JB):                         0.00
Kurtosis:                       4.327   Cond. No.                         236.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [61]:
detroit = rents[rents['region']=='detroit']
detroit.describe()


Out[61]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 32713.000000 32713.000000 32713.000000 32713.000000 32713.000000 32713.000000 19846.000000 32713.000000 32713.000000 32713.000000 32713.0 32713.0
mean 983.096659 1.937639 1018.820530 1.000261 -83.205387 42.454045 1.334551 25.245438 6.185064 2016.542231 52462.0 4296611.0
std 392.477472 0.859662 346.534819 0.342152 0.868552 0.272896 0.509521 22.119471 5.085776 0.498221 0.0 0.0
min 79.000000 0.000000 240.000000 0.092402 -122.593700 26.287874 0.000000 1.000000 1.000000 2016.000000 52462.0 4296611.0
25% 755.000000 1.000000 815.000000 0.824000 -83.311592 42.325125 1.000000 4.000000 1.000000 2016.000000 52462.0 4296611.0
50% 880.000000 2.000000 950.000000 0.938889 -83.212336 42.479646 1.000000 10.000000 3.000000 2017.000000 52462.0 4296611.0
75% 1075.000000 2.000000 1137.000000 1.097946 -83.038800 42.587700 1.500000 49.000000 12.000000 2017.000000 52462.0 4296611.0
max 6995.000000 7.000000 4468.000000 5.752467 -68.791800 44.824200 6.000000 52.000000 12.000000 2017.000000 52462.0 4296611.0

In [62]:
dset = detroit
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.33333333333333298, 3.2418952618453898]
valid rent range: [400.0, 3655.0]
valid sqft range: [425.0, 3450.0]

In [63]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
detroit_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(detroit_filtered)


Out[63]:
32274

In [64]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=detroit_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     3737.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:19:33   Log-Likelihood:                 485.68
No. Observations:               19571   AIC:                            -963.4
Df Residuals:                   19567   BIC:                            -931.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8048      0.065     43.349      0.000         2.678     2.932
np.log(sqft)     0.5784      0.010     55.936      0.000         0.558     0.599
bedrooms        -0.0863      0.003    -27.896      0.000        -0.092    -0.080
bathrooms        0.1688      0.004     38.054      0.000         0.160     0.177
==============================================================================
Omnibus:                     1760.303   Durbin-Watson:                   1.853
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3150.239
Skew:                           0.635   Prob(JB):                         0.00
Kurtosis:                       4.500   Cond. No.                         286.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [65]:
denver = rents[rents['region']=='denver']
denver.describe()


Out[65]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 114153.000000 114153.000000 114153.000000 114153.000000 114153.000000 114153.000000 69617.000000 114153.000000 114153.000000 114153.000000 114153.0 114153.0
mean 1486.556981 1.629848 964.084509 1.632192 -104.958663 39.711213 1.430240 25.843245 6.346824 2016.534152 66870.0 2754258.0
std 549.367331 0.875000 438.035934 0.435288 0.492116 0.208812 0.594603 21.862210 5.004334 0.498834 0.0 0.0
min 79.000000 0.000000 220.000000 0.087004 -148.924302 25.468229 0.000000 1.000000 1.000000 2016.000000 66870.0 2754258.0
25% 1143.000000 1.000000 700.000000 1.366120 -105.020542 39.654307 1.000000 5.000000 1.000000 2016.000000 66870.0 2754258.0
50% 1370.000000 2.000000 865.000000 1.554455 -104.968600 39.710799 1.000000 10.000000 3.000000 2017.000000 66870.0 2754258.0
75% 1698.000000 2.000000 1100.000000 1.832240 -104.884853 39.757006 2.000000 49.000000 12.000000 2017.000000 66870.0 2754258.0
max 7900.000000 6.000000 4497.000000 7.531561 -71.298300 60.924252 8.000000 52.000000 12.000000 2017.000000 66870.0 2754258.0

In [66]:
dset = denver
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.41025641025641002, 3.5729166666666701]
valid rent range: [600.0, 5000.0]
valid sqft range: [300.0, 3727.0]

In [67]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
denver_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(denver_filtered)


Out[67]:
113023

In [68]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=denver_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                 2.735e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:03   Log-Likelihood:                 10979.
No. Observations:               68950   AIC:                        -2.195e+04
Df Residuals:                   68946   BIC:                        -2.191e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8404      0.027    105.564      0.000         2.788     2.893
np.log(sqft)     0.6507      0.004    147.602      0.000         0.642     0.659
bedrooms        -0.0488      0.002    -28.165      0.000        -0.052    -0.045
bathrooms        0.0538      0.002     24.960      0.000         0.050     0.058
==============================================================================
Omnibus:                     2718.068   Durbin-Watson:                   1.877
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4972.951
Skew:                           0.317   Prob(JB):                         0.00
Kurtosis:                       4.153   Cond. No.                         251.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [69]:
newyork = rents[rents['region']=='newyork']
newyork.describe()


Out[69]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 37388.000000 37388.000000 37388.000000 37388.000000 37388.000000 37388.000000 18535.000000 37388.000000 37388.000000 37388.000000 37388.0 37388.0
mean 2695.793249 1.713518 942.389670 3.051730 -73.856081 40.810525 1.248395 28.774313 7.047368 2016.477426 67066.0 20092883.0
std 1300.740147 1.046317 376.712246 1.310790 0.528350 0.354437 0.514592 20.799308 4.777683 0.499497 0.0 0.0
min 90.000000 0.000000 220.000000 0.069231 -123.114700 9.001601 0.000000 1.000000 1.000000 2016.000000 67066.0 20092883.0
25% 1850.000000 1.000000 700.000000 2.058824 -73.982037 40.706279 1.000000 8.000000 2.000000 2016.000000 67066.0 20092883.0
50% 2399.000000 2.000000 890.000000 2.774459 -73.947049 40.746282 1.000000 46.000000 11.000000 2016.000000 67066.0 20092883.0
75% 3175.000000 2.000000 1100.000000 3.958333 -73.806137 40.863300 1.500000 48.000000 12.000000 2017.000000 67066.0 20092883.0
max 9995.000000 8.000000 4400.000000 8.415789 -68.441734 45.548600 8.000000 52.000000 12.000000 2017.000000 67066.0 20092883.0

In [70]:
dset = newyork
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.57894736842105299, 7.5]
valid rent range: [650.0, 9000.0]
valid sqft range: [300.0, 3100.0]

In [71]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
newyork_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(newyork_filtered)


Out[71]:
36927

In [72]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=newyork_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.242
Model:                            OLS   Adj. R-squared:                  0.242
Method:                 Least Squares   F-statistic:                     1948.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:59   Log-Likelihood:                -7747.4
No. Observations:               18282   AIC:                         1.550e+04
Df Residuals:                   18278   BIC:                         1.553e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        6.9553      0.074     93.531      0.000         6.810     7.101
np.log(sqft)     0.0577      0.012      4.821      0.000         0.034     0.081
bedrooms         0.1065      0.004     27.776      0.000         0.099     0.114
bathrooms        0.2295      0.007     34.750      0.000         0.217     0.242
==============================================================================
Omnibus:                       88.451   Durbin-Watson:                   1.738
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              119.805
Skew:                          -0.054   Prob(JB):                     9.66e-27
Kurtosis:                       3.381   Cond. No.                         199.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [73]:
houston = rents[rents['region']=='houston']
houston.describe()


Out[73]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 106567.000000 106567.000000 106567.000000 106567.000000 106567.000000 106567.000000 62335.000000 106567.000000 106567.000000 106567.000000 106567.0 106567.0
mean 1290.172098 1.626348 1014.233740 1.344060 -95.456052 29.811908 1.434387 25.829319 6.328695 2016.522066 60072.0 6490180.0
std 512.344019 0.827799 428.149051 0.458061 0.493181 0.329684 0.611912 22.055356 5.034224 0.499515 0.0 0.0
min 85.000000 0.000000 225.000000 0.065238 -158.178100 -11.469258 0.000000 1.000000 1.000000 2016.000000 60072.0 6490180.0
25% 918.000000 1.000000 720.000000 1.023529 -95.556900 29.723139 1.000000 4.000000 1.000000 2016.000000 60072.0 6490180.0
50% 1200.000000 2.000000 940.000000 1.258268 -95.447000 29.747700 1.000000 10.000000 3.000000 2017.000000 60072.0 6490180.0
75% 1500.000000 2.000000 1160.000000 1.610145 -95.401000 29.852387 2.000000 48.000000 12.000000 2017.000000 60072.0 6490180.0
max 9500.000000 7.000000 4490.000000 7.073846 -50.560455 47.601900 6.000000 52.000000 12.000000 2017.000000 60072.0 6490180.0

In [74]:
dset = houston
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.29004221504842298, 3.2615101289134398]
valid rent range: [385.0, 3648.0]
valid sqft range: [420.0, 3763.0]

In [75]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
houston_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(houston_filtered)


Out[75]:
105437

In [76]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=houston_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                 1.342e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:23:19   Log-Likelihood:                -7892.0
No. Observations:               61753   AIC:                         1.579e+04
Df Residuals:                   61749   BIC:                         1.583e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        0.4165      0.040     10.475      0.000         0.339     0.494
np.log(sqft)     1.0274      0.006    160.648      0.000         1.015     1.040
bedrooms        -0.2141      0.003    -78.028      0.000        -0.220    -0.209
bathrooms        0.0006      0.003      0.194      0.846        -0.005     0.006
==============================================================================
Omnibus:                      166.042   Durbin-Watson:                   1.694
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              196.272
Skew:                           0.066   Prob(JB):                     2.40e-43
Kurtosis:                       3.243   Cond. No.                         265.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [77]:
losangeles = rents[rents['region']=='losangeles']
losangeles.describe()


Out[77]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 152270.000000 152270.000000 152270.000000 152270.000000 152270.000000 152270.000000 86283.000000 152270.000000 152270.000000 152270.000000 152270.0 152270.0
mean 2528.276089 1.624962 1021.838484 2.626747 -118.333378 34.094184 1.563066 26.314730 6.443140 2016.528036 60514.0 13262220.0
std 1279.880402 1.012909 494.005066 0.973024 0.566361 0.228387 0.680458 21.513275 4.948505 0.499215 0.0 0.0
min 84.000000 0.000000 217.000000 0.075000 -134.527588 29.770021 0.000000 1.000000 1.000000 2016.000000 60514.0 13262220.0
25% 1725.000000 1.000000 700.000000 1.992857 -118.451747 34.026800 1.000000 6.000000 2.000000 2016.000000 60514.0 13262220.0
50% 2200.000000 2.000000 928.000000 2.452771 -118.366300 34.073700 1.000000 10.000000 3.000000 2017.000000 60514.0 13262220.0
75% 2899.000000 2.000000 1200.000000 3.080000 -118.258012 34.162477 2.000000 49.000000 12.000000 2017.000000 60514.0 13262220.0
max 9999.000000 8.000000 4490.000000 8.420561 -72.986200 48.166400 7.000000 52.000000 12.000000 2017.000000 60514.0 13262220.0

In [78]:
dset = losangeles
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.392063492063492, 7.4645257654966404]
valid rent range: [500.0, 9510.0]
valid sqft range: [300.0, 4000.0]

In [79]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
losangeles_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(losangeles_filtered)


Out[79]:
150355

In [80]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=losangeles_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.440
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                 2.233e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:24:08   Log-Likelihood:                -20025.
No. Observations:               85290   AIC:                         4.006e+04
Df Residuals:                   85286   BIC:                         4.009e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.7641      0.031     88.693      0.000         2.703     2.825
np.log(sqft)     0.7288      0.005    143.996      0.000         0.719     0.739
bedrooms        -0.1250      0.002    -61.465      0.000        -0.129    -0.121
bathrooms        0.1347      0.003     51.602      0.000         0.130     0.140
==============================================================================
Omnibus:                     2541.837   Durbin-Watson:                   1.889
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6812.680
Skew:                          -0.005   Prob(JB):                         0.00
Kurtosis:                       4.385   Cond. No.                         220.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [81]:
seattle = rents[rents['region']=='seattle']
seattle.describe()


Out[81]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 160932.000000 160932.000000 160932.000000 160932.000000 160932.000000 160932.000000 98920.000000 160932.000000 160932.000000 160932.000000 160932.0 160932.0
mean 1747.559665 1.594207 921.990934 2.074973 -122.307227 47.551757 1.383800 25.065848 6.170327 2016.552606 71273.0 3671478.0
std 649.299259 0.932514 419.891009 0.781669 0.412373 0.339398 0.554367 21.737498 4.978763 0.497226 0.0 0.0
min 90.000000 0.000000 216.000000 0.075000 -139.877930 28.395700 0.000000 1.000000 1.000000 2016.000000 71273.0 3671478.0
25% 1315.000000 1.000000 669.000000 1.491818 -122.356771 47.446500 1.000000 5.000000 2.000000 2016.000000 71273.0 3671478.0
50% 1604.000000 2.000000 850.000000 1.882083 -122.313255 47.615191 1.000000 10.000000 3.000000 2017.000000 71273.0 3671478.0
75% 1995.000000 2.000000 1050.000000 2.625000 -122.209637 47.673004 2.000000 49.000000 12.000000 2017.000000 71273.0 3671478.0
max 9000.000000 8.000000 4488.000000 8.064516 -81.466600 49.032579 8.000000 52.000000 12.000000 2017.000000 71273.0 3671478.0

In [82]:
dset = seattle
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.45000000000000001, 5.0374531835205998]
valid rent range: [620.0, 5500.0]
valid sqft range: [247.0, 3500.0]

In [83]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
seattle_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(seattle_filtered)


Out[83]:
159209

In [84]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=seattle_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     9787.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:29:41   Log-Likelihood:                -15411.
No. Observations:               97975   AIC:                         3.083e+04
Df Residuals:                   97971   BIC:                         3.087e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.1082      0.032    128.447      0.000         4.045     4.171
np.log(sqft)     0.4893      0.005     93.209      0.000         0.479     0.500
bedrooms        -0.1241      0.002    -61.453      0.000        -0.128    -0.120
bathrooms        0.1444      0.002     58.823      0.000         0.140     0.149
==============================================================================
Omnibus:                      306.797   Durbin-Watson:                   1.933
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              278.157
Skew:                           0.093   Prob(JB):                     3.97e-61
Kurtosis:                       2.817   Cond. No.                         257.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [85]:
chicago = rents[rents['region']=='chicago']
chicago.describe()


Out[85]:
rent bedrooms sqft rent_sqft longitude latitude bathrooms week month year median_income population
count 64279.000000 64279.000000 64279.000000 64279.000000 64279.000000 64279.000000 36749.000000 64279.000000 64279.000000 64279.000000 64279.0 64279.0
mean 1747.348310 1.696884 1029.364458 1.824347 -87.739262 41.890310 1.361302 27.553524 6.657166 2016.488713 61598.0 9554598.0
std 831.031145 1.001307 443.002202 0.739249 0.584812 0.344643 0.561647 22.254252 5.103591 0.499876 0.0 0.0
min 90.000000 0.000000 220.000000 0.100000 -118.546343 25.922420 0.000000 1.000000 1.000000 2016.000000 61598.0 9554598.0
25% 1195.000000 1.000000 735.000000 1.241667 -87.732006 41.870957 1.000000 4.000000 1.000000 2016.000000 61598.0 9554598.0
50% 1595.000000 2.000000 963.000000 1.681818 -87.656049 41.899132 1.000000 45.000000 11.000000 2016.000000 61598.0 9554598.0
75% 2100.000000 2.000000 1200.000000 2.355114 -87.630654 41.954300 2.000000 49.000000 12.000000 2017.000000 61598.0 9554598.0
max 9950.000000 8.000000 4411.000000 8.150000 -73.956600 45.376717 5.500000 52.000000 12.000000 2017.000000 61598.0 9554598.0

In [86]:
dset = chicago
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])


valid rent_sqft range: [0.33333333333333298, 4.3046153846153796]
valid rent range: [450.0, 5900.0]
valid sqft range: [275.0, 3400.0]

In [87]:
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
chicago_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(chicago_filtered)


Out[87]:
63508

In [88]:
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=chicago_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.325
Method:                 Least Squares   F-statistic:                     5818.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:30:12   Log-Likelihood:                -12975.
No. Observations:               36324   AIC:                         2.596e+04
Df Residuals:                   36320   BIC:                         2.599e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        3.2384      0.053     61.129      0.000         3.135     3.342
np.log(sqft)     0.5901      0.009     68.650      0.000         0.573     0.607
bedrooms        -0.1307      0.003    -40.895      0.000        -0.137    -0.124
bathrooms        0.2336      0.004     54.836      0.000         0.225     0.242
==============================================================================
Omnibus:                      313.020   Durbin-Watson:                   1.669
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              321.056
Skew:                          -0.230   Prob(JB):                     1.92e-70
Kurtosis:                       2.986   Cond. No.                         216.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [ ]: