notebook.community

Edit and run



In [1]:

    
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
import matplotlib.mlab as mlab
from scipy.stats import pearsonr, ttest_rel
%matplotlib inline



In [4]:

    
store = pd.HDFStore('data/filtered_listings.h5')
rents = store['rents']



In [53]:

    
rents['y17jan'] = rents['month']==1
rents['y17feb'] = rents['month']==2
rents['y17mar'] = rents['month']==3



In [54]:

    
rents.describe()









    Out[54]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      2.908942e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
      1.726125e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
      2.908942e+06
    
    
      mean
      1.493998e+03
      1.754201e+00
      1.004015e+03
      1.595298e+00
      -9.722085e+01
      3.671698e+01
      1.473283e+00
      2.580334e+01
      6.328678e+00
      2.016533e+03
      6.144500e+04
      4.517404e+06
    
    
      std
      8.623287e+02
      9.353790e-01
      4.319104e+02
      8.620291e-01
      1.728631e+01
      5.212311e+00
      5.973946e-01
      2.187013e+01
      5.013247e+00
      4.989424e-01
      1.145150e+04
      3.539324e+06
    
    
      min
      7.800000e+01
      0.000000e+00
      2.160000e+02
      6.198347e-02
      -1.596241e+02
      -8.450782e+01
      0.000000e+00
      1.000000e+00
      1.000000e+00
      2.016000e+03
      4.342300e+04
      3.757510e+05
    
    
      25%
      9.200000e+02
      1.000000e+00
      7.250000e+02
      9.961538e-01
      -1.153218e+02
      3.325834e+01
      1.000000e+00
      5.000000e+00
      1.000000e+00
      2.016000e+03
      5.268900e+04
      2.069681e+06
    
    
      50%
      1.291000e+03
      2.000000e+00
      9.250000e+02
      1.352273e+00
      -9.556070e+01
      3.626040e+01
      1.000000e+00
      1.000000e+01
      3.000000e+00
      2.017000e+03
      6.007200e+04
      3.263431e+06
    
    
      75%
      1.795000e+03
      2.000000e+00
      1.158000e+03
      1.950000e+00
      -8.128679e+01
      3.989770e+01
      2.000000e+00
      4.900000e+01
      1.200000e+01
      2.017000e+03
      6.687000e+04
      6.033737e+06
    
    
      max
      9.999000e+03
      9.000000e+00
      4.498000e+03
      8.431373e+00
      2.317761e+00
      1.653830e+02
      8.000000e+00
      5.200000e+01
      1.200000e+01
      2.017000e+03
      9.119300e+04
      2.009288e+07



In [57]:

    
sfbay = rents[rents['region']=='sfbay']
sfbay.describe()









    Out[57]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      128379.000000
      128379.000000
      128379.000000
      128379.000000
      128379.000000
      128379.000000
      66682.000000
      128379.000000
      128379.000000
      128379.000000
      128379.0
      128379.0
    
    
      mean
      2862.479152
      1.763474
      1011.005881
      3.050018
      -122.129394
      37.636303
      1.489487
      28.248787
      6.897577
      2016.489208
      80600.0
      8607423.0
    
    
      std
      1120.002811
      0.972110
      451.229335
      1.033937
      0.863566
      0.334485
      0.613397
      21.131186
      4.877076
      0.499885
      0.0
      0.0
    
    
      min
      100.000000
      0.000000
      216.000000
      0.075000
      -124.031700
      23.152315
      0.000000
      1.000000
      1.000000
      2016.000000
      80600.0
      8607423.0
    
    
      25%
      2145.000000
      1.000000
      715.000000
      2.400000
      -122.328508
      37.376501
      1.000000
      8.000000
      2.000000
      2016.000000
      80600.0
      8607423.0
    
    
      50%
      2600.000000
      2.000000
      922.000000
      2.880435
      -122.072385
      37.627998
      1.000000
      46.000000
      11.000000
      2016.000000
      80600.0
      8607423.0
    
    
      75%
      3321.000000
      2.000000
      1175.000000
      3.548896
      -121.959339
      37.800253
      2.000000
      49.000000
      12.000000
      2017.000000
      80600.0
      8607423.0
    
    
      max
      9995.000000
      8.000000
      4440.000000
      8.348018
      2.317761
      48.887051
      6.000000
      52.000000
      12.000000
      2017.000000
      80600.0
      8607423.0



In [58]:

    
dset = sfbay
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.51446945337620598, 7.2709163346613499]
valid rent range: [740.0, 8750.0]
valid sqft range: [266.0, 3500.0]



In [59]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
sfbay_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(sfbay_filtered)









    Out[59]:





127046



In [60]:

    
import statsmodels.api as sm
import numpy as np
from patsy import dmatrices
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=sfbay_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                 1.178e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:18:33   Log-Likelihood:                -8872.7
No. Observations:               65988   AIC:                         1.775e+04
Df Residuals:                   65984   BIC:                         1.779e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.2813      0.034    124.822      0.000         4.214     4.349
np.log(sqft)     0.5255      0.006     93.750      0.000         0.515     0.537
bedrooms        -0.0407      0.002    -18.905      0.000        -0.045    -0.036
bathrooms        0.0692      0.003     24.993      0.000         0.064     0.075
==============================================================================
Omnibus:                     2153.014   Durbin-Watson:                   1.886
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5136.880
Skew:                          -0.163   Prob(JB):                         0.00
Kurtosis:                       4.327   Cond. No.                         236.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [61]:

    
detroit = rents[rents['region']=='detroit']
detroit.describe()









    Out[61]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      32713.000000
      32713.000000
      32713.000000
      32713.000000
      32713.000000
      32713.000000
      19846.000000
      32713.000000
      32713.000000
      32713.000000
      32713.0
      32713.0
    
    
      mean
      983.096659
      1.937639
      1018.820530
      1.000261
      -83.205387
      42.454045
      1.334551
      25.245438
      6.185064
      2016.542231
      52462.0
      4296611.0
    
    
      std
      392.477472
      0.859662
      346.534819
      0.342152
      0.868552
      0.272896
      0.509521
      22.119471
      5.085776
      0.498221
      0.0
      0.0
    
    
      min
      79.000000
      0.000000
      240.000000
      0.092402
      -122.593700
      26.287874
      0.000000
      1.000000
      1.000000
      2016.000000
      52462.0
      4296611.0
    
    
      25%
      755.000000
      1.000000
      815.000000
      0.824000
      -83.311592
      42.325125
      1.000000
      4.000000
      1.000000
      2016.000000
      52462.0
      4296611.0
    
    
      50%
      880.000000
      2.000000
      950.000000
      0.938889
      -83.212336
      42.479646
      1.000000
      10.000000
      3.000000
      2017.000000
      52462.0
      4296611.0
    
    
      75%
      1075.000000
      2.000000
      1137.000000
      1.097946
      -83.038800
      42.587700
      1.500000
      49.000000
      12.000000
      2017.000000
      52462.0
      4296611.0
    
    
      max
      6995.000000
      7.000000
      4468.000000
      5.752467
      -68.791800
      44.824200
      6.000000
      52.000000
      12.000000
      2017.000000
      52462.0
      4296611.0



In [62]:

    
dset = detroit
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.33333333333333298, 3.2418952618453898]
valid rent range: [400.0, 3655.0]
valid sqft range: [425.0, 3450.0]



In [63]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
detroit_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(detroit_filtered)









    Out[63]:





32274



In [64]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=detroit_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     3737.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:19:33   Log-Likelihood:                 485.68
No. Observations:               19571   AIC:                            -963.4
Df Residuals:                   19567   BIC:                            -931.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8048      0.065     43.349      0.000         2.678     2.932
np.log(sqft)     0.5784      0.010     55.936      0.000         0.558     0.599
bedrooms        -0.0863      0.003    -27.896      0.000        -0.092    -0.080
bathrooms        0.1688      0.004     38.054      0.000         0.160     0.177
==============================================================================
Omnibus:                     1760.303   Durbin-Watson:                   1.853
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3150.239
Skew:                           0.635   Prob(JB):                         0.00
Kurtosis:                       4.500   Cond. No.                         286.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [65]:

    
denver = rents[rents['region']=='denver']
denver.describe()









    Out[65]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      114153.000000
      114153.000000
      114153.000000
      114153.000000
      114153.000000
      114153.000000
      69617.000000
      114153.000000
      114153.000000
      114153.000000
      114153.0
      114153.0
    
    
      mean
      1486.556981
      1.629848
      964.084509
      1.632192
      -104.958663
      39.711213
      1.430240
      25.843245
      6.346824
      2016.534152
      66870.0
      2754258.0
    
    
      std
      549.367331
      0.875000
      438.035934
      0.435288
      0.492116
      0.208812
      0.594603
      21.862210
      5.004334
      0.498834
      0.0
      0.0
    
    
      min
      79.000000
      0.000000
      220.000000
      0.087004
      -148.924302
      25.468229
      0.000000
      1.000000
      1.000000
      2016.000000
      66870.0
      2754258.0
    
    
      25%
      1143.000000
      1.000000
      700.000000
      1.366120
      -105.020542
      39.654307
      1.000000
      5.000000
      1.000000
      2016.000000
      66870.0
      2754258.0
    
    
      50%
      1370.000000
      2.000000
      865.000000
      1.554455
      -104.968600
      39.710799
      1.000000
      10.000000
      3.000000
      2017.000000
      66870.0
      2754258.0
    
    
      75%
      1698.000000
      2.000000
      1100.000000
      1.832240
      -104.884853
      39.757006
      2.000000
      49.000000
      12.000000
      2017.000000
      66870.0
      2754258.0
    
    
      max
      7900.000000
      6.000000
      4497.000000
      7.531561
      -71.298300
      60.924252
      8.000000
      52.000000
      12.000000
      2017.000000
      66870.0
      2754258.0



In [66]:

    
dset = denver
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.41025641025641002, 3.5729166666666701]
valid rent range: [600.0, 5000.0]
valid sqft range: [300.0, 3727.0]



In [67]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
denver_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(denver_filtered)









    Out[67]:





113023



In [68]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=denver_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                 2.735e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:03   Log-Likelihood:                 10979.
No. Observations:               68950   AIC:                        -2.195e+04
Df Residuals:                   68946   BIC:                        -2.191e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8404      0.027    105.564      0.000         2.788     2.893
np.log(sqft)     0.6507      0.004    147.602      0.000         0.642     0.659
bedrooms        -0.0488      0.002    -28.165      0.000        -0.052    -0.045
bathrooms        0.0538      0.002     24.960      0.000         0.050     0.058
==============================================================================
Omnibus:                     2718.068   Durbin-Watson:                   1.877
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4972.951
Skew:                           0.317   Prob(JB):                         0.00
Kurtosis:                       4.153   Cond. No.                         251.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [69]:

    
newyork = rents[rents['region']=='newyork']
newyork.describe()









    Out[69]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      37388.000000
      37388.000000
      37388.000000
      37388.000000
      37388.000000
      37388.000000
      18535.000000
      37388.000000
      37388.000000
      37388.000000
      37388.0
      37388.0
    
    
      mean
      2695.793249
      1.713518
      942.389670
      3.051730
      -73.856081
      40.810525
      1.248395
      28.774313
      7.047368
      2016.477426
      67066.0
      20092883.0
    
    
      std
      1300.740147
      1.046317
      376.712246
      1.310790
      0.528350
      0.354437
      0.514592
      20.799308
      4.777683
      0.499497
      0.0
      0.0
    
    
      min
      90.000000
      0.000000
      220.000000
      0.069231
      -123.114700
      9.001601
      0.000000
      1.000000
      1.000000
      2016.000000
      67066.0
      20092883.0
    
    
      25%
      1850.000000
      1.000000
      700.000000
      2.058824
      -73.982037
      40.706279
      1.000000
      8.000000
      2.000000
      2016.000000
      67066.0
      20092883.0
    
    
      50%
      2399.000000
      2.000000
      890.000000
      2.774459
      -73.947049
      40.746282
      1.000000
      46.000000
      11.000000
      2016.000000
      67066.0
      20092883.0
    
    
      75%
      3175.000000
      2.000000
      1100.000000
      3.958333
      -73.806137
      40.863300
      1.500000
      48.000000
      12.000000
      2017.000000
      67066.0
      20092883.0
    
    
      max
      9995.000000
      8.000000
      4400.000000
      8.415789
      -68.441734
      45.548600
      8.000000
      52.000000
      12.000000
      2017.000000
      67066.0
      20092883.0



In [70]:

    
dset = newyork
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.57894736842105299, 7.5]
valid rent range: [650.0, 9000.0]
valid sqft range: [300.0, 3100.0]



In [71]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
newyork_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(newyork_filtered)









    Out[71]:





36927



In [72]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=newyork_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.242
Model:                            OLS   Adj. R-squared:                  0.242
Method:                 Least Squares   F-statistic:                     1948.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:59   Log-Likelihood:                -7747.4
No. Observations:               18282   AIC:                         1.550e+04
Df Residuals:                   18278   BIC:                         1.553e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        6.9553      0.074     93.531      0.000         6.810     7.101
np.log(sqft)     0.0577      0.012      4.821      0.000         0.034     0.081
bedrooms         0.1065      0.004     27.776      0.000         0.099     0.114
bathrooms        0.2295      0.007     34.750      0.000         0.217     0.242
==============================================================================
Omnibus:                       88.451   Durbin-Watson:                   1.738
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              119.805
Skew:                          -0.054   Prob(JB):                     9.66e-27
Kurtosis:                       3.381   Cond. No.                         199.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [73]:

    
houston = rents[rents['region']=='houston']
houston.describe()









    Out[73]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      106567.000000
      106567.000000
      106567.000000
      106567.000000
      106567.000000
      106567.000000
      62335.000000
      106567.000000
      106567.000000
      106567.000000
      106567.0
      106567.0
    
    
      mean
      1290.172098
      1.626348
      1014.233740
      1.344060
      -95.456052
      29.811908
      1.434387
      25.829319
      6.328695
      2016.522066
      60072.0
      6490180.0
    
    
      std
      512.344019
      0.827799
      428.149051
      0.458061
      0.493181
      0.329684
      0.611912
      22.055356
      5.034224
      0.499515
      0.0
      0.0
    
    
      min
      85.000000
      0.000000
      225.000000
      0.065238
      -158.178100
      -11.469258
      0.000000
      1.000000
      1.000000
      2016.000000
      60072.0
      6490180.0
    
    
      25%
      918.000000
      1.000000
      720.000000
      1.023529
      -95.556900
      29.723139
      1.000000
      4.000000
      1.000000
      2016.000000
      60072.0
      6490180.0
    
    
      50%
      1200.000000
      2.000000
      940.000000
      1.258268
      -95.447000
      29.747700
      1.000000
      10.000000
      3.000000
      2017.000000
      60072.0
      6490180.0
    
    
      75%
      1500.000000
      2.000000
      1160.000000
      1.610145
      -95.401000
      29.852387
      2.000000
      48.000000
      12.000000
      2017.000000
      60072.0
      6490180.0
    
    
      max
      9500.000000
      7.000000
      4490.000000
      7.073846
      -50.560455
      47.601900
      6.000000
      52.000000
      12.000000
      2017.000000
      60072.0
      6490180.0



In [74]:

    
dset = houston
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.29004221504842298, 3.2615101289134398]
valid rent range: [385.0, 3648.0]
valid sqft range: [420.0, 3763.0]



In [75]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
houston_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(houston_filtered)









    Out[75]:





105437



In [76]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=houston_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                 1.342e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:23:19   Log-Likelihood:                -7892.0
No. Observations:               61753   AIC:                         1.579e+04
Df Residuals:                   61749   BIC:                         1.583e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        0.4165      0.040     10.475      0.000         0.339     0.494
np.log(sqft)     1.0274      0.006    160.648      0.000         1.015     1.040
bedrooms        -0.2141      0.003    -78.028      0.000        -0.220    -0.209
bathrooms        0.0006      0.003      0.194      0.846        -0.005     0.006
==============================================================================
Omnibus:                      166.042   Durbin-Watson:                   1.694
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              196.272
Skew:                           0.066   Prob(JB):                     2.40e-43
Kurtosis:                       3.243   Cond. No.                         265.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [77]:

    
losangeles = rents[rents['region']=='losangeles']
losangeles.describe()









    Out[77]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      152270.000000
      152270.000000
      152270.000000
      152270.000000
      152270.000000
      152270.000000
      86283.000000
      152270.000000
      152270.000000
      152270.000000
      152270.0
      152270.0
    
    
      mean
      2528.276089
      1.624962
      1021.838484
      2.626747
      -118.333378
      34.094184
      1.563066
      26.314730
      6.443140
      2016.528036
      60514.0
      13262220.0
    
    
      std
      1279.880402
      1.012909
      494.005066
      0.973024
      0.566361
      0.228387
      0.680458
      21.513275
      4.948505
      0.499215
      0.0
      0.0
    
    
      min
      84.000000
      0.000000
      217.000000
      0.075000
      -134.527588
      29.770021
      0.000000
      1.000000
      1.000000
      2016.000000
      60514.0
      13262220.0
    
    
      25%
      1725.000000
      1.000000
      700.000000
      1.992857
      -118.451747
      34.026800
      1.000000
      6.000000
      2.000000
      2016.000000
      60514.0
      13262220.0
    
    
      50%
      2200.000000
      2.000000
      928.000000
      2.452771
      -118.366300
      34.073700
      1.000000
      10.000000
      3.000000
      2017.000000
      60514.0
      13262220.0
    
    
      75%
      2899.000000
      2.000000
      1200.000000
      3.080000
      -118.258012
      34.162477
      2.000000
      49.000000
      12.000000
      2017.000000
      60514.0
      13262220.0
    
    
      max
      9999.000000
      8.000000
      4490.000000
      8.420561
      -72.986200
      48.166400
      7.000000
      52.000000
      12.000000
      2017.000000
      60514.0
      13262220.0



In [78]:

    
dset = losangeles
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.392063492063492, 7.4645257654966404]
valid rent range: [500.0, 9510.0]
valid sqft range: [300.0, 4000.0]



In [79]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
losangeles_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(losangeles_filtered)









    Out[79]:





150355



In [80]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=losangeles_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.440
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                 2.233e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:24:08   Log-Likelihood:                -20025.
No. Observations:               85290   AIC:                         4.006e+04
Df Residuals:                   85286   BIC:                         4.009e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.7641      0.031     88.693      0.000         2.703     2.825
np.log(sqft)     0.7288      0.005    143.996      0.000         0.719     0.739
bedrooms        -0.1250      0.002    -61.465      0.000        -0.129    -0.121
bathrooms        0.1347      0.003     51.602      0.000         0.130     0.140
==============================================================================
Omnibus:                     2541.837   Durbin-Watson:                   1.889
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6812.680
Skew:                          -0.005   Prob(JB):                         0.00
Kurtosis:                       4.385   Cond. No.                         220.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [81]:

    
seattle = rents[rents['region']=='seattle']
seattle.describe()









    Out[81]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      160932.000000
      160932.000000
      160932.000000
      160932.000000
      160932.000000
      160932.000000
      98920.000000
      160932.000000
      160932.000000
      160932.000000
      160932.0
      160932.0
    
    
      mean
      1747.559665
      1.594207
      921.990934
      2.074973
      -122.307227
      47.551757
      1.383800
      25.065848
      6.170327
      2016.552606
      71273.0
      3671478.0
    
    
      std
      649.299259
      0.932514
      419.891009
      0.781669
      0.412373
      0.339398
      0.554367
      21.737498
      4.978763
      0.497226
      0.0
      0.0
    
    
      min
      90.000000
      0.000000
      216.000000
      0.075000
      -139.877930
      28.395700
      0.000000
      1.000000
      1.000000
      2016.000000
      71273.0
      3671478.0
    
    
      25%
      1315.000000
      1.000000
      669.000000
      1.491818
      -122.356771
      47.446500
      1.000000
      5.000000
      2.000000
      2016.000000
      71273.0
      3671478.0
    
    
      50%
      1604.000000
      2.000000
      850.000000
      1.882083
      -122.313255
      47.615191
      1.000000
      10.000000
      3.000000
      2017.000000
      71273.0
      3671478.0
    
    
      75%
      1995.000000
      2.000000
      1050.000000
      2.625000
      -122.209637
      47.673004
      2.000000
      49.000000
      12.000000
      2017.000000
      71273.0
      3671478.0
    
    
      max
      9000.000000
      8.000000
      4488.000000
      8.064516
      -81.466600
      49.032579
      8.000000
      52.000000
      12.000000
      2017.000000
      71273.0
      3671478.0



In [82]:

    
dset = seattle
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.45000000000000001, 5.0374531835205998]
valid rent range: [620.0, 5500.0]
valid sqft range: [247.0, 3500.0]



In [83]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
seattle_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(seattle_filtered)









    Out[83]:





159209



In [84]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=seattle_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     9787.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:29:41   Log-Likelihood:                -15411.
No. Observations:               97975   AIC:                         3.083e+04
Df Residuals:                   97971   BIC:                         3.087e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.1082      0.032    128.447      0.000         4.045     4.171
np.log(sqft)     0.4893      0.005     93.209      0.000         0.479     0.500
bedrooms        -0.1241      0.002    -61.453      0.000        -0.128    -0.120
bathrooms        0.1444      0.002     58.823      0.000         0.140     0.149
==============================================================================
Omnibus:                      306.797   Durbin-Watson:                   1.933
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              278.157
Skew:                           0.093   Prob(JB):                     3.97e-61
Kurtosis:                       2.817   Cond. No.                         257.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [85]:

    
chicago = rents[rents['region']=='chicago']
chicago.describe()









    Out[85]:






  
    
      
      rent
      bedrooms
      sqft
      rent_sqft
      longitude
      latitude
      bathrooms
      week
      month
      year
      median_income
      population
    
  
  
    
      count
      64279.000000
      64279.000000
      64279.000000
      64279.000000
      64279.000000
      64279.000000
      36749.000000
      64279.000000
      64279.000000
      64279.000000
      64279.0
      64279.0
    
    
      mean
      1747.348310
      1.696884
      1029.364458
      1.824347
      -87.739262
      41.890310
      1.361302
      27.553524
      6.657166
      2016.488713
      61598.0
      9554598.0
    
    
      std
      831.031145
      1.001307
      443.002202
      0.739249
      0.584812
      0.344643
      0.561647
      22.254252
      5.103591
      0.499876
      0.0
      0.0
    
    
      min
      90.000000
      0.000000
      220.000000
      0.100000
      -118.546343
      25.922420
      0.000000
      1.000000
      1.000000
      2016.000000
      61598.0
      9554598.0
    
    
      25%
      1195.000000
      1.000000
      735.000000
      1.241667
      -87.732006
      41.870957
      1.000000
      4.000000
      1.000000
      2016.000000
      61598.0
      9554598.0
    
    
      50%
      1595.000000
      2.000000
      963.000000
      1.681818
      -87.656049
      41.899132
      1.000000
      45.000000
      11.000000
      2016.000000
      61598.0
      9554598.0
    
    
      75%
      2100.000000
      2.000000
      1200.000000
      2.355114
      -87.630654
      41.954300
      2.000000
      49.000000
      12.000000
      2017.000000
      61598.0
      9554598.0
    
    
      max
      9950.000000
      8.000000
      4411.000000
      8.150000
      -73.956600
      45.376717
      5.500000
      52.000000
      12.000000
      2017.000000
      61598.0
      9554598.0



In [86]:

    
dset = chicago
upper_percentile = 0.998
lower_percentile = 0.002

# how many rows would be within the upper and lower percentiles?
upper = int(len(dset) * upper_percentile)
lower = int(len(dset) * lower_percentile)

# get the rent/sqft values at the upper and lower percentiles
rent_sqft_sorted = dset['rent_sqft'].sort_values(ascending=True, inplace=False)
upper_rent_sqft = rent_sqft_sorted.iloc[upper]
lower_rent_sqft = rent_sqft_sorted.iloc[lower]

# get the rent values at the upper and lower percentiles
rent_sorted = dset['rent'].sort_values(ascending=True, inplace=False)
upper_rent = rent_sorted.iloc[upper]
lower_rent = rent_sorted.iloc[lower]

# get the sqft values at the upper and lower percentiles
sqft_sorted = dset['sqft'].sort_values(ascending=True, inplace=False)
upper_sqft = sqft_sorted.iloc[upper]
lower_sqft = sqft_sorted.iloc[lower]

print('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft])
print('valid rent range:', [lower_rent, upper_rent])
print('valid sqft range:', [lower_sqft, upper_sqft])









    



valid rent_sqft range: [0.33333333333333298, 4.3046153846153796]
valid rent range: [450.0, 5900.0]
valid sqft range: [275.0, 3400.0]



In [87]:

    
# create a boolean vector mask to filter out any rows with rent_sqft outside of the reasonable values
rent_sqft_mask = (dset['rent_sqft'] > lower_rent_sqft) & (dset['rent_sqft'] < upper_rent_sqft)

# create boolean vector masks to filter out any rows with rent or sqft outside of the reasonable values
rent_mask = (dset['rent'] > lower_rent) & (dset['rent'] < upper_rent)
sqft_mask = (dset['sqft'] > lower_sqft) & (dset['sqft'] < upper_sqft)

# filter the thorough listings according to these masks
chicago_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
len(chicago_filtered)









    Out[87]:





63508



In [88]:

    
y, X = dmatrices('np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 ', 
                 data=chicago_filtered, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
residuals = res.resid
predicted = res.fittedvalues
observed = y
print(res.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:           np.log(rent)   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.325
Method:                 Least Squares   F-statistic:                     5818.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:30:12   Log-Likelihood:                -12975.
No. Observations:               36324   AIC:                         2.596e+04
Df Residuals:                   36320   BIC:                         2.599e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        3.2384      0.053     61.129      0.000         3.135     3.342
np.log(sqft)     0.5901      0.009     68.650      0.000         0.573     0.607
bedrooms        -0.1307      0.003    -40.895      0.000        -0.137    -0.124
bathrooms        0.2336      0.004     54.836      0.000         0.225     0.242
==============================================================================
Omnibus:                      313.020   Durbin-Watson:                   1.669
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              321.056
Skew:                          -0.230   Prob(JB):                     1.92e-70
Kurtosis:                       2.986   Cond. No.                         216.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.



In [ ]:

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	2.908942e+06	2.908942e+06	2.908942e+06	2.908942e+06	2.908942e+06	2.908942e+06	1.726125e+06	2.908942e+06	2.908942e+06	2.908942e+06	2.908942e+06	2.908942e+06
mean	1.493998e+03	1.754201e+00	1.004015e+03	1.595298e+00	-9.722085e+01	3.671698e+01	1.473283e+00	2.580334e+01	6.328678e+00	2.016533e+03	6.144500e+04	4.517404e+06
std	8.623287e+02	9.353790e-01	4.319104e+02	8.620291e-01	1.728631e+01	5.212311e+00	5.973946e-01	2.187013e+01	5.013247e+00	4.989424e-01	1.145150e+04	3.539324e+06
min	7.800000e+01	0.000000e+00	2.160000e+02	6.198347e-02	-1.596241e+02	-8.450782e+01	0.000000e+00	1.000000e+00	1.000000e+00	2.016000e+03	4.342300e+04	3.757510e+05
25%	9.200000e+02	1.000000e+00	7.250000e+02	9.961538e-01	-1.153218e+02	3.325834e+01	1.000000e+00	5.000000e+00	1.000000e+00	2.016000e+03	5.268900e+04	2.069681e+06
50%	1.291000e+03	2.000000e+00	9.250000e+02	1.352273e+00	-9.556070e+01	3.626040e+01	1.000000e+00	1.000000e+01	3.000000e+00	2.017000e+03	6.007200e+04	3.263431e+06
75%	1.795000e+03	2.000000e+00	1.158000e+03	1.950000e+00	-8.128679e+01	3.989770e+01	2.000000e+00	4.900000e+01	1.200000e+01	2.017000e+03	6.687000e+04	6.033737e+06
max	9.999000e+03	9.000000e+00	4.498000e+03	8.431373e+00	2.317761e+00	1.653830e+02	8.000000e+00	5.200000e+01	1.200000e+01	2.017000e+03	9.119300e+04	2.009288e+07

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	128379.000000	128379.000000	128379.000000	128379.000000	128379.000000	128379.000000	66682.000000	128379.000000	128379.000000	128379.000000	128379.0	128379.0
mean	2862.479152	1.763474	1011.005881	3.050018	-122.129394	37.636303	1.489487	28.248787	6.897577	2016.489208	80600.0	8607423.0
std	1120.002811	0.972110	451.229335	1.033937	0.863566	0.334485	0.613397	21.131186	4.877076	0.499885	0.0	0.0
min	100.000000	0.000000	216.000000	0.075000	-124.031700	23.152315	0.000000	1.000000	1.000000	2016.000000	80600.0	8607423.0
25%	2145.000000	1.000000	715.000000	2.400000	-122.328508	37.376501	1.000000	8.000000	2.000000	2016.000000	80600.0	8607423.0
50%	2600.000000	2.000000	922.000000	2.880435	-122.072385	37.627998	1.000000	46.000000	11.000000	2016.000000	80600.0	8607423.0
75%	3321.000000	2.000000	1175.000000	3.548896	-121.959339	37.800253	2.000000	49.000000	12.000000	2017.000000	80600.0	8607423.0
max	9995.000000	8.000000	4440.000000	8.348018	2.317761	48.887051	6.000000	52.000000	12.000000	2017.000000	80600.0	8607423.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	32713.000000	32713.000000	32713.000000	32713.000000	32713.000000	32713.000000	19846.000000	32713.000000	32713.000000	32713.000000	32713.0	32713.0
mean	983.096659	1.937639	1018.820530	1.000261	-83.205387	42.454045	1.334551	25.245438	6.185064	2016.542231	52462.0	4296611.0
std	392.477472	0.859662	346.534819	0.342152	0.868552	0.272896	0.509521	22.119471	5.085776	0.498221	0.0	0.0
min	79.000000	0.000000	240.000000	0.092402	-122.593700	26.287874	0.000000	1.000000	1.000000	2016.000000	52462.0	4296611.0
25%	755.000000	1.000000	815.000000	0.824000	-83.311592	42.325125	1.000000	4.000000	1.000000	2016.000000	52462.0	4296611.0
50%	880.000000	2.000000	950.000000	0.938889	-83.212336	42.479646	1.000000	10.000000	3.000000	2017.000000	52462.0	4296611.0
75%	1075.000000	2.000000	1137.000000	1.097946	-83.038800	42.587700	1.500000	49.000000	12.000000	2017.000000	52462.0	4296611.0
max	6995.000000	7.000000	4468.000000	5.752467	-68.791800	44.824200	6.000000	52.000000	12.000000	2017.000000	52462.0	4296611.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	114153.000000	114153.000000	114153.000000	114153.000000	114153.000000	114153.000000	69617.000000	114153.000000	114153.000000	114153.000000	114153.0	114153.0
mean	1486.556981	1.629848	964.084509	1.632192	-104.958663	39.711213	1.430240	25.843245	6.346824	2016.534152	66870.0	2754258.0
std	549.367331	0.875000	438.035934	0.435288	0.492116	0.208812	0.594603	21.862210	5.004334	0.498834	0.0	0.0
min	79.000000	0.000000	220.000000	0.087004	-148.924302	25.468229	0.000000	1.000000	1.000000	2016.000000	66870.0	2754258.0
25%	1143.000000	1.000000	700.000000	1.366120	-105.020542	39.654307	1.000000	5.000000	1.000000	2016.000000	66870.0	2754258.0
50%	1370.000000	2.000000	865.000000	1.554455	-104.968600	39.710799	1.000000	10.000000	3.000000	2017.000000	66870.0	2754258.0
75%	1698.000000	2.000000	1100.000000	1.832240	-104.884853	39.757006	2.000000	49.000000	12.000000	2017.000000	66870.0	2754258.0
max	7900.000000	6.000000	4497.000000	7.531561	-71.298300	60.924252	8.000000	52.000000	12.000000	2017.000000	66870.0	2754258.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	37388.000000	37388.000000	37388.000000	37388.000000	37388.000000	37388.000000	18535.000000	37388.000000	37388.000000	37388.000000	37388.0	37388.0
mean	2695.793249	1.713518	942.389670	3.051730	-73.856081	40.810525	1.248395	28.774313	7.047368	2016.477426	67066.0	20092883.0
std	1300.740147	1.046317	376.712246	1.310790	0.528350	0.354437	0.514592	20.799308	4.777683	0.499497	0.0	0.0
min	90.000000	0.000000	220.000000	0.069231	-123.114700	9.001601	0.000000	1.000000	1.000000	2016.000000	67066.0	20092883.0
25%	1850.000000	1.000000	700.000000	2.058824	-73.982037	40.706279	1.000000	8.000000	2.000000	2016.000000	67066.0	20092883.0
50%	2399.000000	2.000000	890.000000	2.774459	-73.947049	40.746282	1.000000	46.000000	11.000000	2016.000000	67066.0	20092883.0
75%	3175.000000	2.000000	1100.000000	3.958333	-73.806137	40.863300	1.500000	48.000000	12.000000	2017.000000	67066.0	20092883.0
max	9995.000000	8.000000	4400.000000	8.415789	-68.441734	45.548600	8.000000	52.000000	12.000000	2017.000000	67066.0	20092883.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	106567.000000	106567.000000	106567.000000	106567.000000	106567.000000	106567.000000	62335.000000	106567.000000	106567.000000	106567.000000	106567.0	106567.0
mean	1290.172098	1.626348	1014.233740	1.344060	-95.456052	29.811908	1.434387	25.829319	6.328695	2016.522066	60072.0	6490180.0
std	512.344019	0.827799	428.149051	0.458061	0.493181	0.329684	0.611912	22.055356	5.034224	0.499515	0.0	0.0
min	85.000000	0.000000	225.000000	0.065238	-158.178100	-11.469258	0.000000	1.000000	1.000000	2016.000000	60072.0	6490180.0
25%	918.000000	1.000000	720.000000	1.023529	-95.556900	29.723139	1.000000	4.000000	1.000000	2016.000000	60072.0	6490180.0
50%	1200.000000	2.000000	940.000000	1.258268	-95.447000	29.747700	1.000000	10.000000	3.000000	2017.000000	60072.0	6490180.0
75%	1500.000000	2.000000	1160.000000	1.610145	-95.401000	29.852387	2.000000	48.000000	12.000000	2017.000000	60072.0	6490180.0
max	9500.000000	7.000000	4490.000000	7.073846	-50.560455	47.601900	6.000000	52.000000	12.000000	2017.000000	60072.0	6490180.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	152270.000000	152270.000000	152270.000000	152270.000000	152270.000000	152270.000000	86283.000000	152270.000000	152270.000000	152270.000000	152270.0	152270.0
mean	2528.276089	1.624962	1021.838484	2.626747	-118.333378	34.094184	1.563066	26.314730	6.443140	2016.528036	60514.0	13262220.0
std	1279.880402	1.012909	494.005066	0.973024	0.566361	0.228387	0.680458	21.513275	4.948505	0.499215	0.0	0.0
min	84.000000	0.000000	217.000000	0.075000	-134.527588	29.770021	0.000000	1.000000	1.000000	2016.000000	60514.0	13262220.0
25%	1725.000000	1.000000	700.000000	1.992857	-118.451747	34.026800	1.000000	6.000000	2.000000	2016.000000	60514.0	13262220.0
50%	2200.000000	2.000000	928.000000	2.452771	-118.366300	34.073700	1.000000	10.000000	3.000000	2017.000000	60514.0	13262220.0
75%	2899.000000	2.000000	1200.000000	3.080000	-118.258012	34.162477	2.000000	49.000000	12.000000	2017.000000	60514.0	13262220.0
max	9999.000000	8.000000	4490.000000	8.420561	-72.986200	48.166400	7.000000	52.000000	12.000000	2017.000000	60514.0	13262220.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	160932.000000	160932.000000	160932.000000	160932.000000	160932.000000	160932.000000	98920.000000	160932.000000	160932.000000	160932.000000	160932.0	160932.0
mean	1747.559665	1.594207	921.990934	2.074973	-122.307227	47.551757	1.383800	25.065848	6.170327	2016.552606	71273.0	3671478.0
std	649.299259	0.932514	419.891009	0.781669	0.412373	0.339398	0.554367	21.737498	4.978763	0.497226	0.0	0.0
min	90.000000	0.000000	216.000000	0.075000	-139.877930	28.395700	0.000000	1.000000	1.000000	2016.000000	71273.0	3671478.0
25%	1315.000000	1.000000	669.000000	1.491818	-122.356771	47.446500	1.000000	5.000000	2.000000	2016.000000	71273.0	3671478.0
50%	1604.000000	2.000000	850.000000	1.882083	-122.313255	47.615191	1.000000	10.000000	3.000000	2017.000000	71273.0	3671478.0
75%	1995.000000	2.000000	1050.000000	2.625000	-122.209637	47.673004	2.000000	49.000000	12.000000	2017.000000	71273.0	3671478.0
max	9000.000000	8.000000	4488.000000	8.064516	-81.466600	49.032579	8.000000	52.000000	12.000000	2017.000000	71273.0	3671478.0

	rent	bedrooms	sqft	rent_sqft	longitude	latitude	bathrooms	week	month	year	median_income	population
count	64279.000000	64279.000000	64279.000000	64279.000000	64279.000000	64279.000000	36749.000000	64279.000000	64279.000000	64279.000000	64279.0	64279.0
mean	1747.348310	1.696884	1029.364458	1.824347	-87.739262	41.890310	1.361302	27.553524	6.657166	2016.488713	61598.0	9554598.0
std	831.031145	1.001307	443.002202	0.739249	0.584812	0.344643	0.561647	22.254252	5.103591	0.499876	0.0	0.0
min	90.000000	0.000000	220.000000	0.100000	-118.546343	25.922420	0.000000	1.000000	1.000000	2016.000000	61598.0	9554598.0
25%	1195.000000	1.000000	735.000000	1.241667	-87.732006	41.870957	1.000000	4.000000	1.000000	2016.000000	61598.0	9554598.0
50%	1595.000000	2.000000	963.000000	1.681818	-87.656049	41.899132	1.000000	45.000000	11.000000	2016.000000	61598.0	9554598.0
75%	2100.000000	2.000000	1200.000000	2.355114	-87.630654	41.954300	2.000000	49.000000	12.000000	2017.000000	61598.0	9554598.0
max	9950.000000	8.000000	4411.000000	8.150000	-73.956600	45.376717	5.500000	52.000000	12.000000	2017.000000	61598.0	9554598.0