In [1]:
import geopandas as gpd
import urbansim.sim.simulation as sim
from spandex import spatialtoolz, rastertoolz, utils
import pandas as pd, numpy as np
import statsmodels.api as sm
from skimage.filter import threshold_otsu
from sklearn.decomposition import RandomizedPCA

In [2]:
# Connect to database if not already connected.
loader = utils.DataLoader()

In [3]:
#Load contra costa parcel geometry from postgis db
geodf = gpd.GeoDataFrame.from_postgis('select * from parcels_contra_costa', conn, geom_col='geom')

In [4]:
#Load raster imagery (satellite).  Ask EJ if you want the sample file
array, src = rastertoolz.from_geotiff('data//10SEG835970_200804_0x0750m_CL_1.tif')
transform = src.meta['transform']

In [5]:
#Get the masked array for the imagery of each parcel; apply threshold_otsu function to each parcel's imagery
parcel_stats, img = rastertoolz.zonal_stats(geodf, array, transform = transform, copy_properties=True, func=threshold_otsu)


c:\Anaconda\lib\site-packages\numpy\ma\core.py:3847: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

In [6]:
#Process/normalize the imagery data for each parcel
images = []
idx = []
for i in range(len(img)):
    pic = img[i]['img']
    if pic is not None:
        idx.append(img[i]['__fid__'])
        flattened = pic.data.ravel().astype('int32')
        images.append(np.resize(flattened, 45000))
images_array = pd.DataFrame(np.array(images), index=idx)

In [7]:
#Generate some additional imagery-based features using PCA
pca = RandomizedPCA(n_components=5)
X = pca.fit_transform(images_array)
df_X = pd.DataFrame(X, index=images_array.index)
df_X.columns = ['component1','component2','component3','component4','component5']

In [8]:
#Load parcel attribute data
cnc_parcels = spatialtoolz.db_to_df("""SELECT b.use_code, b.tla, b.bldg_sqft, b.yr_hs_blt, b.yr_built, b.imp_val, b.land_value, b.stories, b.units, b.apn, a.parc_py_id 
FROM  parcels_contra_costa a join parcels_contra_costa_pt b on a.parc_py_id = b.parc_py_id;""")
cnc_parcels['land_use_code'] = cnc_parcels.use_code.fillna(0).astype('int')
cnc_parcels['sfd'] = 1*(cnc_parcels.land_use_code>=10)*(cnc_parcels.land_use_code<=29)
cnc_parcels['building_sqft'] = cnc_parcels.tla*(cnc_parcels.sfd==1) + cnc_parcels.bldg_sqft*(cnc_parcels.sfd==0)
cnc_parcels['year_built'] = cnc_parcels.yr_hs_blt.fillna(0).astype('int')*(cnc_parcels.sfd==1) + cnc_parcels.yr_built.fillna(0).astype('int')*(cnc_parcels.sfd==0)
cnc_parcels['improvement_value'] = cnc_parcels.imp_val
cnc_parcels['land_value'] = cnc_parcels.land_value
cnc_parcels['stories'] = 1*(cnc_parcels.sfd==1) + cnc_parcels.stories*(cnc_parcels.sfd==0)
cnc_parcels['residential_units'] = 1*(cnc_parcels.sfd==1) + cnc_parcels.units*(cnc_parcels.sfd==0)
cnc_parcels['apn'] = cnc_parcels.apn

In [9]:
#Merge the various sources of parcel information into an estimation dataset
df = pd.DataFrame(parcel_stats)
df_pca = pd.merge (df, df_X, left_on='__fid__', right_index=True)
df_parcels = pd.merge(df_pca, cnc_parcels, left_on='parc_py_id', right_on='parc_py_id')

In [10]:
#Dependent variable is an is_non_residential dummy
df_parcels['non_residential'] = 0
df_parcels['non_residential'][df_parcels.residential_units==0] = 1
print df_parcels.non_residential.sum()


191

In [11]:
#Estimate model
train_cols = ['calc_area', 'max', 'mean', 'min', 'std', 'threshold_otsu', 'component1', 'component2','component3','component4','component5']
logit = sm.Logit(df_parcels['non_residential'], df_parcels[train_cols].fillna(0))
result = logit.fit()
print result.summary()


Optimization terminated successfully.
         Current function value: 0.225125
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:        non_residential   No. Observations:                 2110
Model:                          Logit   Df Residuals:                     2099
Method:                           MLE   Df Model:                           10
Date:                Mon, 01 Sep 2014   Pseudo R-squ.:                  0.2588
Time:                        22:08:15   Log-Likelihood:                -475.01
converged:                       True   LL-Null:                       -640.90
                                        LLR p-value:                 2.938e-65
==================================================================================
                     coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------
calc_area          0.0004   4.56e-05      9.687      0.000         0.000     0.001
max               -0.0075      0.004     -2.049      0.040        -0.015    -0.000
mean              -0.0326      0.006     -5.295      0.000        -0.045    -0.021
min               -0.0342      0.022     -1.531      0.126        -0.078     0.010
std                0.0055      0.011      0.514      0.607        -0.015     0.026
threshold_otsu     0.0248      0.005      4.844      0.000         0.015     0.035
component1       6.81e-05   3.01e-05      2.261      0.024      9.07e-06     0.000
component2      6.764e-05   4.88e-05      1.386      0.166      -2.8e-05     0.000
component3     -7.739e-06   6.39e-05     -0.121      0.904        -0.000     0.000
component4      7.508e-05   9.16e-05      0.819      0.413        -0.000     0.000
component5     -5.846e-05      0.000     -0.483      0.629        -0.000     0.000
==================================================================================