In [1]:
import geopandas as gpd
import urbansim.sim.simulation as sim
from spandex import spatialtoolz, rastertoolz, utils
import pandas as pd, numpy as np
import statsmodels.api as sm
from skimage.filter import threshold_otsu  # renamed to skimage.filters in newer scikit-image releases
from sklearn.decomposition import RandomizedPCA  # deprecated in later scikit-learn; PCA(svd_solver='randomized') is the successor
In [2]:
# Connect to database if not already connected.
loader = utils.DataLoader()
In [3]:
# Load Contra Costa parcel geometry from the PostGIS database.
# 'conn' is an open psycopg2 connection to the same database the DataLoader above points at (not shown here).
geodf = gpd.GeoDataFrame.from_postgis('select * from parcels_contra_costa', conn, geom_col='geom')
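A quick sanity check on the load can save debugging later; this sketch just counts rows and prints the coordinate reference system:

In [ ]:
# Sketch: confirm the parcels loaded before doing any raster work.
print len(geodf), 'parcels loaded'
print geodf.crs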
In [4]:
# Load raster imagery (satellite). Ask EJ if you want the sample file.
array, src = rastertoolz.from_geotiff('data/10SEG835970_200804_0x0750m_CL_1.tif')
transform = src.meta['transform']  # affine transform mapping pixel coordinates to map coordinates
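Assuming from_geotiff hands back a rasterio-style dataset (which the src.meta lookup above suggests), a quick look at the array and metadata is a useful sketch:

In [ ]:
# Sketch: inspect the imagery's dimensions and projection.
print array.shape
print src.meta['crs']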
In [5]:
# Compute zonal statistics over each parcel's masked imagery, applying
# threshold_otsu to each parcel's pixel array as an extra per-parcel statistic.
parcel_stats, img = rastertoolz.zonal_stats(geodf, array, transform=transform, copy_properties=True, func=threshold_otsu)
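Assuming zonal_stats returns a list of per-parcel dicts (which the pd.DataFrame(parcel_stats) call below implies), a peek at the first record shows what was computed:

In [ ]:
# Sketch: with copy_properties=True, parcel attributes such as parc_py_id ride along.
print sorted(parcel_stats[0].keys())
print parcel_stats[0]['mean'], parcel_stats[0]['threshold_otsu']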
In [6]:
# Process/normalize the imagery data for each parcel: flatten each parcel's
# pixel array and pad/truncate it to a fixed length so the rows align.
images = []
idx = []
for i in range(len(img)):
    pic = img[i]['img']
    if pic is not None:
        idx.append(img[i]['__fid__'])
        flattened = pic.data.ravel().astype('int32')
        images.append(np.resize(flattened, 45000))
images_array = pd.DataFrame(np.array(images), index=idx)
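Note that np.resize pads short parcels by tiling their pixels and truncates long ones, so every row comes out the same length. A toy sketch:

In [ ]:
# Sketch: np.resize tiles short inputs and truncates long ones.
print np.resize(np.array([1, 2, 3]), 7)   # [1 2 3 1 2 3 1]
print np.resize(np.arange(10), 4)         # [0 1 2 3]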
In [7]:
# Generate additional imagery-based features: reduce each parcel's flattened
# pixel vector to five principal components.
pca = RandomizedPCA(n_components=5)
X = pca.fit_transform(images_array)
df_X = pd.DataFrame(X, index=images_array.index)
df_X.columns = ['component1', 'component2', 'component3', 'component4', 'component5']
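A sketch worth running here: the explained variance ratios show how much of the pixel variance the five components retain.

In [ ]:
# Sketch: per-component and cumulative share of variance captured.
print pca.explained_variance_ratio_
print pca.explained_variance_ratio_.sum()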
In [8]:
# Load parcel attribute data and derive consistently named columns.
# Use codes 10-29 are treated as single-family dwellings (sfd); single-family
# records store square footage and year built in different source columns.
cnc_parcels = spatialtoolz.db_to_df("""SELECT b.use_code, b.tla, b.bldg_sqft, b.yr_hs_blt, b.yr_built, b.imp_val, b.land_value, b.stories, b.units, b.apn, a.parc_py_id
FROM parcels_contra_costa a JOIN parcels_contra_costa_pt b ON a.parc_py_id = b.parc_py_id;""")
cnc_parcels['land_use_code'] = cnc_parcels.use_code.fillna(0).astype('int')
cnc_parcels['sfd'] = 1*(cnc_parcels.land_use_code >= 10)*(cnc_parcels.land_use_code <= 29)
cnc_parcels['building_sqft'] = cnc_parcels.tla*(cnc_parcels.sfd == 1) + cnc_parcels.bldg_sqft*(cnc_parcels.sfd == 0)
cnc_parcels['year_built'] = cnc_parcels.yr_hs_blt.fillna(0).astype('int')*(cnc_parcels.sfd == 1) + cnc_parcels.yr_built.fillna(0).astype('int')*(cnc_parcels.sfd == 0)
cnc_parcels['improvement_value'] = cnc_parcels.imp_val
# Single-family parcels are assumed to have one story and one residential unit.
cnc_parcels['stories'] = 1*(cnc_parcels.sfd == 1) + cnc_parcels.stories*(cnc_parcels.sfd == 0)
cnc_parcels['residential_units'] = 1*(cnc_parcels.sfd == 1) + cnc_parcels.units*(cnc_parcels.sfd == 0)
# land_value and apn already carry their final names, so no renaming is needed.
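Before merging, a quick sketch to sanity-check the derived columns:

In [ ]:
# Sketch: distribution of the single-family flag and the derived square footage.
print cnc_parcels.sfd.value_counts()
print cnc_parcels.building_sqft.describe()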
In [9]:
# Merge the various sources of parcel information into an estimation dataset.
df = pd.DataFrame(parcel_stats)
df_pca = pd.merge(df, df_X, left_on='__fid__', right_index=True)
df_parcels = pd.merge(df_pca, cnc_parcels, on='parc_py_id')
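These are inner joins, so rows can drop silently at each step; a sketch comparing counts:

In [ ]:
# Sketch: compare row counts before and after each merge.
print len(df), len(df_pca), len(df_parcels)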
In [10]:
# The dependent variable is an is_non_residential dummy: 1 for parcels with no residential units.
df_parcels['non_residential'] = 0
df_parcels.loc[df_parcels.residential_units == 0, 'non_residential'] = 1
print df_parcels.non_residential.sum()
In [11]:
# Estimate a logit model of non_residential on the imagery-derived features.
# Note that sm.Logit does not add an intercept; wrap the exog in sm.add_constant(...) if one is wanted.
train_cols = ['calc_area', 'max', 'mean', 'min', 'std', 'threshold_otsu', 'component1', 'component2', 'component3', 'component4', 'component5']
logit = sm.Logit(df_parcels['non_residential'], df_parcels[train_cols].fillna(0))
result = logit.fit()
print result.summary()
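As a rough in-sample check (a sketch; the 0.5 cutoff is an arbitrary choice, not part of the model above), predicted probabilities can be cross-tabulated against the observed dummy:

In [ ]:
# Sketch: threshold the fitted probabilities at 0.5 and compare with the observed labels.
predicted = (result.predict(df_parcels[train_cols].fillna(0)) > 0.5).astype('int')
print pd.crosstab(df_parcels.non_residential, predicted, rownames=['observed'], colnames=['predicted'])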