In [1]:
import numpy as np, pandas as pd, os
from synthicity.utils import misc
from drcog.models import regression_model_estimation, choice_model_estimation, dataset
# Open the DRCOG HDF5 store found in the project data directory.
dset = dataset.DRCOGDataset(
    os.path.join(misc.data_dir(), 'drcog.h5')
)

# Seed numpy's global RNG so the random coordinate draws further down
# are repeatable from one run to the next.
np.random.seed(1)

## Variable library: compute the derived variables on the dataset
from drcog.variables import variable_library
variable_library.calculate_variables(dset)

# Buildings: keep only the attribute columns listed here.
buildings = dset.fetch('buildings')[[
    'building_type_id', 'improvement_value', 'land_area',
    'non_residential_sqft', 'parcel_id', 'residential_units',
    'sqft_per_unit', 'stories', 'tax_exempt', 'year_built',
    'bldg_sq_ft', 'unit_price_non_residential', 'unit_price_residential',
    'building_sqft_per_job', 'non_residential_units', 'base_year_jobs',
    'all_units',
]]

# Establishments and households: remove their own zone_id / county_id columns
# in place (presumably so they do not clash with the columns that arrive via
# the building/parcel merges further down -- TODO confirm).
establishments = dset.fetch('establishments')
for geo_col in ('zone_id', 'county_id'):
    del establishments[geo_col]
households = dset.fetch('households')
for geo_col in ('zone_id', 'county_id'):
    del households[geo_col]

# Parcels, tagged with their urban-center id. The assignment aligns on the
# index, so parcels absent from parcels_urbancen get NaN.
parcels = dset.fetch('parcels')
parcels_urbancen = dset.store.parcels_urbancen.set_index('parcel_id')
parcels['urbancenter_id'] = parcels_urbancen['urban_cen']

# pz: parcels left-joined to zones, indexed by parcel_id.
zones = dset.fetch('zones')
pz = pd.merge(
    parcels.reset_index(), zones,
    left_on='zone_id', right_index=True, how='left',
).set_index('parcel_id')

# bpz: buildings joined to their parcel + zone attributes. This uses the
# buildings table as fetched, i.e. before it is merged with parcels below.
bpz = pd.merge(buildings, pz, left_on='parcel_id', right_index=True)

## Attach parcel attributes to each building (default inner join on parcel_id)
buildings = buildings.merge(parcels, left_on='parcel_id', right_index=True)

## Attach building/parcel attributes to each household
households = households.merge(buildings, left_on='building_id', right_index=True)

## Attach building/parcel attributes to each establishment
establishments = establishments.merge(buildings, left_on='building_id', right_index=True)

#####Export household points
# Build one record per household carrying its building's parcel, urban center,
# x/y location, zone id and distance to transit. Households on very large
# parcels are then scattered over randomly drawn parcel coordinates instead of
# all sharing a single point.
hh = households[['building_id']].reset_index()

# Look up the needed bpz attributes for every household's building in a single
# label-based pass (bpz is looked up by building id).
hh_attrs = bpz.loc[hh.building_id.values,
                   ['parcel_id', 'urbancenter_id', 'x', 'y',
                    'external_zone_id', 'dist_rail', 'dist_bus']]
hh['parcel_id'] = hh_attrs['parcel_id'].values
hh['urbancenter_id'] = hh_attrs['urbancenter_id'].values
hh['x'] = hh_attrs['x'].values.astype('int64')
hh['y'] = hh_attrs['y'].values.astype('int64')
hh['taz05_id'] = hh_attrs['external_zone_id'].values
# Distance to the nearer of rail or bus. Dividing by 5280 presumably converts
# feet to miles -- TODO confirm the units of dist_rail / dist_bus.
hh['dist_trans'] = np.minimum(hh_attrs['dist_rail'].values,
                              hh_attrs['dist_bus'].values) / 5280.0

# "Big" parcels: parcel_sqft >= 435600 (10 acres if parcel_sqft is in square feet).
big_parcels = parcels.index.values[(parcels.parcel_sqft >= 435600).values]
big_parcel_ids_with_hh = np.unique(hh.parcel_id[np.in1d(hh.parcel_id, big_parcels)].values)

# Candidate coordinates per parcel; cast to int64 to match hh.x / hh.y.
parcel_coords = dset.parcel_coords
parcel_coords['x'] = parcel_coords['x'].astype('int64')
parcel_coords['y'] = parcel_coords['y'].astype('int64')

for parcel_id in big_parcel_ids_with_hh:
    idx_hh_on_parcel = np.in1d(hh.parcel_id, [parcel_id])
    coords = parcel_coords[parcel_coords.parcel_id == parcel_id]
    # One draw (with replacement) per household on this parcel.
    idx_coord = np.random.choice(coords.index, size=idx_hh_on_parcel.sum(), replace=True)
    # Assign through .loc: the chained form `hh.x[mask] = ...` may write to a
    # temporary copy and is not guaranteed to update hh.
    hh.loc[idx_hh_on_parcel, 'x'] = coords['x'].loc[idx_coord].values
    hh.loc[idx_hh_on_parcel, 'y'] = coords['y'].loc[idx_coord].values


Fetching parcels
Fetching modify_table
Fetching buildings
Fetching establishments
Fetching modify_table
Fetching modify_table
Fetching households_for_estimation
Fetching modify_table
Fetching households
Fetching modify_table
Fetching zones
Fetching modify_table
Fetching travel_data
Fetching modify_table
Fetching parcel_coords
Fetching modify_table

In [2]:
# Summary statistics of the household point table, as a sanity check.
# NOTE(review): in the recorded output the mean of parcel_id and of taz05_id is
# far below the reported min / 25% values, which is not possible for a true
# mean -- possibly a numeric overflow in the aggregation; verify on re-run.
hh.describe()


Out[2]:
household_id building_id parcel_id x y taz05_id dist_trans
count 1160869.000000 1160869.000000 1160869.000000 1160869.000000 1160869.000000 1160869.000000 1160869.000000
mean 580435.179291 473059.299294 1468.339839 3145353.709304 1699498.501644 245.442482 0.931423
std 335114.414025 294924.760137 314365.345610 45183.335625 62198.538595 170944.540782 2.704356
min 1.000000 1.000000 1.000000 2890124.000000 1472247.000000 101010.000000 0.003409
25% 290218.000000 191967.000000 238158.000000 3116973.000000 1661996.000000 211200.000000 0.097727
50% 580435.000000 466724.000000 563020.000000 3145512.000000 1690691.000000 404450.000000 0.185227
75% 870652.000000 711175.000000 776663.000000 3173554.000000 1735880.000000 506240.000000 0.412121
max 1160870.000000 1015771.000000 1128921.000000 3503103.000000 1885594.000000 803120.000000 51.506818

In [ ]:
# Expand establishments into one row per job: an establishment with k
# employees contributes k job records that share its building, home-based
# status and sector.
e = establishments.reset_index()

# Row position of the owning establishment for every job. np.repeat needs
# integer counts; non-positive counts are clipped to 0 so such establishments
# contribute no jobs (the same as iterating over an empty range()).
job_counts = np.maximum(e.employees.values, 0)
row_pos = np.repeat(np.arange(len(e)), job_counts)

# These used to be lists that were appended to without ever being created,
# which raised NameError on a fresh kernel. They are now numpy arrays built
# in one vectorised step.
bids = e.building_id.values[row_pos]
# NOTE(review): after reset_index() e.index is just the row position, so
# establishment_id below is a positional id, not the original establishments
# index -- confirm this is intended.
eids = e.index.values[row_pos]
hbs = e.home_based_status.values[row_pos]
sids = e.sector_id.values[row_pos]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(bids))
print(len(eids))
print(len(hbs))
print(len(sids))

jobs = pd.DataFrame({'job_id':np.arange(1, len(bids) + 1, dtype='int64'),'building_id':bids,'establishment_id':eids,'home_based_status':hbs,'sector_id':sids})
# Locate every job via its building: parcel, urban center and x/y point.
# One label-based lookup into bpz by building id covers all four attributes.
job_attrs = bpz.loc[jobs.building_id.values,
                    ['parcel_id', 'urbancenter_id', 'centroid_x', 'centroid_y']]
jobs['parcel_id'] = job_attrs['parcel_id'].values
jobs['urbancenter_id'] = job_attrs['urbancenter_id'].values
# NOTE(review): jobs are placed at centroid_x / centroid_y whereas the
# household export uses the x / y columns -- confirm the difference is intended.
jobs['x'] = job_attrs['centroid_x'].values.astype('int64')
jobs['y'] = job_attrs['centroid_y'].values.astype('int64')

# Jobs on big parcels are scattered over randomly drawn parcel coordinates
# (big_parcels and parcel_coords come from the household export cell).
big_parcel_ids_with_jobs = np.unique(jobs.parcel_id[np.in1d(jobs.parcel_id, big_parcels)].values)
for parcel_id in big_parcel_ids_with_jobs:
    idx_jobs_on_parcel = np.in1d(jobs.parcel_id, [parcel_id])
    coords = parcel_coords[parcel_coords.parcel_id == parcel_id]
    # One draw (with replacement) per job on this parcel.
    idx_coord = np.random.choice(coords.index, size=idx_jobs_on_parcel.sum(), replace=True)
    # Assign through .loc: the chained form `jobs.x[mask] = ...` may write to a
    # temporary copy and is not guaranteed to update jobs.
    jobs.loc[idx_jobs_on_parcel, 'x'] = coords['x'].loc[idx_coord].values
    jobs.loc[idx_jobs_on_parcel, 'y'] = coords['y'].loc[idx_coord].values

In [ ]:
# Summary statistics of the jobs point table, as a sanity check.
jobs.describe()