Data prep for estimating models

Sam Maurer, June 2017

Python 3.6


In [88]:
import numpy as np
import pandas as pd
import zipfile

In [ ]:

Load raw CHTS tables

This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the "data" directory.


In [5]:
z = zipfile.ZipFile('../data/caltrans_full_survey.zip')

In [6]:
households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)
len(households)


Out[6]:
42426

In [8]:
persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)
len(persons)


Out[8]:
109113

In [9]:
places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)
len(places)


Out[9]:
460524

In [10]:
activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)
len(activities)


Out[10]:
604711

In [ ]:

Build a master table of census tracts

Generate a table of census tracts in the 9-county Bay Area, for use in destination choice models.


In [86]:
# Suppress scientific notation in the Pandas display output

pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [141]:
# Functions to move back and forth between full numerical tract ID and its components

# TO DO - it would be better to generate ints than floats, but it's not obvious
#         to me how to do this in a way that works smoothly with arrays

def full_tract_id(state_id, county_id, tract_id):
    return state_id * 1e9 + county_id * 1e6 + tract_id

def state_id(full_tract_id):
    return np.floor(full_tract_id / 1e9)

def county_id(full_tract_id):
    _county_tract = np.fmod(full_tract_id, 1e9)
    return np.floor(_county_tract / 1e6)

print(full_tract_id(6, 98, 141414))
print(state_id(6098141414))
print(county_id(6098141414))


6098141414.0
6.0
98.0

In [142]:
# Generate full tract identifiers for the `places` table

places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)

# Replace missing identifiers with NaN's

places.ix[(places.tract_id == 999999) |
          (places.county_id == 999) |
          (places.state_id == 99), 'full_tract_id'] = np.nan

In [122]:
# Generate a master list of census tracts from the `places` table, keeping the
# city name most commonly associated with each tract

tracts = places[['full_tract_id', 'city']].groupby('full_tract_id').\
        agg(lambda x:x.value_counts().index[0])

print(tracts.shape[0])
print(tracts.head())


9097
                     city
full_tract_id            
1015000800       ANNISTON
1101001500     MONTGOMERY
1161400100        SEVILLA
2020001000      ANCHORAGE
2020001100      ANCHORAGE

In [123]:
# Limit to the 9-county San Francisco Bay Area

tracts = tracts[(state_id(tracts.index).isin([6])) & 
                (county_id(tracts.index).\
                 isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()

print(tracts.shape[0])
print(tracts.head())


1583
                   city
full_tract_id          
6001008309      TIJUANA
6001400100     BERKELEY
6001400200      OAKLAND
6001400300      OAKLAND
6001400400      OAKLAND

In [116]:
print(county_id(tracts.index).value_counts())


85    371
1     360
13    207
75    195
81    158
97     99
95     97
41     55
55     41
Name: full_tract_id, dtype: int64

In [ ]:

Calculate some tract-level covariates

Residential density, school/employment density


In [98]:
# Note: the `home_tract_id` in the households table is already a full 11-digit
# identifier, with the same format that we generated for the places table.
# Same with `empl_tract_id` and `school_tract_id` in the persons table.

In [120]:
# Residential density = sum of weighted household sizes by census tract of home

households['_weighted_persons_count'] = households.persons_count * households.hhwgt

home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\
        rename('home_density').to_frame()

In [109]:
# Employment density = sum of person weights by census tract of work location

work_density = persons.groupby('empl_tract_id').perwgt.sum().\
        rename('work_density').to_frame()

In [124]:
# School density = sum of person weights by census tract of school location

school_density = persons.groupby('school_tract_id').perwgt.sum().\
        rename('school_density').to_frame()

In [125]:
# Merge these into the census tracts table, only keeping Bay Area tracts

tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)
tracts = pd.merge(tracts, work_density, how='left', left_index=True, right_index=True)
tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)
tracts = tracts.fillna(0)  # fill missing values with zero

print(tracts.head())


                   city  home_density  work_density  school_density
full_tract_id                                                      
6001008309      TIJUANA             0             0               0
6001400100     BERKELEY            13            13              14
6001400200      OAKLAND            11             4               1
6001400300      OAKLAND            29             8               0
6001400400      OAKLAND            17             4               8

In [ ]:

Generate a table of trips

For now, this is a table of places visited for non-school, non-work activities


In [ ]:
# - trip destinations are in `places.full_tract_id` (sometimes missing)
# - trip purposes are in `activities.purpose`, and we want 23 thru 38
# - places and acitivities are linked by `sampno`, `perno`, `plano`, and there 
#   can be multiple activities per place

In [126]:
# Function to generate a single unique ID for places

def place_id(sampno, perno, plano):
    return sampno * 1e4 + perno * 1e2 + plano

print(place_id(1041766, 3, 12))


10417660312.0

In [127]:
# Add place_id to places table and activities table

places['place_id'] = place_id(places.sampno, places.perno, places.plano)
activities['place_id'] = place_id(activities.sampno, activities.perno, activities.plano)

In [131]:
# Get list of places that have a secondary activity

_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),
                                            'place_id'].drop_duplicates()

In [144]:
# Generate a table of those places with some covariates

trips = places.loc[places.place_id.isin(_secondary_activity_places) &
                   places.full_tract_id.notnull(),
                  ['place_id', 'full_tract_id', 'mode', 
                   'trip_distance_miles']].set_index('place_id')

print(trips.shape[0])
print(trips.head())


145993
             full_tract_id  mode  trip_distance_miles
place_id                                             
10319850102     6095252108     6                   13
10319850202     6095251902     5                    5
10320360102     6073017051     5                    4
10320360104     6073009304     5                   19
10320360105     6073008511     5                    6

In [145]:
# Limit to destinations in the 9-county San Francisco Bay Area

trips = trips[(state_id(trips.full_tract_id).isin([6])) & 
              (county_id(trips.full_tract_id).\
               isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()

print(trips.shape[0])
print(trips.head())


36765
             full_tract_id  mode  trip_distance_miles
place_id                                             
10319850102     6095252108     6                   13
10319850202     6095251902     5                    5
10335860102     6085511915     6                  156
10335860103     6085512027     6                    2
10335860104     6085512027     6                    0

In [ ]:

Save estimaton data to disk


In [134]:
tracts.to_csv('../data/tracts.csv')

In [146]:
trips.to_csv('../data/trips.csv')

In [ ]:


In [ ]: