In [88]:
import numpy as np
import pandas as pd
import zipfile
In [ ]:
In [5]:
z = zipfile.ZipFile('../data/caltrans_full_survey.zip')
In [6]:
households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)
len(households)
Out[6]:
In [8]:
persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)
len(persons)
Out[8]:
In [9]:
places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)
len(places)
Out[9]:
In [10]:
activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)
len(activities)
Out[10]:
In [ ]:
In [86]:
# Suppress scientific notation in the Pandas display output
pd.set_option('display.float_format', lambda x: '%.0f' % x)
In [141]:
# Functions to move back and forth between full numerical tract ID and its components
# TO DO - it would be better to generate ints than floats, but it's not obvious
# to me how to do this in a way that works smoothly with arrays
def full_tract_id(state_id, county_id, tract_id):
return state_id * 1e9 + county_id * 1e6 + tract_id
def state_id(full_tract_id):
return np.floor(full_tract_id / 1e9)
def county_id(full_tract_id):
_county_tract = np.fmod(full_tract_id, 1e9)
return np.floor(_county_tract / 1e6)
print(full_tract_id(6, 98, 141414))
print(state_id(6098141414))
print(county_id(6098141414))
In [142]:
# Generate full tract identifiers for the `places` table
places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)
# Replace missing identifiers with NaN's
places.ix[(places.tract_id == 999999) |
(places.county_id == 999) |
(places.state_id == 99), 'full_tract_id'] = np.nan
In [122]:
# Generate a master list of census tracts from the `places` table, keeping the
# city name most commonly associated with each tract
tracts = places[['full_tract_id', 'city']].groupby('full_tract_id').\
agg(lambda x:x.value_counts().index[0])
print(tracts.shape[0])
print(tracts.head())
In [123]:
# Limit to the 9-county San Francisco Bay Area
tracts = tracts[(state_id(tracts.index).isin([6])) &
(county_id(tracts.index).\
isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()
print(tracts.shape[0])
print(tracts.head())
In [116]:
print(county_id(tracts.index).value_counts())
In [ ]:
In [98]:
# Note: the `home_tract_id` in the households table is already a full 11-digit
# identifier, with the same format that we generated for the places table.
# Same with `empl_tract_id` and `school_tract_id` in the persons table.
In [120]:
# Residential density = sum of weighted household sizes by census tract of home
households['_weighted_persons_count'] = households.persons_count * households.hhwgt
home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\
rename('home_density').to_frame()
In [109]:
# Employment density = sum of person weights by census tract of work location
work_density = persons.groupby('empl_tract_id').perwgt.sum().\
rename('work_density').to_frame()
In [124]:
# School density = sum of person weights by census tract of school location
school_density = persons.groupby('school_tract_id').perwgt.sum().\
rename('school_density').to_frame()
In [125]:
# Merge these into the census tracts table, only keeping Bay Area tracts
tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)
tracts = pd.merge(tracts, work_density, how='left', left_index=True, right_index=True)
tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)
tracts = tracts.fillna(0) # fill missing values with zero
print(tracts.head())
In [ ]:
In [ ]:
# - trip destinations are in `places.full_tract_id` (sometimes missing)
# - trip purposes are in `activities.purpose`, and we want 23 thru 38
# - places and acitivities are linked by `sampno`, `perno`, `plano`, and there
# can be multiple activities per place
In [126]:
# Function to generate a single unique ID for places
def place_id(sampno, perno, plano):
return sampno * 1e4 + perno * 1e2 + plano
print(place_id(1041766, 3, 12))
In [127]:
# Add place_id to places table and activities table
places['place_id'] = place_id(places.sampno, places.perno, places.plano)
activities['place_id'] = place_id(activities.sampno, activities.perno, activities.plano)
In [131]:
# Get list of places that have a secondary activity
_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),
'place_id'].drop_duplicates()
In [144]:
# Generate a table of those places with some covariates
trips = places.loc[places.place_id.isin(_secondary_activity_places) &
places.full_tract_id.notnull(),
['place_id', 'full_tract_id', 'mode',
'trip_distance_miles']].set_index('place_id')
print(trips.shape[0])
print(trips.head())
In [145]:
# Limit to destinations in the 9-county San Francisco Bay Area
trips = trips[(state_id(trips.full_tract_id).isin([6])) &
(county_id(trips.full_tract_id).\
isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()
print(trips.shape[0])
print(trips.head())
In [ ]:
In [134]:
tracts.to_csv('../data/tracts.csv')
In [146]:
trips.to_csv('../data/trips.csv')
In [ ]:
In [ ]: