In [22]:
%matplotlib inline
import matplotlib
import numpy as np
import pandas as pd
import zipfile
In [2]:
# ../data/README.md explains how to obtain the survey archive.
# The archive handle is kept in `z` because later cells read individual CSV members from it.
z = zipfile.ZipFile(file='../data/caltrans_full_survey.zip', mode='r')
In [7]:
# Load the household table straight out of the zip archive.
# pandas does not close a file object it was handed, so the archive member is
# opened in a `with` block to release it as soon as parsing is finished.
with z.open('caltrans_full_survey/survey_households.csv') as households_csv:
    households = pd.read_csv(households_csv, low_memory=False)
len(households)
Out[7]:
In [8]:
# Restrict the household table to Bay Area residents.
# NOTE(review): the ids look like county FIPS codes for the nine Bay Area counties -- confirm
bay_area_county_ids = [1, 13, 41, 55, 75, 81, 85, 95, 97]
lives_in_bay_area = households.home_county_id.isin(bay_area_county_ids)
households_ba = households[lives_in_bay_area]
len(households_ba)
Out[8]:
In [75]:
# Fifteen most frequent home cities among the Bay Area households
home_city_counts = households_ba['home_city'].value_counts()
home_city_counts.head(15)
Out[75]:
In [40]:
# Summary statistics for the `persons_count` column (all households, not just Bay Area)
households['persons_count'].describe()
Out[40]:
In [41]:
# Summary statistics for the `hhwgt` column
# NOTE(review): presumably the household weight -- confirm against the data dictionary
households['hhwgt'].describe()
Out[41]:
In [42]:
# Summary statistics for the `exphhwgt` column
# NOTE(review): presumably the expanded household weight -- confirm against the data dictionary
households['exphhwgt'].describe()
Out[42]:
In [79]:
# Summary statistics for the households' `home_tract_id` column
households['home_tract_id'].describe()
Out[79]:
In [ ]:
In [3]:
# Load the person table straight out of the zip archive.
# pandas does not close a file object it was handed, so the archive member is
# opened in a `with` block to release it as soon as parsing is finished.
with z.open('caltrans_full_survey/survey_person.csv') as persons_csv:
    persons = pd.read_csv(persons_csv, low_memory=False)
len(persons)
Out[3]:
In [94]:
# Peek at the first ten person records, limited to a handful of columns
person_preview_cols = [
    'sampno', 'perno', 'travel_date', 'gender', 'relation', 'education', 'race1',
]
print(persons[person_preview_cols].head(10))
In [85]:
# Open question: what does `person_trips` measure? Not sure yet, but it looks
# related to the `tripno` field.
persons['person_trips'].describe()
Out[85]:
In [80]:
# Summary statistics for the `empl_tract_id` column of the person table
persons['empl_tract_id'].describe()
Out[80]:
In [47]:
# How many person records have a non-missing `empl_tract_id`?
has_empl_tract = persons['empl_tract_id'].notnull()
has_empl_tract.describe()
Out[47]:
In [81]:
# Summary statistics for the `school_tract_id` column of the person table
persons['school_tract_id'].describe()
Out[81]:
In [49]:
# How many person records have a non-missing `school_tract_id`?
has_school_tract = persons['school_tract_id'].notnull()
has_school_tract.describe()
Out[49]:
In [48]:
# Summary statistics for the `perwgt` column
# NOTE(review): presumably the person-level weight -- confirm against the data dictionary
persons['perwgt'].describe()
Out[48]:
In [ ]:
In [4]:
# Load the place table straight out of the zip archive.
# pandas does not close a file object it was handed, so the archive member is
# opened in a `with` block to release it as soon as parsing is finished.
with z.open('caltrans_full_survey/survey_place.csv') as places_csv:
    places = pd.read_csv(places_csv, low_memory=False)
len(places)
Out[4]:
In [9]:
# Keep the places visited by members of Bay Area households, matched on the
# household id `sampno` (a different filter may be preferable depending on the
# application)
visited_by_ba_household = places.sampno.isin(households_ba.sampno)
places_ba = places[visited_by_ba_household]
len(places_ba)
Out[9]:
In [93]:
# Is there a unique identifier?
# A combination may be needed: `sampno` (household), `perno` (person within the
# household), `plano` (place within the person's travel diary).
# What's `tripno`? ("unlinked trip ID" - maybe representing transfer between modes)
place_id_cols = ['sampno', 'perno', 'plano', 'tripno']
print(places_ba[place_id_cols].head(10))
In [10]:
# Is every combination of `sampno`, `perno`, `plano` unique? -- Yes
# (the number of groups is compared by eye with the row count of `places_ba`)
place_key = ['sampno', 'perno', 'plano']
places_by_key = places_ba.groupby(place_key)
len(places_by_key)
Out[10]:
In [11]:
# How many places have a `tripno`? -- about 80%
# `count()` only counts non-missing values.
places_ba['tripno'].count()
Out[11]:
In [12]:
# Is the `tripno` ever repeated? -- No
place_trip_key = ['sampno', 'perno', 'plano', 'tripno']
places_by_trip_key = places_ba.groupby(place_trip_key)
len(places_by_trip_key)
Out[12]:
In [13]:
# Summary statistics for `tripno` among Bay Area places
places_ba['tripno'].describe()
Out[13]:
In [14]:
# Can we see the place names? -- No
places_ba['place_name'].head()
Out[14]:
In [22]:
# Fifteen most frequently visited cities among Bay Area places
place_city_counts = places_ba['city'].value_counts()
place_city_counts.iloc[:15]
Out[22]:
In [56]:
# Histogram of trip distance for Bay Area place records. Only the 0-15 mile range
# is binned, so longer trips are left out of the figure.
ax = places_ba.trip_distance_miles.plot.hist(bins=20, range=(0, 15))
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='Trip distance, places visited by Bay Area households',
       xlabel='Trip distance (miles)',
       ylabel='Number of place records');
In [16]:
# `tract_id` is the most detailed spatial identifier in the public data.
# How many different tracts are visited?
visited_tract_ids = places_ba.tract_id.unique()
len(visited_tract_ids)
Out[16]:
In [17]:
# How many different households appear in the Bay Area places?
household_ids = places_ba.sampno.unique()
len(household_ids)
Out[17]:
In [18]:
# How many different people? A person is identified by household (`sampno`)
# plus person number within the household (`perno`).
places_by_person = places_ba.groupby(['sampno', 'perno'])
len(places_by_person)
Out[18]:
In [ ]:
In [52]:
# Show floats as whole numbers instead of scientific notation.
# This is a global display option, so it affects every output rendered after this cell.
pd.set_option('display.float_format', '{:.0f}'.format)
In [52]:
# Is the mapping between census tracts and city names consistent? -- No
# If it were, the number of distinct tracts would equal the number of distinct
# (tract, city) pairs.
n_tract_ids = len(places.tract_id.drop_duplicates())
n_tract_city_pairs = len(places[['tract_id', 'city']].drop_duplicates())
print(n_tract_ids)
print(n_tract_city_pairs)
In [53]:
# Summary statistics for `tract_id` across all places
places['tract_id'].describe()
Out[53]:
In [54]:
# Summary statistics for `county_id` across all places
places['county_id'].describe()
Out[54]:
In [58]:
# Summary statistics for `state_id` across all places
places['state_id'].describe()
Out[58]:
In [57]:
# Five most frequent `state_id` values across all places
state_id_counts = places['state_id'].value_counts()
state_id_counts.iloc[:5]
Out[57]:
In [77]:
# How to deal with this? I think `tract_id` is an integer representation
# of the 4-digit tract ID within the county plus the 2-digit suffix.
# So the full unique identifier is `state_id` + `county_id` (3 digits) + `tract_id` (6 digits).
# Multiplying by float constants makes the column float64, which can hold NaN below.
places['_full_tract_id'] = places.state_id * 1e9 + places.county_id * 1e6 + places.tract_id

# Presumably the all-9 entries reflect missing data, but documentation doesn't specify
missing_geo = (
    (places.tract_id == 999999)
    | (places.county_id == 999)
    | (places.state_id == 99)
)
# `.loc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0.
places.loc[missing_geo, '_full_tract_id'] = np.nan
In [78]:
# Repeat the consistency check with the full tract id: distinct tracts versus
# distinct (tract, city) pairs.
n_full_tract_ids = len(places._full_tract_id.drop_duplicates())
n_full_tract_city_pairs = len(places[['_full_tract_id', 'city']].drop_duplicates())
print(n_full_tract_ids)
print(n_full_tract_city_pairs)
In [73]:
# Which tracts are paired with the largest number of distinct city names?
tract_city_pairs = places[['_full_tract_id', 'city']].drop_duplicates()
cities_per_tract = tract_city_pairs._full_tract_id.value_counts()
cities_per_tract.head(5)
Out[73]:
In [72]:
# List the distinct city names recorded for one example tract
tract_city_pairs = places[['_full_tract_id', 'city']].drop_duplicates()
is_example_tract = tract_city_pairs._full_tract_id == 6115041100
print(tract_city_pairs.loc[is_example_tract])
So, there are still many census tracts that correspond to more than one city. I think we probably just want to use the census tracts as our unit of analysis.
For descriptive purposes we can map each census tract to its most common corresponding city.
In [75]:
# Map each tract to its most common corresponding city
def _most_common_city(cities):
    """Return the most frequent non-missing value in `cities`, or NaN if there is none.

    `value_counts()` skips missing values, so a tract whose city is always missing
    yields an empty result; indexing it with `[0]` would raise IndexError.
    """
    counts = cities.value_counts()
    return counts.index[0] if len(counts) else np.nan

tracts = (
    places[['_full_tract_id', 'city']]
    .groupby('_full_tract_id')
    .agg(_most_common_city)
)
print(tracts.head())
In [76]:
# Compare the number of distinct full tract ids with the number of rows in `tracts`.
# The first count keeps a missing (NaN) id as one value if any are present, while
# groupby leaves NaN keys out by default.
n_full_tract_ids = len(places._full_tract_id.drop_duplicates())
n_tract_rows = len(tracts)
print(n_full_tract_ids)
print(n_tract_rows)
In [ ]:
In [ ]:
In [84]:
# Load the activity table straight out of the zip archive.
# pandas does not close a file object it was handed, so the archive member is
# opened in a `with` block to release it as soon as parsing is finished.
with z.open('caltrans_full_survey/survey_activity.csv') as activities_csv:
    activities = pd.read_csv(activities_csv, low_memory=False)
len(activities)
Out[84]:
In [85]:
# Keep the activities reported by members of Bay Area households, matched on the
# household id `sampno`. This is the same rule used to build `places_ba`, so the
# two tables cover the same respondents and can be compared record for record.
# (Previously this filtered on the activity's own `county_id`, which selects
# activities located in the Bay Area rather than activities of Bay Area residents.)
activities_ba = activities[activities.sampno.isin(households_ba.sampno)]
len(activities_ba)
Out[85]:
In [95]:
# What do the identifiers look like? Show them for the first ten activity records.
activity_id_cols = ['sampno', 'perno', 'plano', 'actno', 'tripno']
print(activities_ba[activity_id_cols].head(10))
In [72]:
# Each place occurs in the activities table at least once.
# Three counts are printed for comparison by eye.
n_first_activities = (activities_ba.actno == 1).sum()  # number of activities with id 1
n_places_referenced = len(activities_ba.groupby(['sampno', 'perno', 'plano']))  # unique places referenced
n_place_records = len(places_ba)  # records in places table
print(n_first_activities)
print(n_places_referenced)
print(n_place_records)
In [87]:
# Summary statistics for the household id `sampno` in the activity table
activities['sampno'].describe()
Out[87]:
In [88]:
# Summary statistics for the person number `perno` in the activity table
activities['perno'].describe()
Out[88]:
In [89]:
# Summary statistics for the place number `plano` in the activity table
activities['plano'].describe()
Out[89]:
In [ ]:
What's the correct way to aggregate places into trips?
It seems like each person recorded their travel for a single day as a sequence of places visited, without explicit classification into trips or tours. So that's up to us to do by applying whatever rules seem appropriate.
Probably it's not even possible to identify tours with certainty from the anonymized data, because the place names and precise locations are redacted.
In [21]:
# Dig into `tripno` some more: start with the highest `plano` recorded per person
max_plano_per_person = places_ba.groupby(['sampno', 'perno'])['plano'].max()
max_plano_per_person.head(10)
Out[21]:
In [23]:
# Do any respondents have multiple trip sequences? -- No!
# Compare each person's highest `plano` with their highest `tripno`.
places_by_person = places_ba.groupby(['sampno', 'perno'])
plano_counts = places_by_person['plano'].max()
tripno_counts = places_by_person['tripno'].max()
plano_minus_tripno = plano_counts - tripno_counts
plano_minus_tripno.describe()
Out[23]:
In [25]:
# Row-level difference between `plano` and `tripno` for each place record
plano_tripno_offset = places_ba['plano'] - places_ba['tripno']
plano_tripno_offset.describe()
Out[25]:
In [100]:
# What does a sequence of places look like? Show one person's travel diary.
varlist = [
    'travel_date', 'arr_time', 'dep_time', 'tract_id', 'city', 'mode',
    'trip_distance_miles', 'prev_trip_duration_min', 'act_dur',
]
is_example_person = (places_ba.sampno == 1035274) & (places_ba.perno == 1)
places_ba.loc[is_example_person, varlist]
Out[100]:
So, it looks like the key to identifying trip/tour semantics involves looking at the trip purposes in the activities table. Transfers are noted as a particular purpose, and those trip legs need to be aggregated together.
The first and last activities of the day probably take place at home, but we can't verify using the public data.
It looks like the arrival and departure times, and trip durations, are approximate based on people's recollections, but distances are precise because they come from the Google Maps interface.
In [ ]:
In [35]:
# What are the travel modes? Show the ten most frequent codes.
# (`mode` is accessed with brackets because DataFrame already has a `mode` method.)
mode_counts = places_ba['mode'].value_counts()
mode_counts.iloc[:10]
Out[35]:
FROM DATA DICTIONARY
Travel mode:
In [ ]:
In [39]:
# What are the trip purposes? Show the fifteen most frequent codes.
purpose_counts = activities_ba['purpose'].value_counts()
purpose_counts.iloc[:15]
Out[39]:
FROM DATA DICTIONARY
[Somewhere there's a `ptype` key indicating categories of purposes, probably based on the home/work/school locations, but I can't find it in these data tables.]
Activity purpose:
[These look like activities at home]
[These look like activities at work]
[These look like activities at school]
[These look like transport-related]
[These look like activities at non-home, non-work, non-school locations]
[Misc]
In [ ]:
In [ ]:
In [ ]:
# TO DO
# - set up destination choice model
# - make two tables: (1) trips, (2) destinations
# - write a function to generate choice set
# - for covariates, calculate home/work/etc density endogenously
# - can probably generate average travel time between tracts, by mode
# - then can use that to build a mode choice model
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: