In [ ]:
import json
import requests
import pickle
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from math import radians, cos, sin, sqrt, atan2
Unfortunately, making these queries directly in Django is complicated and slow to run. Instead, we query the database directly with Postico and export the results to CSV for further processing.
SELECT
agency.id,
service_date.id, service_date.date,
route.id, route.short_name, route.long_name,
trip.id, trip.headsign, trip.short_name,
stop_time.id, stop_time.arrival_time, stop_time.departure_time, stop_time.stop_sequence,
stop.id, stop.stop_id, stop.name,
capacity_path.id, capacity_path.path,
capacity_capacity.id, capacity_capacity.capacity1st, capacity_capacity.capacity2nd
FROM service_date
LEFT OUTER JOIN trip ON (service_date.service_id = trip.service_id)
LEFT OUTER JOIN route ON (route.id = trip.route_id)
LEFT OUTER JOIN agency ON (agency.id = route.agency_id)
LEFT OUTER JOIN stop_time ON (stop_time.trip_id = trip.id)
LEFT OUTER JOIN stop ON (stop.id = stop_time.stop_id)
LEFT OUTER JOIN capacity_path ON (capacity_path.trip_id = trip.id AND capacity_path.stop_id = stop.id)
LEFT OUTER JOIN capacity_capacity ON (capacity_capacity.trip_id = trip.id AND capacity_capacity.stop_id = stop.id AND capacity_capacity.service_date_id = service_date.id)
WHERE
(agency.id = 31)
AND service_date.date = '2017-01-30'
AND stop.stop_id NOT IN ('132','133','134','135','136','137','138', '139', '140', '141','142','174','175', '176')
ORDER BY
trip.id ASC,
stop_time.stop_sequence ASC
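The export itself was done by hand in Postico, but the same query could also be run programmatically. The sketch below is only illustrative: the connection string and the export_2017-01-30.sql file name are placeholders, not the actual setup used here.
In [ ]:
# Illustrative alternative to the manual Postico export; the connection string
# and the .sql file name are placeholders.
from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost/gtfs')  # placeholder credentials

with open('export_2017-01-30.sql') as f:  # the SELECT statement shown above, saved to a file
    query = f.read()

pd.read_sql(query, engine).to_csv('in_data/2017-01-30.csv', index=False)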
In [ ]:
def strip_id(s):
    """Return the stop_id without the ':' suffix added by the geops dataset."""
    try:
        index = s.index(':')
    except ValueError:
        index = len(s)  # no suffix present, keep the whole id
    return s[:index]
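A quick check of the helper on a suffixed and a plain identifier (the example values below are made up for illustration, not taken from the dataset):
In [ ]:
# Hypothetical example values.
print(strip_id('8507000:0:1'))  # 8507000 (suffix removed)
print(strip_id('8507000'))      # 8507000 (unchanged, no ':' present)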
In [ ]:
# Column names, in the same order as the SELECT clause of the export query.
columns = [
    'agency_id',
    'service_date_id', 'service_date_date',
    'route_id', 'route_short_name', 'route_long_name',
    'trip_id', 'trip_headsign', 'trip_short_name',
    'stop_time_id', 'stop_time_arrival_time', 'stop_time_departure_time', 'stop_time_stop_sequence',
    'stop_id', 'stop_stop_id', 'stop_name',
    'capacity_path_id', 'capacity_path_path',
    'capacity_capacity_id', 'capacity_capacity_capacity1st', 'capacity_capacity_capacity2nd'
]
In [ ]:
in_dir = "in_data/"
out_dir = "out_data/"
We process the CSVs to strip the stop_id values, which are currently not in their official form: the geops dataset appends a suffix to each stop_id when it corresponds to different routes.
In [ ]:
dates = ['2017-01-30','2017-01-31','2017-02-01','2017-02-02','2017-02-03','2017-02-04','2017-02-05']
In [ ]:
for date in dates:
    file = in_dir + date + '.csv'
    df = pd.read_csv(file)
    df.columns = columns                                      # rename to the column names defined above
    df['stop_stop_id'] = df['stop_stop_id'].apply(strip_id)   # drop the geops suffix
    df.to_csv(out_dir + date + '_processed.csv')
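As a quick sanity check (a small sketch, assuming the loop above has written the processed files), we can confirm that no stripped stop_stop_id still contains a ':' suffix:
In [ ]:
# Verify that every processed file contains only stripped stop ids.
for date in dates:
    check = pd.read_csv(out_dir + date + '_processed.csv')
    assert not check['stop_stop_id'].astype(str).str.contains(':').any(), date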