In [ ]:

    
import json
import requests
import pickle
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from math import radians, cos, sin, sqrt, atan2

Unfortunately, making the queries directly in django is complicated and it takes a long time to run. Instead, we query directly the database using postico and export the results to csv for further processing.

PostgreSQL command

You can use the following sql command to retrieve the data corresponding to one day.

SELECT
agency.id,
service_date.id, service_date.date, 
route.id, route.short_name, route.long_name,
trip.id, trip.headsign, trip.short_name,
stop_time.id, stop_time.arrival_time, stop_time.departure_time, stop_time.stop_sequence, 
stop.id, stop.stop_id, stop.name, 
capacity_path.id, capacity_path.path, 
capacity_capacity.id, capacity_capacity.capacity1st, capacity_capacity.capacity2nd

FROM service_date
LEFT OUTER JOIN trip ON (service_date.service_id = trip.service_id)
LEFT OUTER JOIN route ON (route.id = trip.route_id)
LEFT OUTER JOIN agency ON (agency.id = route.agency_id)
LEFT OUTER JOIN stop_time ON (stop_time.trip_id = trip.id)  
LEFT OUTER JOIN stop ON (stop.id = stop_time.stop_id)
LEFT OUTER JOIN capacity_path ON (capacity_path.trip_id = trip.id AND capacity_path.stop_id = stop.id)
LEFT OUTER JOIN capacity_capacity ON (capacity_capacity.trip_id = trip.id AND capacity_capacity.stop_id = stop.id AND capacity_capacity.service_date_id = service_date.id)
WHERE 
(agency.id = 31) 
AND service_date.date = '2017-01-30'
AND stop.stop_id NOT IN ('132','133','134','135','136','137','138', '139', '140', '141','142','174','175', '176')
ORDER BY 
trip.id ASC,
stop_time.stop_sequence ASC

Helper



In [ ]:

    
def strip_id(s):
    try:
        index = s.index(':')
    except ValueError:
        index = len(s)
    return s[:index]



In [ ]:

    
columns = [
    'agency_id', 
    'service_date_id', 'service_date_date',
    'route_id', 'route_short_name', 'route_long_name',
    'trip_id', 'trip_headsign', 'trip_short_name',
    'stop_time_id', 'stop_time_arrival_time', 'stop_time_departure_time', 'stop_time_stop_sequence', 
    'stop_id', 'stop_stop_id', 'stop_name', 
    'capacity_path_id', 'capacity_path_path', 
    'capacity_capacity_id', 'capacity_capacity_capacity1st', 'capacity_capacity_capacity2nd'
]



In [ ]:

    
in_dir = "in_data/"
out_dir = "out_data/"

We process the CSV to stem the stop_id as they are currently not in the official form. The geops dataset add a suffix to each stop_id if they correspond to differente route.



In [ ]:

    
dates = ['2017-01-30','2017-01-31','2017-02-01','2017-02-02','2017-02-03','2017-02-04','2017-02-05']



In [ ]:

    
for date in dates:
    file = in_dir + date + '.csv'
    df = pd.read_csv(file) 
    df.columns = columns
    
    df['stop_stop_id'] = df['stop_stop_id'].apply(lambda x: strip_id(x))
    
    df.to_csv(out_dir + date + '_processed.csv')