In [ ]:

    
import json
import requests
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm
from math import radians, cos, sin, sqrt, atan2
from IPython.display import display, HTML



In [ ]:

    
columns = [
    'agency_id', 
    'service_date_id', 'service_date_date',
    'route_id', 'route_short_name', 'route_long_name',
    'trip_id', 'trip_headsign', 'trip_short_name',
    'stop_time_id', 'stop_time_arrival_time', 'stop_time_departure_time', 'stop_time_stop_sequence', 
    'stop_id', 'stop_stop_id', 'stop_name', 
    'capacity_path_id', 'capacity_path_path', 
    'capacity_capacity_id', 'capacity_capacity_capacity1st', 'capacity_capacity_capacity2nd'
]



In [ ]:

    
in_dir = "in_data/"
out_dir = "out_data/"

Helper



In [ ]:

    
R = 6373.0
# Compute the distances between two (lat,lng)
def compute_distance(lat1, lon1, lat2, lon2):
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return (R * c)*1000



In [ ]:

    
max_depth = 20
max_queue = 100000
def bfs(graph, start, end, max_depth=max_depth):
    queue = []
    queue.append([start])
    while queue:
        path = queue.pop(0)
        if len(path) > max_depth or len(queue) > max_queue:
            return []
        
        node = path[-1]
        
        if node == end:
            return path

        for adjacent in graph.get(node, []):
            new_path = list(path)
            new_path.append(adjacent)
            queue.append(new_path)
            
    return [-1]

Processing

Preparation

First, we create some dictionnaries. We begin by creating the relation to go from edge_id to the feature of the edge (meaning the path in geojson format).



In [ ]:

    
with open(in_dir + 'edges.geojson') as file:
    edgeid2feature = {}
    data1 = json.load(file)
    for feature in data1['features']:
        edgeid2feature[feature['properties']['edge_id']] = feature

Then, we do the same to map the station id to the features of station. We are mainly interested the the coordinate of the station but we keep all of them.



In [ ]:

    
with open(in_dir + 'stations.geojson') as file:
    stopid2coord = {}
    data2 = json.load(file)
    for feature in data2['features']:
        stopid2coord[feature['properties']['station_id']] = feature

We need to create a map of edge id to list of edge ids. Indeed, to find the path we first need to know which edge are adjacent to each other. To do this, we look independently at the start and end of each segment and look for path with end of start geographic position close the one we are interested in.



In [ ]:

    
max_d = 5
edgeid2edgeid = {}

for edge_id1, feature1 in tqdm(edgeid2feature.items()):
    edge_start_lat1 = feature1['geometry']['coordinates'][0][1]
    edge_start_lng1 = feature1['geometry']['coordinates'][0][0]
    edge_end_lat1 = feature1['geometry']['coordinates'][-1][1]
    edge_end_lng1 = feature1['geometry']['coordinates'][-1][0]
    
    edges = []
    
    for edge_id2, feature2 in edgeid2feature.items():
        if edge_id2 != edge_id1:
            edge_start_lat2 = feature2['geometry']['coordinates'][0][1]
            edge_start_lng2 = feature2['geometry']['coordinates'][0][0]
            edge_end_lat2 = feature2['geometry']['coordinates'][-1][1]
            edge_end_lng2 = feature2['geometry']['coordinates'][-1][0]

            d1 = compute_distance(edge_start_lat1, edge_start_lng1, edge_start_lat2, edge_start_lng2)
            d2 = compute_distance(edge_start_lat1, edge_start_lng1, edge_end_lat2, edge_end_lng2) 
            d3 = compute_distance(edge_end_lat1, edge_end_lng1, edge_start_lat2, edge_start_lng2)
            d4 = compute_distance(edge_end_lat1, edge_end_lng1, edge_end_lat2, edge_end_lng2) 

            if d1 < max_d or d2 < max_d or d3 < max_d or d4 < max_d: 
                if (min(d1,d2,d3,d4) > 0.5):
                    print(edge_id1, min(d1,d2,d3,d4))
                edges.append(edge_id2)
                
    if not edges:
         print(edge_id1)
    
    edgeid2edgeid[edge_id1] = edges

Finally, we find the edges going "out" of each station, also by looking at the latitude and longitude of the station and of the edges.



In [ ]:

    
stopid2edgeid = {}
for stop_id, feature in tqdm(stopid2coord.items()):
    stop_lat = feature['geometry']['coordinates'][1]
    stop_lng = feature['geometry']['coordinates'][0]
    
    edges = []
    
    for edge_id, feature in edgeid2feature.items():
        edge_start_lat = feature['geometry']['coordinates'][0][1]
        edge_start_lng = feature['geometry']['coordinates'][0][0]
        edge_end_lat = feature['geometry']['coordinates'][-1][1]
        edge_end_lng = feature['geometry']['coordinates'][-1][0]
        
        d_start = compute_distance(stop_lat, stop_lng, edge_start_lat, edge_start_lng)
        d_end = compute_distance(stop_lat, stop_lng, edge_end_lat, edge_end_lng)
        if d_start < max_d or d_end < max_d: 
            if (min(d_start,d_end) > 0.5):
                    print(edge_id1, min(d_start,d_end))
            edges.append(edge_id)
            
    if not edges:
        print("Error", stop_id)
    
    stopid2edgeid[stop_id] =  edges

edgeid2stopid = {}
for stopid, edgeids in tqdm(stopid2edgeid.items()):
    for edgeid in edgeids:
        if edgeid in edgeid2stopid:
            edgeid2stopid[edgeid].append(stopid)
        else:
            edgeid2stopid[edgeid] = [stopid]

Processing

We import the data from the csv and do a groupby based on the trip id and service date id. We have to group by service date as we are processing all the data in one go.



In [ ]:

    
dates = ['2017-01-30','2017-01-31','2017-02-01','2017-02-02','2017-02-03','2017-02-04','2017-02-05']



In [ ]:

    
df = pd.concat([pd.read_csv(out_dir + date + '_processed.csv', index_col=0)  for date in dates])
df.columns = columns
grouped = df.groupby(['trip_id', 'service_date_id'])



In [ ]:

    
len(df)

Prepare keys

We create a set a pairs composed of stations where each pair represent a part of a trip. We use a set to avoid having duplicates.



In [ ]:

    
keys = set()

for name, group in tqdm(grouped, desc="Trips"):
    trip = group.sort_values(['stop_time_stop_sequence'])
    
    rows = trip.iterrows()
    last_index, last_stop = next(rows)
    
    for next_index, next_stop in rows:
        stop_1 = str(last_stop.stop_stop_id)
        stop_2 = str(next_stop.stop_stop_id)
        
        if (stop_1, stop_2) not in keys and (stop_2, stop_1) not in keys:
            keys.add((stop_1, stop_2))
    
        last_index, last_stop = (next_index, next_stop)
    
print(len(keys))

Find all paires of stations and their path

Now, the real work begin. We use a breadth first search algorithm to find the path between each station. The full algo goes like this:

Find the two station id (stop_id).
Verify that the path for the pair has not been already calculated
Find every path going out from each station using the previously calculated map
For each pair of edges, run the BFS and insert if found a path
If find a path shorter than previously found, update

It takes some time, so be patient.



In [ ]:

    
trips_by_station_id = {}

for key in tqdm(keys):
    stop_1 = key[0] 
    stop_2 = key[1]
    
    if stop_1 != stop_2:
        if key not in trips_by_station_id and stop_1 in stopid2edgeid and stop_2 in stopid2edgeid:
            start = sorted(stopid2edgeid[stop_1])
            end = sorted(stopid2edgeid[stop_2])

            for s in start:
                for e in end:
                    r = bfs(edgeid2edgeid, s, e)
                    if key not in trips_by_station_id or (r and len(trips_by_station_id[key]) > len(r)):
                        trips_by_station_id[key] = r
                        print(key, r)
        else:
            print(key, "Error",)

pickle.dump(trips_by_station_id, file=open(out_dir + "path_trips_by_station_id.dump", 'wb'), protocol=2)

Unfortunately, some path are way too long to be found using a algo in a decent amount of time.



In [ ]:

    
sum([1 for k, v in trips_by_station_id.items() if not v])

So we rely on manual inspection and complete by end the set.



In [ ]:

    
new = {
('8500010', '8503000'): [584, 404, 403, 346, 369, 445, 1921, 446, 1973, 447, 448, 449, 450, 451, 452, 1945, 371, 591, 592, 455, 1960, 456, 2002,  2407, 2373, 2374, 2375, 2377, 2376, 2414, 2423,  2415, 2396, 2395, 2379, 2389, 2400, 2401, 2372, 2340, 2339, 2338, 2337, 2224,  2225, 2226, 2227, 2228,2303, 2243, 2250, 2287, 2246, 2245],
('8500218', '8503000'): [350,358,357,360,359,1984,376,424,423,426,425,210,106,1959,2406,2405,2412,2378,2389,2400,2401,2372,2340,2260,2368,2259,2249,2308,2309,2252,2299],
('8501506', '8501300'): [234,236,233,232,237,228,412,411,1944,220,222,409,408,1360,273,221,223,226,224,225],
('8502113', '8503000'): [1984, 376, 424, 423, 426, 425, 210, 106, 1959, 105, 2406, 2405, 2412,2378,2389,2400,2401,2372,2340,2260,2368,2259,2249,2308,2309,2252,2299],
('8502119', '8503001'): [424, 423, 426, 425, 210, 106, 1959, 105, 2406, 2405, 2412,2378,2389,2400,2401,2372,2340,2260,2368,2259,2249,2308,2309,2252,2299],
('8502202', '8503000'): [428, 373, 504, 505, 157, 156, 155, 154, 366, 367, 161, 502, 503, 165, 2001, 2000, 1999, 162, 1998, 163, 164, 142, 1991, 2099, 2356, 2229],
('8502204', '8503000'): [161, 502, 503, 165, 2001, 2000, 1999, 162, 1998, 163, 164, 142, 1991,  2099, 2356, 2229],
('8503000', '8503003'): [2318,2321,1992,384,1993],
('8503000', '8503020'): [2218,2317,2320,2241,2315,2313,2234],
('8503000', '8503424'): [2310,2301,2248,2303,2267,2220,2175,2137,2352,2149,2136,2146,2155,2123,2112,2163,2111,458,457,595,855,459,460,461,594,593,463,462,386,1936,483,1334,1333],
('8503000', '8509002'): [2324, 2325, 2236, 2237, 2359, 2238, 2239, 2240, 144, 145, 1995, 146, 143, 147, 142, 140, 139, 138, 141, 611, 612, 124, 123, 613, 614, 604, 605,  125, 122, 113, 129, 128, 617, 618, 632, 633, 127, 126, 136, 137, 130, 133, 134, 132, 629, 630, 135, 131],
('8503001', '8502105'): [2419, 2423, 2414, 2376, 2369, 1997, 2406, 105, 1959, 106, 210],
('8503001', '8502220'): [2362, 2331, 2351],
('8503001', '8503000'): [2332, 2349, 2367, 2366, 2347, 2242, 2314, 2313, 2334, 2313, 2315, 2316, 2317, 2261, 2249,2308, 2311, 2290, 2251, 2285],
('8503016', '8503000'): [212, 2133, 2145, 2174, 2172, 2135, 2138, 2146, 2354, 2143, 2137, 2175, 2220,2267, 2303, 2243, 2251, 2291, 2293],
('8503202', '8503000'): [142, 1991, 2099, 2356, 2229],
('8503206', '8503000'): [141, 138, 139, 140, 142, 1991, 2099, 2356, 2229],
('8503504', '8503000'): [1960, 456, 2002, 2002, 2407, 2373, 2374, 2375, 2377, 2376, 2414, 2423,  2415, 2396, 2395, 2379, 2389, 2400, 2401, 2372, 2340, 2339, 2338, 2337, 2224,  2225, 2226, 2227, 2228,2303, 2243, 2250, 2287, 2246, 2245],
('8503508', '8503001'): [2387, 2388, 2397, 2403, 2425, 2426],
('8503509', '8503001'): [2409],
('8509411', '8503000'): [132, 134, 133, 130, 137, 136, 126, 127, 633, 632, 618, 617, 128, 129, 113, 122,125, 605, 604, 614, 613, 123, 124, 612, 611, 141, 138, 139, 140, 142,142, 1991, 2099, 2356, 2229],
#('8503506', '8516219'): [2405, 2406, 105], 
('8507000', '8503000'): [1300, 1303, 1302, 356, 396, 107, 1952, 1954, 1956, 1957, 203, 59, 363, 362, 350, 358, 357,360, 359, 1984, 376, 424, 423, 426, 425, 210, 106, 1959, 105, 2406, 2406, 2405, 2412,2378,2389,2400,2401,2372,2340,2260,2368,2259,2249,2308,2309,2252,2299],
('8500010', '8500309'): [584,404,403,346,368,347,345,348,344,566,565,349,568,567,358,357,360,359,1984,376,394,546,453,454],
('8500113', '8500010'): [545,176,179,177,178,856,272,1324,584],
('8500207', '8500218'): [33,34,35,1377,415,36,2200,37,38,441,442,443,444,362],
('8500207', '8504300'): [430,429,561,2216,560,31,30,29,353,27,26,2214,25,24,16],
('8500218', '8500010'): [567,568,349,565,566,344,348,345,347,368,369,346,403,404,584],
('8500218', '8500023'): [350,567,568,349,565,566,344,348,345],
('8500218', '8505000'): [362, 363, 196, 375, 422, 421, 487, 488, 489,  490, 492, 491, 495, 494, 493, 496, 497, 498, 499 ,2004, 501, 500, 419, 535, 534, 148],
('8501008', '8501030'): [175,174,173,166,172,167,171,168,170,169,95],
('8501037', '8501008'): [397,101,102,100,103,99,104,95,169,170,168,171,167,172,166,173,174,175],
('8501120', '8501008'): [2087,116,1362,361,97,98,96,398,397,101,102,100,103,99,104,95,169,170,168,171,167,172,166,173,174,175],
('8501120', '8501103'): [2087,116,1362, 389,390,391,587,267,392,549,393,1,572,573],
('8501120', '8504200'): [2087,116,1362, 389,390,391,587,115,697,698,696,695,548],
('8501200', '8501120'): [581,580,64,65,66,60,61,62,63,94],
('8501300', '8501120'): [110,112,111,109,581,580,64,65,66,60,61,62,63,94],
('8501400', '8501120'): [221,223,226,224,225,110,112,111,109,581,580,64,65,66,60,61,62,63,94],
('8501400', '8501500'): [273,1360,408,409,222,220,1944,411],
('8501605', '8507483'): [1906,2081,2082,1907,275,218,219,1942,254],
('8501609', '8501506'): [227,1906,1905,240,239,238,230,229,231,235],
('8502009', '8500218'): [496,493,494,495,491,492,490,489,488,487,421,422,196,363,362],
('8502119', '8503000'): [426, 425, 210, 106, 1959, 105, 2406, 2405, 2412,2378,2389,2400,2401,2372,2340,2260,2368,2259,2249,2308,2309,2252,2299],
('8502204', '8503202'): [161,502,503,165,2001,2000,1999,162,1998,163,164,142],
('8502205', '8502204'): [2020,1919,509,508,388,1918,2018,387,367],
('8502206', '8503010'): [165,2001,2000,1999,162,1998,163,164,142,147,143,146,1995,145],
('8503003', '8503104'): [385, 1994, 1647, 1669, 1670, 1671, 1672, 1673, 1674],
('8503006', '8503007'): [2155],
('8503006', '8503016'): [2155, 2138, 2135, 2172, 2174, 2145, 2133, 212],
##('8503006', '8503310')
('8503006', '8503340'): [2113, 2140, 2142, 2141, 2139, 2135, 2172, 2174],
('8503006', '8503526'): [2113, 2107, 2106, 2110, 2177, 2104, 2105, 850, 1609],
('8503008', '8503006'): [850, 2105, 2103, 2178, 2110, 2106, 2154],
('8503020', '8503006'): [2366, 2347, 2346, 2342, 2365, 2333, 93, 2108, 2102, 2154, 2106],
('8503209', '8509411'): [605, 125, 122, 113, 129, 128, 617, 618, 632, 633, 127, 126, 136, 137, 130, 133, 134, 132,],
('8503400', '8503006'): [459,855,457,458,2162,2115,2140],
('8503424', '8503400'): [1332,1333,1334,483,1936,386,462,463,593,594,461,460],
('8503504', '8503508'): [1960,456,2002,2407,2380,2381,2382,2379,2389],
('8503505', '8503508'): [456,2002,2407,2380,2381,2382,2379,2389],
('8503506', '8503508'): [2417,2418,2390],
('8503508', '8502119'): [2389,2378,2412,2405,2406,1959,106,210,425,426],
('8503508', '8503512'): [2391,2389,2378,2412,2405,2406,1959,106,210,425,426],
('8504014', '8504100'): [569,570,89,88,1985,2096,80,80,74,79,75,78,76,77],
('8504100', '8501120'): [77,76,78,75,79,74,81,80,2096,1985,88,89,570,569,86,87,579,578,85,84,82,83,94],
('8504200', '8501037'): [573,572,1,393,549,392,267,587,391,390,389,1362, 361,97,98,96,],
('8504200', '8501118'): [573,572,1,393,549,392,267,587,391,390,389,1362],
('8504221', '8504200'): [554,553,8,552,551,7,6,5,4,3,2,644,645],
('8504300', '8504221'): [563,562,15,14,13,12,11,10,9,577,576],
('8504414', '8507000'): [20,21,22,23,364,274,1903,1902,643,1297,1298,1299,1300],
('8505000', '8502007'): [534,535,500,501,2004,499,498,497,496,493,494],
('8505000', '8502009'): [534,535,500,501,2004,499,498,497],
('8505000', '8502202'): [534,535,395,158,160,159,427],
('8505000', '8502204'): [534,535,395,158,160,159,427,428,373,504,505,157,156,155,154,366,367],
('8505000', '8505004'): [534,1330,1329,1851,1850,1849,1848,440,506],
('8505004', '8502204'): [2022,507,2020,1919,509,508,388,2018,387,367],
('8505004', '8505112'): [510,511,512,513,2023,2029,2027,2034,2032,514,2026,2025],
('8505007', '8505112'): [2025,2026,514,2032,2034,2027,2029,2023,513],
('8505112', '8505213'): [515],
('8505114', '8505119'): [2037,2038,2039,2040,2036,2035,2048,2041,2042,2043,2044,2045,517],
('8505213', '8505004'): [510,511,512,513,2023,2029,2027,2034,2032,514,2026,2025,515,516,2037,2038,2039,2040,2036,2035,2048,2041,2042,2043,2044,2045,517,277,2046,2047,518,1917,2049,2050,2051,2052,2054,2053,1965,1966,2055,2058,2059,2056,1962,2057,519,520,2061,521],
('8506000', '8503003'): [600, 603, 602, 330, 329, 92, 377, 1311, 1310, 383, 1993],
('8506000', '8506302'): [620,619,343,340,624,623,339,338,341,622,621,336,337,342,335,331,333,334,332,379,378,372,601],
('8506105', '8014586'): [473,479,480,481,482,484,485,486,596,597],
('8506206', '8506000'): [336,337,342,335,331,333,334,332,379,378,372,601,],
('8507000', '8500010'): [1300,1299,1298,1297,643,1902,1903,274,364,23,22,21,20,19,413,18,17,24,25,2214,26,27,28,558,559,32,264,545,176,179,177,178,856,272,1324,584,],
('8507000', '8500218'): [1300,1303,1302,356,396,107,1952,1954,1956,1957,203,59,363,362,],
('8507000', '8502001'): [1300,1303,1302,356,396,107,1952,1954,1956,1957,203,374,422,421],
('8507000', '8504100'): [1301,324,1363,72,73,71,68,69,70,67,1327,575,574],
('8507000', '8507493'): [1300,1302,356,352,323,322,317,318,319,320,582,434,433,321,255,254,243,245,244,246],
('8507100', '8507000'): [433,434,582,320,319,318,317,322,323,352,356,1302,1303,1300],
('8507478', '8507475'): [1950,2072,2076,2075,2073,1949,2074,2078,2077,1596,2079],
('8507483', '8501609'): [254,1942,219,218,275,2082,1906,227],
('8508005', '8507000'): [786,432,207,199,351,365,364,274,1903,1902,643,1297,1298,1299,1300],
('8516219', '8503508'): [2406,2405,2412,2378,2389]
}



In [ ]:

    
trips_by_station_id.update(new)



In [ ]:

    
sum([1 for k, v in trips_by_station_id.items() if not v])

Finally, we go through each stop sequence for every trip. We can now determine the path each of the element and create a dictionary that can be imported in the database.



In [ ]:

    
paths = {}
for name, group in tqdm(grouped, desc="Trips"):
    trip = group.sort_values(['stop_time_stop_sequence'])
    
    rows = list(trip.iterrows())
    last_index, last_stop = rows[0]
        
    for next_index, next_stop in rows[1:]:
        stop_1 = str(last_stop.stop_stop_id)
        stop_2 = str(next_stop.stop_stop_id)
        
        key1 = (stop_1, stop_2)
        key2 = (stop_2, stop_1)
        
        key_full = (name[0], last_stop.stop_id)
        
        path = None   
        if key1 in trips_by_station_id:
            path = trips_by_station_id[key1]
        elif key2 in trips_by_station_id:
            path = trips_by_station_id[key2]
            path.reverse()
      
        if (key_full not in paths) or (path and len(paths[key_full]) > len(path)):
            paths[key_full] = path
    
        last_index, last_stop = (next_index, next_stop)
        
pickle.dump(paths, file=open(out_dir + "paths.dump", 'wb'), protocol=2)