In [1]:
from pathlib import Path
import json
from functools import reduce
import math
import datetime as dt
import pytz 
from itertools import product
from collections import OrderedDict
import time

import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.ops as so

ROOT = Path('../')
DATA_DIR = ROOT/'data'

CRS_NZGD49 = {'init': 'epsg:27200', 'no_defs': True}
CRS_NZTM = {'init': 'epsg:2193', 'no_defs': True}
CRS_WGS84 = {'init': 'epsg:4326'}

%matplotlib inline

Use Google Maps to compute commute matrices


In [9]:
def get_secret(key, secrets_path=ROOT/'secrets.json'):
    secrets_path = Path(secrets_path)
    with secrets_path.open() as src:
        secrets = json.load(src)
    return secrets[key]

GOOGLE_MATRIX_URL = "https://maps.googleapis.com/maps/api/distancematrix/json"
GOOGLE_KEY = get_secret('GOOGLE_API_KEY_ALEX')

def get_matrix(origins, destinations, mode, departure_time=None, url=GOOGLE_MATRIX_URL, key=GOOGLE_KEY, 
  timezone='Pacific/Auckland'):
    """
    Call Google to compute the duration-distance matrix from the list of origins to the list of destinations
    by the given mode at the given departure time.
    
    INPUT:
        origins
            List of WGS84 longitude-latitude pairs that will be round to 5 decimal places 
        destinations
            List of WGS84 longitude-latitude pairs that will be round to 5 decimal places 
        mode
            String; one of 'driving', 'walking', 'bicycling', or 'transit'
        departure_time
            Optional; ISO 8601 datetime string or 'now'; e.g. '2017-06-01T07:30:00'; can't be from the past
        timezone
            String; timezone for query, e.g. 'Pacific/Auckland'; see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 
            
    OUTPUT:
        A decoded JSON string described at https://developers.google.com/maps/documentation/distance-matrix/intro#DirectionsResponseElements .
    NOTES:
        - If ``departure_time`` is not null and ``mode='driving'``, then each request must contain at most 100 elements, 
          where the number of elements equals the product of the number of origins and number of destinations.
    """
    valid_modes = ['driving', 'walking', 'bicycling', 'transit']
    if mode not in valid_modes:
        raise ValueError('mode must be one of {!s}'.format(valid_modes))
    
    origs = '|'.join(["{:.05f},{:.05f}".format(lat, lon) for lon, lat in origins])
    dests = '|'.join(["{:.05f},{:.05f}".format(lat, lon) for lon, lat in destinations])
    if departure_time not in [None, 'now']:
        tz = pytz.timezone(timezone)
        departure_time = dt.datetime.strptime(departure_time, '%Y-%m-%dT%H:%M:%S')
        departure_time = int(tz.localize(departure_time).timestamp())
        
    params = {
        'origins': origs,
        'destinations': dests,
        'key': key,
        'mode': mode,
        'departure_time': departure_time,
    }
    r = requests.get(url, params=params)

    # Raise an error if bad request
    r.raise_for_status()

    return r.json()         

def matrix_to_df(matrix, orig_names=None, dest_names=None):
    """
    Given a (decoded) JSON time-distance matrix of the form output by :func:``get_matrix``, 
    a list of origin names (defaults to [0, 1, 2, etc.]), 
    and a list of destination names (defaults to [0, 1, 2, etc.]), convert the matrix to a DataFrame with
    the columns:
    
    - ``'origin'``: one of ``orig_names``
    - ``'destination'``: one of ``dest_names``
    - ``'duration'``: time from origin to destination
    - ``'distance'``: distance from origin to destination
    
    The origin and destination names should be listed in the same order as the 'sources' and 'targets' 
    attributes of ``matrix``, respectively.
    """
    # Initialize DataFrame
    columns = ['orig', 'orig_name', 'dest', 'dest_name', 'duration', 'distance']
    f = pd.DataFrame([], columns=columns)
    
    # Append origins and destinations
    origs, dests =  zip(*product(matrix['origin_addresses'], matrix['destination_addresses']))
    f['orig'] = origs
    f['dest'] = dests
    if orig_names is not None and dest_names is not None:
        orig_names, dest_names = zip(*product(orig_names, dest_names))
        f['orig_name'] = orig_names
        f['dest_name'] = dest_names
        
    # Append durations and distances
    if 'duration_in_traffic' in matrix['rows'][0]['elements'][0]:
        dur_key = 'duration_in_traffic'
    else:
        dur_key = 'duration'
    durs = []
    dists = []
    for r in matrix['rows']:
        for e in r['elements']:
            if e['status'] == 'OK':
                durs.append(e[dur_key]['value'])
                dists.append(e['distance']['value'])
            else:
                durs.append(None)
                dists.append(None)
    f['duration'] = durs
    f['distance'] = dists

    return f

def build_matrix(rental_area_points, mode, departure_time=None, chunk_size=100, 
  url=GOOGLE_MATRIX_URL, key=GOOGLE_KEY):
    """
    Compute the duration-distance matrix between all pairs of rental area points given,
    but skip the diagonal entries, that is, the ones with origin equal to destination.
    To do this, call:func:`get_matrix` repeatedly.
    Group the duration-distance calls into ``chunk_size``-to-1 chunks. 
    
    INPUT:
        rental_area_points
            GeoDataFrame
        mode
            See :func:`get_matrix`
        departure_time
            See :func:`get_matrix`
        chunk_size
            Max number of origin-destination rows per matrix query
        url
            See :func:`get_matrix`
        key
            See :func:`get_matrix`
            
    OUTPUT:
        A DataFrame of the form...
        
    NOTES:
        - Sleeps for 1 second after every call to :func:`get_matrix` to stay within API usage limits
    """
    f = rental_area_points.copy()
    frames = []
    status = 'OK'
    for __, row in f.iterrows():
        # Quit if bad status
        if status != 'OK':
            print('Quitting because of bad status:', status)
            break
            
        # Set the single destination
        dests = [row['geometry'].coords[0]]  
        ra = row['rental_area']
        dest_names = [ra]
        
        # Create origin chunks and compute matrix for each chunk to destination 
        ff = f[f['rental_area'] != ra].copy()
        num_chunks = math.ceil(ff.shape[0]/chunk_size)
        for g in np.array_split(ff, num_chunks):
            # Get origins
            origs = [geo.coords[0] for geo in g['geometry']] 
            orig_names = g['rental_area'].values 
            # Get matrix
            try:
                j = get_matrix(origs, dests, mode=mode, departure_time=departure_time, url=url, key=key)
                status = j['status']
                if status != 'OK':
                    break
                df = matrix_to_df(j, orig_names, dest_names)
            except:
                df = pd.DataFrame()
                df['orig'] = np.nan
                df['orig_name'] = orig_names
                df['dest'] = np.nan
                df['dest_name'] = ra
                df['duration'] = np.nan
                df['distance'] = np.nan
            frames.append(df)
            time.sleep(1)
            
    return pd.concat(frames).sort_values(['orig', 'dest'])

In [ ]:
# Test some
origs = [
    [174.66339111328125, -36.45000844447082], 
    [174.76158142089844, -36.86533886128865],
    [174.85633850097656, -37.20517535620264],
]
dests = origs[:2]
matrix = get_matrix(origs, dests, mode='transit', departure_time='2017-06-01T08:00:00')
matrix_to_df(matrix, ['bingo', 'bongo', 'boom'], ['bingo', 'bongo'])

In [10]:
# Estimate cost of job at 0.5/1000 USD/element beyond 2500 elements

def compute_google_cost(n, with_freebies=False):
    """
    If ``with_freebies``, then ignore the first 2500 elements, which are free.
    """
    d = OrderedDict()
    d['#rental areas'] = n
    N = 4*(n**2 - n)
    d['#elements needed for 4 modes'] = N
    d['exceeds 100000-element daily limit?'] = N > 100000 
    d['duration for job in minutes'] = (N/100)/60
    if with_freebies:
        d['cost for job in USD'] = (N - 2500)*(0.5/1000)
    else:
        d['cost for job in USD'] = N*(0.5/1000)
    
    return pd.Series(d)

compute_google_cost(f.shape[0])


Out[10]:
#rental areas                               40
#elements needed for 4 modes              6400
exceeds 100000-element daily limit?      False
duration for job in minutes            1.06667
cost for job in USD                       1.95
dtype: object

In [15]:
key = get_secret('GOOGLE_API_KEY_PHIL')
regions = ['auckland', 'wellington', 'canterbury']

for region in regions:   
    # Get poits
    path = DATA_DIR/region/'rental_area_points.geojson'
    f = gpd.read_file(str(path))
    print('* ', region)
    print(compute_google_cost(f.shape[0]))
    
    # Build matrices
    departure_time='2017-06-01T07:30:00'
    for mode in ['driving', 'walking', 'bicycling', 'transit']:
        %time m = build_matrix(f, mode=mode, departure_time=departure_time, key=key)
        n = m.shape[0]
        k = m[m['distance'].notnull()].shape[0]
        print(mode, 'hit rate', k/n)
        path = DATA_DIR/region/'commutes_{!s}.csv'.format(mode)
        m.to_csv(str(path), index=False)


*  wellington
#rental areas                               43
#elements needed for 4 modes              7396
exceeds 100000-element daily limit?      False
duration for job in minutes            1.23267
cost for job in USD                      2.448
dtype: object
CPU times: user 844 ms, sys: 16 ms, total: 860 ms
Wall time: 1min 4s
driving hit rate 1.0
CPU times: user 936 ms, sys: 24 ms, total: 960 ms
Wall time: 1min 4s
walking hit rate 1.0
CPU times: user 1.08 s, sys: 8 ms, total: 1.09 s
Wall time: 1min 7s
bicycling hit rate 1.0
CPU times: user 836 ms, sys: 12 ms, total: 848 ms
Wall time: 1min 5s
transit hit rate 0.8593576965669989
*  canterbury
#rental areas                               40
#elements needed for 4 modes              6400
exceeds 100000-element daily limit?      False
duration for job in minutes            1.06667
cost for job in USD                       1.95
dtype: object
CPU times: user 632 ms, sys: 8 ms, total: 640 ms
Wall time: 1min 2s
driving hit rate 1.0
CPU times: user 892 ms, sys: 28 ms, total: 920 ms
Wall time: 1min
walking hit rate 1.0
CPU times: user 968 ms, sys: 28 ms, total: 996 ms
Wall time: 1min
bicycling hit rate 1.0
CPU times: user 784 ms, sys: 20 ms, total: 804 ms
Wall time: 1min 1s
transit hit rate 0.6102564102564103

In [ ]: