In [ ]:
import datetime
import functools
from imposm.parser import OSMParser
from matplotlib import collections as mc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle as pkl
import pyproj
import requests
import scipy as sp
import rtree
from scipy import signal
import shapely.geometry
%pylab inline

import data_munging

In [ ]:
os.chdir(data_munging.src_dir)
metro_area = 'chicago_illinois'
data_dir = '../dat/'

We iterate through an OSM extract and collect every way segment in it so that we can match readings against them later. We should be filtering out highways, but we are not currently doing so.
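
As a sketch of the highway filtering we would like to add, the ways callback could skip ways whose highway tag marks a major road. This is only a sketch; the excluded tag values below are assumptions, not a vetted list.

In [ ]:
# Hypothetical sketch: only record segments for ways that are not major roads.
# The set of excluded highway tag values is an assumption and would need tuning.
EXCLUDED_HIGHWAY_TAGS = {'motorway', 'motorway_link', 'trunk', 'trunk_link'}

class FilteredSegmentFinder(object):
    segments = []

    def ways(self, ways):
        # callback method for ways: skip excluded road types, then record
        # each consecutive pair of node refs as a segment
        for osmid, tags, refs in ways:
            if tags.get('highway') in EXCLUDED_HIGHWAY_TAGS:
                continue
            prev_ref = None
            for ref in refs:
                if prev_ref is not None:
                    self.segments.append((prev_ref, ref))
                prev_ref = ref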


In [ ]:
# Simple callback classes that collect the parsed OSM data.
class HighwayCounter(object):
    highways = set()

    def ways(self, ways):
        # callback method for ways: record the ids of ways tagged as highways
        for osmid, tags, refs in ways:
            if 'highway' in tags:
                self.highways.add(osmid)

class SegmentFinder(object):
    segments = []

    def ways(self, ways):
        # callback method for ways: record each consecutive pair of node refs as a segment
        for osmid, tags, refs in ways:
            prev_ref = None
            for ref in refs:
                if prev_ref is not None:
                    self.segments.append((prev_ref, ref))
                prev_ref = ref

class CoordsFinder(object):
    coords_dict = dict()

    def coords(self, coords):
        # callback method for coords: map each node id to its (lon, lat)
        for osmid, lon, lat in coords:
            self.coords_dict[osmid] = (lon, lat)

class NodeCoords(object):
    node_coords = dict()

    def nodes(self, nodes):
        # callback method for nodes
        for osmid, tags, coords in nodes:
            self.node_coords[osmid] = coords

# instantiate the collectors and parsers and start parsing
all_segments = SegmentFinder()
some_coords = NodeCoords()  # note: never attached to a parser below, so node_coords stays empty
all_coords = CoordsFinder()
all_highways = HighwayCounter()
p = OSMParser(concurrency=4,
              coords_callback=all_coords.coords,
              ways_callback=all_segments.ways)
p.parse(data_dir + metro_area + '.osm.pbf')
p_highways = OSMParser(concurrency=4,
                       ways_callback=all_highways.ways)
p_highways.parse(data_dir + metro_area + '.osm.pbf')

In [ ]:
data_files = set(os.listdir('../dat'))
print data_files

In [ ]:
# rtree_idx_to_osm_data = dict()

In [ ]:
previous_index_exists = (os.path.isfile('../dat/' + metro_area + '.idx')
                         or os.path.isfile('../dat/' + metro_area + '.dat'))
#                          or os.path.isfile('../dat/' + metro_area + '_rtree_to_osm.pkl'))
print previous_index_exists

In [ ]:
segments_idx = rtree.index.Index(data_dir + metro_area)
if not previous_index_exists:
    total_failures = 0
    total_tries = 0
    for unique_index, segment in enumerate(all_segments.segments):
        total_tries += 1
        # NOTE: segment holds node refs while all_highways.highways holds way ids,
        # so this check does not actually filter out highway ways (see the note above).
        if segment[0] not in all_highways.highways and segment[1] not in all_highways.highways:
            try:
                lon0, lat0 = all_coords.coords_dict[segment[0]]
                lon1, lat1 = all_coords.coords_dict[segment[1]]
                coord0 = data_munging.NAD83(lon0, lat0)
                coord1 = data_munging.NAD83(lon1, lat1)
                segment_data = {'segment_ids': segment,
                                'shapely_line': shapely.geometry.LineString([coord0, coord1]),
                                'lonlat': ((lon0, lat0), (lon1, lat1))}
                segments_idx.insert(id=unique_index,
                                    coordinates=data_munging.coords_to_bb(coord0, coord1),
                                    obj=segment_data)
            except KeyError:
                # one of the segment's nodes is missing from the coords dict
                total_failures += 1
#     pkl.dump(rtree_idx_to_osm_data, open('../dat/' + metro_area + '_rtree_to_osm.pkl', 'wb'))
else:
    print('It appears as though this has already been indexed.')

In [ ]:
segments_idx = rtree.index.Index(data_dir + metro_area)  # reload the persisted index

In [ ]:
print (total_failures, total_tries)

In [ ]:
print 'This is a sanity check to make sure that our bounds look reasonable.'
print segments_idx.bounds

In [ ]:
rides, readings = data_munging.read_raw_data()

In [ ]:
# This is here temporarily both to create a static dataset and because
# some of the newer android readings are bogus and need to be removed.
rides = rides.loc[rides['created_at'] < '2016-03-01', :]
readings = readings.loc[readings['created_at'] < '2016-03-01', :]

In [ ]:
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)

In [ ]:
# here we are dropping all the readings with weird speeds as they probably will not provide a lot of information
readings = readings.loc[np.logical_and(readings.gps_mph < 30, readings.gps_speed > 4), :]

Here we grab all the bounding boxes so we can do statistics on them and figure out how much we need to expand them in order to do reasonable index matching.


In [ ]:
bb_areas = list()
bb_width = list()
bb_height = list()
bb_max = list()
for i, reading in readings.iterrows():
    the_bb = data_munging.reading_to_bb(reading)
    bb_area = data_munging.area_of_bb(the_bb)
    bb_areas.append(bb_area)
    bb_width.append(np.abs(reading.start_x - reading.end_x))
    bb_height.append(np.abs(reading.start_y - reading.end_y))
    bb_max.append(np.max([bb_width[-1], bb_height[-1]]))
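
One rough way to turn these bounding box statistics into a concrete expansion distance is to look at percentiles of the box sizes. This is only a sketch; the specific percentiles are arbitrary choices.

In [ ]:
# Rough summary statistics to help pick an expansion distance for index matching.
# The percentile choices here are arbitrary, not a principled threshold.
print np.percentile(bb_max, [50, 90, 99])
print np.mean(bb_areas), np.median(bb_areas)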

In [ ]:
print the_bb

In [ ]:
plt.hist(bb_areas)
plt.show()

In [ ]:
plt.scatter(bb_width, bb_height)
plt.show()

In [ ]:
plt.hist(bb_max)
plt.show()

In [ ]:
osm_seg_to_data = dict()

In [ ]:
readings['osm_node_0'] = np.NaN
readings['osm_node_1'] = np.NaN

In [ ]:
readings_to_osm_segments = dict()
for i, reading in readings.iterrows():
    the_bb = data_munging.reading_to_bb(reading)
    expanded_bb = data_munging.expand_bb(the_bb, 3.0)
    line_string = shapely.geometry.LineString([[reading.start_x, reading.start_y],
                                               [reading.end_x, reading.end_y]])
    closest_segment = None
    closest_dist = np.inf
    for segment in segments_idx.intersection(expanded_bb, objects=True):
#         segment_is_not_highway = (segment.object['segment_ids'][0] not in all_highways.highways and
#                               segment.object['segment_ids'][1] not in all_highways.highways)
        segment_dist = segment.object['shapely_line'].distance(line_string)
        if segment_dist < closest_dist:
            closest_dist = segment_dist
            closest_segment = segment.object['segment_ids']
            if closest_segment not in osm_seg_to_data:
                osm_seg_to_data[closest_segment] = segment.object
    readings_to_osm_segments[reading.id] = closest_segment
    if closest_segment is not None:
        readings.loc[i, 'osm_node_0'] = closest_segment[0]
        readings.loc[i, 'osm_node_1'] = closest_segment[1]

In [ ]:
readings_by_segment = readings.groupby(['osm_node_0', 'osm_node_1']).size().sort_values()

In [ ]:
readings_to_osm_segments[588]

In [ ]:
a_shapely = osm_seg_to_data[(2207607827L, 2207607826L)]['shapely_line']

In [ ]:
list(a_shapely.coords)

In [ ]:
for segment_data in osm_seg_to_data.values():
    coords = list(segment_data['shapely_line'].coords)
    plt.plot([coords[0][0], coords[1][0]],
             [coords[0][1], coords[1][1]])
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

In [ ]:
total_matches = 0
total_misses = 0
for key in readings_to_osm_segments:
    if readings_to_osm_segments[key] is None:
        total_misses += 1
    else:
        total_matches += 1

In [ ]:
total_misses

In [ ]:
pkl.dump(readings_to_osm_segments, open('readings_to_segments_chi.pkl', 'wb'))

In [ ]:
reading['id']

In [ ]:
# Count how many indexed segments intersect each reading's expanded bounding box.
bb_match_counts = dict()
for i, reading in readings.iterrows():
    the_bb = data_munging.reading_to_bb(reading)
    expanded_bb = data_munging.expand_bb(the_bb, 3.0)
    bb_match_counts[i] = len(list(segments_idx.intersection(expanded_bb)))

In [ ]:
print len(all_segments.segments)
print len(some_coords.node_coords)

Here we show a sample of how one might use OSRM to snap our lat-lon coordinates to streets. This example uses OSRM running on localhost at port 5000, loaded with Chicago OSM data and a bicycle profile. One can clean the data either by snapping each lat-lon to the closest street, or by putting together multiple readings from a ride and using the combined information to infer a likely sequence of streets taken. The match service also takes time information, so it may draw on the speed estimates in the Lua bicycle profile, though I am not sure. Currently, I am unsure how to mark up OSM data with bumpiness information, as our data look like this: " Here, the nodes contain the actual lat-lon information. I do not know what data structures OSRM uses internally, so I do not know how to appropriately associate our pavement data with them.
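
Below is a sketch of the second cleaning approach: sending several readings from one ride in a single match request instead of snapping each point on its own. It reuses the loc/t query-parameter style of the map_request template defined later; the helper name, host, and the choice to use only the start coordinates are assumptions.

In [ ]:
# Hypothetical sketch: build one /match request from several readings of a ride,
# using the same loc=lat,lon&t=time parameter style as map_request below.
def build_ride_match_request(ride_readings, host='http://127.0.0.1:5000'):
    parts = []
    for t, (_, reading) in enumerate(ride_readings.iterrows()):
        parts.append('loc={0},{1}&t={2}'.format(reading['start_lat'], reading['start_lon'], t))
    return host + '/match?' + '&'.join(parts)

Passing all of a ride's readings through requests.get on this URL would then give one matched street sequence for the whole ride rather than independent per-point snaps.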

In [ ]:
rides, readings = data_munging.read_raw_data()
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)

In [ ]:


In [ ]:
readings.loc[0, ['num_accel_x', 'num_accel_y', 'num_accel_z', 'num_accel_total']]

In [ ]:
print rides.shape
print readings.shape
n, p = readings.shape

In [ ]:
nearest_request = 'http://127.0.0.1:5000/nearest?loc={0},{1}'
map_request = 'http://127.0.0.1:5000/match?loc={0},{1}&t={2}&loc={3},{4}&t={5}'

In [ ]:
map_request

In [ ]:
for snapped_var in ['snapped_' + var for var in ['start_lat', 'start_lon', 'end_lat', 'end_lon']]:
    readings[snapped_var] = 0

In [ ]:
osrm_response['matchings'][0]['matched_points']

In [ ]:
total_calls = 0
for i, reading in readings.iterrows():
    data = tuple(reading[['start_lat', 'start_lon']])
    print i
    print data 
    osrm_request = nearest_request.format(*data)
    osrm_response = requests.get(osrm_request).json()
    print osrm_response
    total_calls += 1
    if total_calls > 100:
        break

In [ ]:
total_calls = 0
for i, reading in readings.iterrows():
    data = tuple(reading[['start_lat', 'start_lon']]) + (0,) + tuple(reading[['end_lat', 'end_lon']]) + (1,)   
    print i
    print data 
    osrm_request = map_request.format(*data)
    osrm_response = requests.get(osrm_request).json()
    print osrm_response
    total_calls += 1
    if total_calls > 100:
        break

In [ ]:
for i, reading in readings.iterrows():
    data = tuple(reading[['start_lat', 'start_lon']]) + (0,) + tuple(reading[['end_lat', 'end_lon']]) + (1,)   
    osrm_request = map_request.format(*data)
    osrm_response = requests.get(osrm_request).json()
    print osrm_response
    readings.loc[i, ['snapped_start_lat', 'snapped_start_lon']] = osrm_response['matchings'][0]['matched_points'][0]
    break

In [ ]:
osrm_request = map_request.format(*(tuple(reading[['start_lat', 'start_lon']]) + (0,) +
                                    tuple(reading[['end_lat', 'end_lon']]) + (1,)))

In [ ]:
(0, 4) + (0,5,9)

In [ ]:
# This does not work because OSRM does not accept floats as times.
a_reading = readings.iloc[0]  # pick an arbitrary reading to experiment with
test_map_request = map_request.format(*tuple(a_reading[['start_lat', 'start_lon', 'start_time',
                                                        'end_lat', 'end_lon', 'end_time']]))
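
One way around the float-time problem is to cast the reading times to whole seconds before formatting the request. This is a minimal sketch that assumes start_time and end_time are values pandas can parse as timestamps.

In [ ]:
# Hypothetical sketch: convert the reading times to integer Unix seconds so OSRM accepts them.
start_t = int(pd.Timestamp(a_reading['start_time']).value // 10**9)
end_t = int(pd.Timestamp(a_reading['end_time']).value // 10**9)
test_map_request = map_request.format(a_reading['start_lat'], a_reading['start_lon'], start_t,
                                      a_reading['end_lat'], a_reading['end_lon'], end_t)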

In [ ]:
test_map_request = map_request.format(a_reading['start_lat'],
                                      a_reading['start_lon'], 
                                      0,
                                      a_reading['end_lat'],
                                      a_reading['end_lon'],
                                      1)

In [ ]:
print test_map_request
print requests.get(test_map_request).json()