In [ ]:
import datetime
import functools
from imposm.parser import OSMParser
from matplotlib import collections as mc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle as pkl
import pyproj
import requests
import scipy as sp
import rtree
from scipy import signal
import shapely.geometry
%pylab inline
import data_munging
In [ ]:
# Work out of the project source directory so the relative paths below resolve.
os.chdir(data_munging.src_dir)
# Metro area selects which .osm.pbf extract and index files we operate on.
metro_area = 'chicago_illinois'
# All raw/derived data files live here, relative to the source directory.
data_dir = '../dat/'
We iterate through an OSM file and collect all of its segments so that we can match readings against them later. We should be filtering out highways, but we are not currently doing so.
In [ ]:
# simple class that handles the parsed OSM data.
# simple class that handles the parsed OSM data.
class HighwayCounter(object):
    """Collects the OSM ids of all ways that carry a ``highway`` tag.

    Intended to be registered as the ``ways_callback`` of an imposm
    ``OSMParser``.
    """

    def __init__(self):
        # Instance-level set: the original class-level attribute was shared
        # by every instance (and across notebook re-runs), accumulating ids.
        self.highways = set()

    def ways(self, ways):
        """Parser callback: record the osm id of each highway-tagged way."""
        for osmid, tags, refs in ways:
            if 'highway' in tags:
                self.highways.add(osmid)
class SegmentFinder(object):
    """Records every consecutive pair of node refs from each parsed way.

    NOTE: ``segments`` is deliberately class-level; a later cell reads
    ``SegmentFinder.segments`` directly, so it must stay on the class.
    """

    segments = []

    def ways(self, ways):
        """Parser callback: append one (ref, next_ref) tuple per way edge."""
        for osmid, tags, refs in ways:
            # Pair each node ref with its successor along the way.
            for start_ref, end_ref in zip(refs, refs[1:]):
                self.segments.append((start_ref, end_ref))
class CoordsFinder(object):
    """Maps node osm id -> (lon, lat) from the parser's coords stream.

    Intended to be registered as the ``coords_callback`` of an imposm
    ``OSMParser``.
    """

    def __init__(self):
        # Instance-level dict: the original class-level attribute was shared
        # by every instance, so separate parses would pollute each other.
        self.coords_dict = dict()

    def coords(self, coords):
        """Parser callback: store each node's (lon, lat) by its osm id."""
        for osmid, lon, lat in coords:
            self.coords_dict[osmid] = (lon, lat)
class NodeCoords(object):
    """Maps node osm id -> coords tuple from the parser's nodes stream.

    Intended to be registered as the ``nodes_callback`` of an imposm
    ``OSMParser``.
    """

    def __init__(self):
        # Instance-level dict: the original class-level attribute was shared
        # by every instance of NodeCoords.
        self.node_coords = dict()

    def nodes(self, nodes):
        """Parser callback: store each node's coords by its osm id."""
        for osmid, tags, coords in nodes:
            self.node_coords[osmid] = coords
# instantiate counter and parser and start parsing
all_segments = SegmentFinder()
some_coords = NodeCoords()
all_coords = CoordsFinder()
all_highways = HighwayCounter()
# First pass: collect node coordinates and way segments.
# NOTE(review): no nodes_callback is registered for some_coords here, so
# some_coords.node_coords presumably stays empty -- confirm before relying
# on it (a later cell prints its length).
p = OSMParser(concurrency=4,
              coords_callback=all_coords.coords,
              ways_callback=all_segments.ways)
p.parse(data_dir + metro_area + '.osm.pbf')
# Second pass over the same extract: collect the ids of highway-tagged ways.
p_highways = OSMParser(concurrency=4,
                       ways_callback=all_highways.ways)
p_highways.parse(data_dir + metro_area + '.osm.pbf')
In [ ]:
# Inventory the data directory (uses data_dir for consistency with the
# setup cell; the original hard-coded '../dat').
data_files = set(os.listdir(data_dir))
# Parenthesized print works under both Python 2 and 3 for a single argument.
print(data_files)
In [ ]:
# rtree_idx_to_osm_data = dict()
In [ ]:
# An rtree index persists as a pair of files (.idx and .dat); if either is
# present we assume the index was already built on a previous run.
previous_index_exists = (os.path.isfile(data_dir + metro_area + '.idx')
                         or os.path.isfile(data_dir + metro_area + '.dat'))
print(previous_index_exists)
In [ ]:
# Open (or create) the persistent rtree index for this metro area.
# Uses data_dir + metro_area for consistency with the rest of the notebook
# (the original hard-coded '../dat/chicago_illinois').
segments_idx = rtree.index.Index(data_dir + metro_area)
if not previous_index_exists:
    total_failures = 0
    total_tries = 0
    # Iterate via the populated collections from the parse above.  The
    # original read SegmentFinder.segments (the class attribute); since the
    # instance shares that list, all_segments.segments is equivalent and
    # clearer.
    for unique_index, segment in enumerate(all_segments.segments):
        total_tries += 1
        # NOTE(review): all_highways.highways holds *way* osm ids, while
        # segment[0]/segment[1] are *node* refs, so this test presumably
        # filters nothing -- consistent with the caveat in the markdown
        # above ("we should be filtering out highways, but we're not").
        if segment[0] not in all_highways.highways and segment[1] not in all_highways.highways:
            try:
                lon0, lat0 = all_coords.coords_dict[segment[0]]
                lon1, lat1 = all_coords.coords_dict[segment[1]]
                # Project lon/lat into the NAD83 plane used for matching.
                coord0 = data_munging.NAD83(lon0, lat0)
                coord1 = data_munging.NAD83(lon1, lat1)
                segment_data = {'segment_ids': segment,
                                'shapely_line': shapely.geometry.LineString([coord0, coord1]),
                                'lonlat': ((lon0, lat0), (lon1, lat1))}
                segments_idx.insert(id=unique_index,
                                    coordinates=data_munging.coords_to_bb(coord0, coord1),
                                    obj=segment_data)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                # still propagate; failures (e.g. a segment endpoint with no
                # recorded coords) are counted and reported in the next cell.
                total_failures += 1
else:
    print('It appears as though this has already been indexed.')
In [ ]:
segments_idx = rtree.index.Index('../chicago_illinois')
In [ ]:
print (total_failures, total_tries)
In [ ]:
# Parenthesized single-argument prints are valid under both Python 2 and 3.
print('This is a sanity check to make sure that our bounds look reasonable.')
print(segments_idx.bounds)
In [ ]:
rides, readings = data_munging.read_raw_data()
In [ ]:
# This is here temporarily both to create a static dataset and because
# some of the newer android readings are bogus and need to be removed.
# Boolean-mask selection is equivalent to .loc[mask, :] for whole-row picks.
rides = rides[rides['created_at'] < '2016-03-01']
readings = readings[readings['created_at'] < '2016-03-01']
In [ ]:
# Drop malformed readings, then attach NAD83-projected x/y columns.
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)
In [ ]:
# here we are dropping all the readings with weird speeds as they probably will not provide a lot of information
# NOTE(review): the upper bound tests gps_mph but the lower bound tests
# gps_speed -- presumably both are meant to be the same unit; confirm the
# column semantics in data_munging before trusting this filter.
readings = readings.loc[np.logical_and(readings.gps_mph < 30, readings.gps_speed > 4), :]
Here we grab all the bounding boxes so we can do statistics on them and figure out how much we need to expand them in order to do reasonable index matching.
In [ ]:
# Collect bounding-box statistics for every reading so we can decide how
# much expansion is needed for sensible index matching.
bb_areas = []
bb_width = []
bb_height = []
bb_max = []
for _, reading in readings.iterrows():
    the_bb = data_munging.reading_to_bb(reading)
    bb_areas.append(data_munging.area_of_bb(the_bb))
    width = np.abs(reading.start_x - reading.end_x)
    height = np.abs(reading.start_y - reading.end_y)
    bb_width.append(width)
    bb_height.append(height)
    # Larger of the two side lengths for this reading's box.
    bb_max.append(np.max([width, height]))
In [ ]:
print the_bb
In [ ]:
# Distribution of bounding-box areas across readings.
plt.hist(bb_areas)
plt.show()
In [ ]:
# Width vs. height of each reading's bounding box.
plt.scatter(bb_width, bb_height)
plt.show()
In [ ]:
# Distribution of the longer side of each bounding box.
plt.hist(bb_max)
plt.show()
In [ ]:
osm_seg_to_data = dict()
In [ ]:
# Columns for the matched OSM segment endpoints; NaN means "no match yet".
readings['osm_node_0'] = np.NaN
readings['osm_node_1'] = np.NaN
In [ ]:
# For each reading, find the nearest OSM segment among the index candidates
# that intersect the reading's (expanded) bounding box.
readings_to_osm_segments = dict()
for i, reading in readings.iterrows():
    the_bb = data_munging.reading_to_bb(reading)
    expanded_bb = data_munging.expand_bb(the_bb, 3.0)
    line_string = shapely.geometry.LineString([[reading.start_x, reading.start_y],
                                               [reading.end_x, reading.end_y]])
    closest_segment = None
    closest_dist = np.inf
    for segment in segments_idx.intersection(expanded_bb, objects=True):
        dist = segment.object['shapely_line'].distance(line_string)
        # Bug fix: closest_dist was never updated in the original, so the
        # *last* candidate from the index always won rather than the nearest.
        if dist < closest_dist:
            closest_dist = dist
            closest_segment = segment.object['segment_ids']
            if closest_segment not in osm_seg_to_data:
                osm_seg_to_data[closest_segment] = segment.object
    # closest_segment stays None when the index returned no candidates.
    readings_to_osm_segments[reading.id] = closest_segment
    if closest_segment is not None:
        # .loc avoids chained assignment (the original wrote through
        # readings.osm_node_0[i], which may silently modify a copy) and
        # replaces the bare except whose "handler" was a no-op string.
        readings.loc[i, 'osm_node_0'] = closest_segment[0]
        readings.loc[i, 'osm_node_1'] = closest_segment[1]
In [ ]:
readings_by_segment = readings.groupby(['osm_node_0', 'osm_node_1']).size().sort_by_values()
In [ ]:
readings_to_osm_segments[588]
In [ ]:
a_shapely = osm_seg_to_shapely[(2207607827L, 2207607826L)]
In [ ]:
list(a_shapely.coords)
In [ ]:
# Plot every matched segment.  Bug fixes: osm_seg_to_shapely is never
# defined (the lines live in osm_seg_to_data under 'shapely_line'), and the
# original passed plt.plot a single nested list, which matplotlib treats as
# two y-series plotted against their index instead of an x/y line.
for segment_data in osm_seg_to_data.values():
    coords = list(segment_data['shapely_line'].coords)
    plt.plot([coords[0][0], coords[1][0]],
             [coords[0][1], coords[1][1]])
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
In [ ]:
# Tally how many readings got a segment match (None means no candidate was
# found within the expanded bounding box).
total_matches = 0
total_misses = 0
for matched_segment in readings_to_osm_segments.values():
    if matched_segment is None:
        total_misses += 1
    else:
        total_matches += 1
In [ ]:
total_misses
In [ ]:
pkl.dumps(readings_to_osm_segments, open('readings_to_segments_chi.pkl', 'wb'))
In [ ]:
reading['id']
In [ ]:
# NOTE(review): scratch cell that cannot run as written -- total_bb is never
# defined anywhere in this notebook, and segments_idx.intersection returns a
# generator, so len() on it would raise TypeError (it would need list(...)).
# .ix is also long-deprecated pandas indexing.  Left as-is pending intent.
for i in total_bb:
    for col in total_bb:
        the_bb = data_munging.reading_to_bb(readings.ix[i, :], 1)
        expand_bb = data_munging.expand_bb(readings.ix[i, :])
        total_bb[i, col] = len(segments_idx.intersection(data_munging.reading_to_bb(readings.ix[i,:])))
In [ ]:
print(len(all_segments.segments))
# NOTE(review): the parsers above register only coords/ways callbacks, so
# some_coords.node_coords is presumably still empty here -- confirm.
print(len(some_coords.node_coords))
In [ ]:
# Reload and re-clean the data from scratch (fresh start for the OSRM
# map-matching experiments below).
rides, readings = data_munging.read_raw_data()
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)
In [ ]:
In [ ]:
readings.loc[0, ['num_accel_x', 'num_accel_y', 'num_accel_z', 'num_accel_total']]
In [ ]:
# Parenthesized single-argument prints are valid under both Python 2 and 3.
print(rides.shape)
print(readings.shape)
# n = number of readings, p = number of columns.
n, p = readings.shape
In [ ]:
# URL templates for a local OSRM server: nearest-node lookup and two-point
# map matching (loc=lat,lon with integer timestamps t).
nearest_request = 'http://127.0.0.1:5000/nearest?loc={0},{1}'
map_request = 'http://127.0.0.1:5000/match?loc={0},{1}&t={2}&loc={3},{4}&t={5}'
In [ ]:
map_request
In [ ]:
# Initialize the map-matched ("snapped") coordinate columns to 0.
for endpoint_col in ('start_lat', 'start_lon', 'end_lat', 'end_lon'):
    readings['snapped_' + endpoint_col] = 0
In [ ]:
osrm_response['matchings'][0]['matched_points']
In [ ]:
# Probe the OSRM /nearest endpoint with the first ~100 reading start points.
total_calls = 0
for i, reading in readings.iterrows():
    # Bug fix: the original appended "+ 9" to the tuple, which raises
    # TypeError (cannot add int to tuple); nearest_request takes lat, lon.
    data = tuple(reading[['start_lat', 'start_lon']])
    print(i)
    print(data)
    osrm_request = nearest_request.format(*data)
    osrm_response = requests.get(osrm_request).json()
    print(osrm_response)
    total_calls += 1
    if total_calls > 100:
        break
In [ ]:
# Probe the OSRM /match endpoint for the first ~100 readings, pairing each
# start and end point with dummy integer timestamps 0 and 1.
total_calls = 0
for i, reading in readings.iterrows():
    start_pair = tuple(reading[['start_lat', 'start_lon']])
    end_pair = tuple(reading[['end_lat', 'end_lon']])
    data = start_pair + (0,) + end_pair + (1,)
    print(i)
    print(data)
    osrm_response = requests.get(map_request.format(*data)).json()
    print(osrm_response)
    total_calls += 1
    if total_calls > 100:
        break
In [ ]:
# Snap a single reading via OSRM and write the matched start point back into
# the readings frame.  The unconditional break means only the FIRST row is
# processed -- this is a one-off trial run.
# NOTE(review): this raises KeyError/IndexError if the response contains no
# 'matchings'; guard before running over the full table.
for i, reading in readings.iterrows():
    data = tuple(reading[['start_lat', 'start_lon']]) + (0,) + tuple(reading[['end_lat', 'end_lon']]) + (1,)
    osrm_request = map_request.format(*data)
    osrm_response = requests.get(osrm_request).json()
    print osrm_response
    readings.loc[i, ['snapped_start_lat', 'snapped_start_lon']] = osrm_response['matchings'][0]['matched_points'][0]
    break
In [ ]:
osrm_request = map_request.format(*reading[['start_lat', 'start_lon']], 0)
In [ ]:
(0, 4) + (0,5,9)
In [ ]:
# This does not work because OSRM does not accept floats as times.
# NOTE(review): a_reading is never defined in this notebook -- presumably a
# single row of readings from an interactive session; confirm before re-use.
test_map_request = map_request.format(*tuple(a_reading[['start_lat', 'start_lon', 'start_time',
                                                        'end_lat', 'end_lon', 'end_time']]))
In [ ]:
# Working variant: substitute integer timestamps 0 and 1, since OSRM rejects
# the float start/end times used in the previous cell.
test_map_request = map_request.format(a_reading['start_lat'],
                                      a_reading['start_lon'],
                                      0,
                                      a_reading['end_lat'],
                                      a_reading['end_lon'],
                                      1)
In [ ]:
# Parenthesized single-argument prints are valid under both Python 2 and 3.
print(test_map_request)
print(requests.get(test_map_request).json())