In [1]:
# Make the local `open_cp` package importable without installing it.
import os.path
import sys

_project_root = os.path.abspath(os.path.join("..", ".."))
sys.path.insert(0, _project_root)
The TIGER/Line dataset from the US Census bureau provides a great resource for getting block-level address data.
It is also possible to extract address details from OpenStreetMap. In particular, for Chicago, there are a very large number of individual buildings, with addresses, available. However, I have also noticed that the OSM data is not as complete as the US Census dataset (for example, minor roads missing names, even when those roads appear in the crime dataset).
The Census data from 2013 seems good to use. The 2016 data seems to be more "aggregated": it is easy to find examples of a single block which is split into, say, 3 paths in the geometry. In the 2013 data each part will contain a range of addresses, while in the 2016 data, only one part contains all the addresses, the other parts having "None" as the address range.
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import pandas as pd
# Path to the 2013 TIGER/Line "edges" shapefile for county 17031
# (Cook County, which contains Chicago).
#filename = os.path.join("/media", "disk", "tl_2013_17031_edges", "tl_2013_17031_edges.shp")
filename = os.path.join("..", "..", "..", "..", "..", "Data", "tl_2013_17031_edges", "tl_2013_17031_edges.shp")
In [3]:
# Load the TIGER/Line edges shapefile (road segments with address-range attributes).
edges = gpd.read_file(filename)
In [4]:
# Inspect the coordinate reference system of the shapefile.  It has a slightly
# curious description, but it is a lon/lat CRS (set explicitly to EPSG:4269 below).
edges.crs
Out[4]:
In [5]:
# Inspect one example edge record.  (.ix was removed from pandas; with the
# default integer index, .loc performs the same label-based lookup.)
edges.loc[12]
Out[5]:
We only care about the columns "geometry" and "FULLNAME" (giving the road name), plus the address-range columns "LFROMADD", "LTOADD", "RFROMADD" and "RTOADD".
In [6]:
# Keep only the columns we need.  Use a list, not a set: iterating a set gives
# a nondeterministic column order, and plain column selection also preserves
# the GeoDataFrame type and its geometry column.
want = ["geometry", "FULLNAME", "LFROMADD", "LTOADD", "RFROMADD", "RTOADD"]
edges = edges[want]
# NOTE(review): the {'init': ...} CRS form is deprecated in newer pyproj, but
# it is used consistently throughout this notebook.
edges.crs={'init': 'epsg:4269'}
edges.head()
Out[6]:
In [7]:
import open_cp.sources.chicago as chicago
# Load the Chicago crime data as a GeoDataFrame of points, dropping rows
# with any missing values.
frame = chicago.load_to_geoDataFrame().dropna()
In [8]:
# Reproject the crime points to EPSG:3435 (Illinois East state plane;
# preserve_units keeps the native unit — presumably feet, TODO confirm).
frame = frame.to_crs({'init': 'epsg:3435', "preserve_units":True})
In [9]:
# Reproject the edges to the same projected CRS as the crime points.
edges = edges.to_crs({'init': 'epsg:3435', "preserve_units":True})
In [111]:
# Snap each crime point's coordinates to the nearest integer (presumably to
# merge near-duplicate locations — TODO confirm intent).  The redundant
# re-import of numpy is dropped: `np` is already imported in the setup cell.
import shapely.geometry

xcs = frame.geometry.map(lambda pt : pt.coords[0][0])
ycs = frame.geometry.map(lambda pt : pt.coords[0][1])
frame.geometry = [shapely.geometry.Point(np.round(x), np.round(y)) for x,y in zip(xcs, ycs)]
In [8]:
# Work with the crimes at a single address block as a small test case.
first_address = frame.address.unique()[0]
one = frame[frame.address == first_address].copy()
one.head()
Out[8]:
In [9]:
def find_match_via_distance(point):
    """Return the row of `edges` closest to `point` by an exhaustive distance scan."""
    dist = edges.geometry.distance(point)
    # BUG FIX: the original did `edges.ix[dist.argmin]` — `argmin` was never
    # called (a bound method was passed as the indexer), and `.ix` has been
    # removed from pandas.  `idxmin()` returns the index label of the minimum.
    return edges.loc[dist.idxmin()]

def via_distance(one):
    """Map every point geometry in `one` to the name (index) of its nearest edge."""
    return [ find_match_via_distance(point).name for point in one.geometry ]

one["edge_index"] = via_distance(one)
one.head()
Out[9]:
In [10]:
def find_match_via_intersection(point, radius=0.001):
    """Return the edge nearest to `point`, considering only edges which
    intersect a buffer of size `radius` (in CRS units) around the point."""
    possibles = edges[edges.geometry.intersects( point.buffer(radius) )]
    # idxmin() gives the index label of the closest candidate; the original
    # used the deprecated argmin() and the removed .ix accessor.
    i = possibles.geometry.distance(point).idxmin()
    return edges.loc[i]

def via_intersection(one):
    """Map every point geometry in `one` to its nearest edge, via the buffer method."""
    return [ find_match_via_intersection(point).name
            for point in one.geometry ]

one["edge_index1"] = via_intersection(one)
one.head()
Out[10]:
In [11]:
# Sanity check: the distance scan and the buffered-intersection method agree.
np.all(one.edge_index == one.edge_index1)
Out[11]:
We'll use an R-Tree. Building the initial index takes a bit of time, but it's vastly quicker once you have the index.
In [12]:
import rtree
In [13]:
# Pad each edge's bounding box by `gap` so points just outside a box still
# find that edge as a candidate.
gap = 0.001

def gen():
    """Yield (id, padded_bounds, obj) triples for bulk-loading the R-tree."""
    for i, row in edges.iterrows():
        minx, miny, maxx, maxy = row.geometry.bounds
        yield i, (minx - gap, miny - gap, maxx + gap, maxy + gap), None

idx = rtree.index.Index(gen())
In [24]:
import collections

# Sentinel exposing the same `.name` attribute as an edge row; name == -1
# flags "no match found" instead of raising.  Built once, rather than
# re-creating the namedtuple class inside the function on every miss.
_NoMatch = collections.namedtuple("Error", ["name"])(name=-1)

def find_match_via_rtree(point):
    """Return the edge nearest to `point`, using the R-tree index to prune
    the candidate set before the exact distance computation."""
    possibles = edges.loc[list(idx.intersection(point.coords[0]))]
    if len(possibles) == 0:
        # No padded bounding box contains the point.
        return _NoMatch
    # idxmin() gives the index label of the closest candidate; the original
    # used the deprecated argmin() and the removed .ix accessor.
    i = possibles.geometry.distance(point).idxmin()
    return edges.loc[i]

def via_rtree(one):
    """Map every point geometry in `one` to its nearest edge, via the R-tree."""
    return [ find_match_via_rtree(point).name
            for point in one.geometry ]

one["edge_index2"] = via_rtree(one)
one.head()
Out[24]:
In [15]:
# Sanity check: the R-tree method agrees with the intersection method.
all(one.edge_index1 == one.edge_index2)
Out[15]:
In [16]:
%timeit(via_distance(one))
In [17]:
%timeit(via_intersection(one))
In [18]:
%timeit(via_rtree(one))
In [17]:
# Cross-check all three matching strategies on a sample of 100 crime points.
# (.ix is removed in modern pandas; .loc does the same label-based lookup.)
for i in frame.index[100:200]:
    pt = frame.loc[i].geometry
    a = find_match_via_distance(pt).name
    b = find_match_via_intersection(pt).name
    c = find_match_via_rtree(pt).name
    assert a == b
    assert b == c
In [25]:
# Match every crime point to its nearest edge, using the fastest (R-tree) method.
frame["edge_index"] = via_rtree(frame)
In [28]:
# Locations of the crimes for which no candidate edge was found (sentinel -1).
frame[frame.edge_index == -1].location.unique()
Out[28]:
In [30]:
# Discard the unmatched points.
frame = frame[frame.edge_index != -1].copy()
In [35]:
# Table mapping each crime's case number to its matched edge index.
lookup = pd.DataFrame({"case": frame.case, "edge":frame.edge_index})
lookup.head()
Out[35]:
In [36]:
# Persist the case -> edge mapping so it can be reloaded without re-matching.
lookup.to_csv("edge_lookup.csv")
In [ ]:
In [ ]:
'0000X E GRAND AVE' Seems to be an error?
'028XX N MILWAUKEE AVE' is an example where the closest point method just gets it wrong. Here it would be appropriate to then look at all the edges, select all those in the correct address range, and so on.
'080XX S PULASKI RD' Nothing wrong, but we miss a part of the road (so again, join with all edges...)
'020XX E 71ST ST' A genuinely split road (east and west lanes). We also pick up an "alley". This could be discarded without issue.
'026XX W NORTH AVE' In the "unbalanced aspect ratio" plot, the points don't match the edge.
In [8]:
# Reload the saved case -> edge mapping.
lookup = pd.read_csv("edge_lookup.csv", names = ["case", "edge"], header=0)
# Convert to a plain dict.  dict(zip(...)) is far faster than the original
# iterrows loop and has identical duplicate-key semantics: later rows win.
# This breaks things for "HOMICIDE" crime (duplicate case numbers), but we
# don't care here.
lookup = dict(zip(lookup["case"], lookup["edge"]))
In [403]:
# Pick an example address block to inspect.
block = frame.address.unique()[132]
block
Out[403]:
In [404]:
# The set of distinct edges matched by crimes at this address block
# (set comprehension replaces the manual accumulation loop).
edge_ids = {lookup[case] for case in frame[frame.address == block].case}
edge_ids
Out[404]:
In [405]:
# Show the matched edge records.  (.ix is removed, and modern pandas does not
# accept a set as an indexer, so convert to a list for .loc.)
edges.loc[list(edge_ids)]
Out[405]:
In [406]:
def get_xy(frame):
    """Split a frame of point geometries into parallel x and y coordinate sequences."""
    xs = frame.geometry.map(lambda pt : pt.coords[0][0])
    ys = frame.geometry.map(lambda pt : pt.coords[0][1])
    return xs, ys
# Plot the matched edges (black) with the crime points (blue crosses):
# left panel auto-scaled, right panel forced to a 1:1 aspect ratio so the
# street geometry is not distorted.  (.ix replaced by .loc with a list,
# as .ix is removed and set indexers are unsupported in modern pandas.)
fig, axes = plt.subplots(ncols=2, figsize=(16,8))
for ax in axes:
    edges.loc[list(edge_ids)].plot(ax=ax, color="black")
    ax.scatter(*get_xy(frame[frame.address == block]), color="blue", marker="x")
axes[1].set_aspect(1)
None
In [ ]: