In [1]:
import sys, os
sys.path.insert(0, os.path.join("..", ".."))

Into a graph

Explore putting the geometric data into a graph data structure. We need to check that the input geometry reasonably respects the "graph/network nature" of the road system.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

import geopandas as gpd
import os
import descartes
import numpy as np

import open_cp.network
import open_cp.plot
import matplotlib.collections

UK Ordnance Survey data


In [4]:
ordnancedir = os.path.join("/media", "disk", "OS_OpenMap")
#ordnancedir = os.path.join("..", "..", "..", "..", "..", "Data", "OS_OpenMap")
openroadsdir = os.path.join(ordnancedir, "oproad_essh_gz", "data")
file = "SE_RoadLink.shp"
se_frame = gpd.GeoDataFrame.from_file(os.path.join(openroadsdir, file))
se_frame.head()


Out[4]:
class endNode fictitious formOfWay function geometry identifier length loop name1 name1_lang name2 name2_lang nameTOID numberTOID primary roadNumber startNode structure trunkRoad
0 Not Classified 0CFF70C2-9B29-472E-8A27-CCEA9997037B false Single Carriageway Restricted Local Access Road LINESTRING Z (498830.04 441139.46 0, 499026.35... EA197664-1E58-443D-9370-7C0CD5178DA1 394 false None None None None None None false None A96F9634-8C61-4CF4-9AD0-C8EADA3D7A5B None false
1 Unclassified 7D94180C-9D5B-4793-BD55-8BEA91240B9C false Single Carriageway Minor Road LINESTRING Z (498926.33 439826.29 0, 498919.15... 9C3EA9EC-5EBB-4487-BEC6-7AC72DA0C31A 57 false Pudding Gate None None None osgb4000000010952647 None false None 6B9832C7-0778-4ECA-9EC5-C67C700FDFED None false
2 Unclassified 0EE39E69-E9D6-4D3A-92A3-E173B28B3C1B false Single Carriageway Minor Road LINESTRING Z (498911.98 439771.47 0, 498917.32... CA83540D-83BF-4204-BADE-DEC248606D64 17 false Joby Lane None None None osgb4000000010968479 None false None 7D94180C-9D5B-4793-BD55-8BEA91240B9C None false
3 Unclassified D00D9ABE-2FE6-41CA-A56C-C594B9777AEB false Single Carriageway Minor Road LINESTRING Z (498922.66 439757.84 0, 498928.97... E3E62810-76D5-42B0-89D5-69721D2F719B 72 false Joby Lane None None None osgb4000000010968479 None false None 0EE39E69-E9D6-4D3A-92A3-E173B28B3C1B None false
4 Unclassified 6B9832C7-0778-4ECA-9EC5-C67C700FDFED false Single Carriageway Local Road LINESTRING Z (498922.66 439757.84 0, 498944.29... F2867AC1-AB45-4E54-825B-B1CF4FD90AED 123 false None None None None None None false None 0EE39E69-E9D6-4D3A-92A3-E173B28B3C1B None false

We notice that each edge has a UUID-like identifier for the start and end node.

  • Does the said ID always correspond to exactly the same coordinate? YES!
  • Does exactly the same coordinate always correspond to the same ID? YES!
  • Can we have exact repeats of the coordinates of interior nodes to a line? YES!
    • However, in the examples which I've look at, these are also things like over/under passes

However, we find that some nodes are very, very close together.

So it is safe to use the "graph structure" as is, without processing.


In [5]:
node_lookup = dict()
for endid, line in zip(se_frame.endNode, se_frame.geometry):
    x,y,z = list(line.coords)[-1]
    if endid in node_lookup:
        if not (x,y) == node_lookup[endid]:
            print(endid, node_lookup[endid], x, y)
            raise AssertionError()
    node_lookup[endid] = (x,y)
    
for startid, line in zip(se_frame.startNode, se_frame.geometry):
    x,y,z = list(line.coords)[0]
    if startid in node_lookup:
        if not (x,y) == node_lookup[startid]:
            print(startid, node_lookup[startid], x, y)
            raise AssertionError()
    node_lookup[startid] = (x,y)

In [6]:
node_lookup = dict()
for endid, line in zip(se_frame.endNode, se_frame.geometry):
    x,y,z = list(line.coords)[-1]
    if (x,y) in node_lookup:
        if not endid == node_lookup[(x,y)]:
            print(endid, node_lookup[(x,y)], x, y)
            raise AssertionError()
    node_lookup[(x,y)] = endid

for startid, line in zip(se_frame.startNode, se_frame.geometry):
    x,y,z = list(line.coords)[0]
    if (x,y) in node_lookup:
        if not startid == node_lookup[(x,y)]:
            print(startid, node_lookup[(x,y)], x, y)
            raise AssertionError()
    node_lookup[(x,y)] = startid

In [7]:
int_nodes = set()
end = False
for i, line in enumerate(se_frame.geometry):
    lines = list(line.coords)[1:-1]
    for x,y,z in lines:
        if (x,y) in int_nodes:
            print(i, "-->", x, y)
            end = True
        int_nodes.add((x,y))
    if end:
        break


444 --> 493875.67 429181.32

In [8]:
se_frame.ix[444]


Out[8]:
class                                     Classified Unnumbered
endNode                    D4B06C6C-AD07-4F61-8D74-EBB2EEA84D82
fictitious                                                false
formOfWay                                    Single Carriageway
function                                             Minor Road
geometry      LINESTRING Z (494007.1 429346.87 0, 494003.37 ...
identifier                 0F871560-D4A4-4915-B5A8-6A3CEF95087C
length                                                      265
loop                                                      false
name1                                               The Outgang
name1_lang                                                 None
name2                                                      None
name2_lang                                                 None
nameTOID                                   osgb4000000010931311
numberTOID                                                 None
primary                                                   false
roadNumber                                                 None
startNode                  2ABEE995-88EF-43A1-A9A3-E39719B34996
structure                                                  None
trunkRoad                                                 false
Name: 444, dtype: object

Into our code

  • Put the geometry into our graph class
  • Project some random points onto the network.

In [9]:
b = open_cp.network.PlanarGraphGeoBuilder()
for line in se_frame.geometry:
    b.add_path(line.coords)

In [10]:
graph = b.build()

In [11]:
fig, ax = plt.subplots(figsize=(12,12))

lc = matplotlib.collections.LineCollection(graph.as_lines(), color="black", linewidth=0.5)
ax.add_collection(lc)

nodes = list(graph.vertices.values())
nodes = np.asarray(nodes)
ax.scatter(nodes[:,0], nodes[:,1])

xc, yc = 493875.67, 429181.32
si = 1000
ax.set(xlim=[xc - si, xc + si], ylim=[yc - si, yc + si])
None


Optionally save

Save the file in our format (bzip2 compressed json where the underlying data is saved using numpy to be cross-platform compatible.)


In [12]:
b = graph.dump_bytes()
with open("OS_Yorkshire.graph", "wb") as f:
    f.write(b)

For US TIGER/Lines data

Here we have a lot less to go on: basically just geometry and a "name" for each edge.

What could go wrong if we form a graph in the same way:

  • Perhaps edges which are meant to join up have slightly different x,y coords for the start/end points? Answer: Yes! But only by "slightly"
  • In looking at this, we also find genuine points of intersection which are not the start/end points of lines.
  • So it's almost the exact opposite of the UK data...!

In [13]:
tiger_path = os.path.join("/media", "disk", "TIGER Data")
#iger_path = os.path.join("..", "..", "..", "..", "..", "Data", "TIGER Data")
filename = os.path.join(tiger_path, "tl_2016_17031_roads")
tiger_frame = gpd.GeoDataFrame.from_file(filename)
chicago = tiger_frame.to_crs({"init":"epsg:3528"})
chicago.head()


Out[13]:
FULLNAME LINEARID MTFCC RTTYP geometry
0 47th Pl Exd 110380277026 S1400 M LINESTRING (361283.8098974457 571774.357903373...
1 Golden Spr 110380298305 S1400 M LINESTRING (334283.9373028304 555952.566653928...
2 Edens Expy Spr 1104259027148 S1100 M LINESTRING (338674.4822622131 608835.390285893...
3 Edens Expy Spr 1104259564382 S1100 M LINESTRING (341418.6466204406 608352.224489629...
4 Edens Expy Spr 1104472109755 S1100 M LINESTRING (337922.7936322958 609174.983032155...

In [14]:
import pyproj
proj = pyproj.Proj({"init":"epsg:3528"})

Look at start/end nodes

We find that these can fail to be identical while clearly actually needing to join.


In [15]:
start_end_nodes = []
for geo in chicago.geometry:
    lines = list(geo.coords)
    start_end_nodes.append(lines[0])
    start_end_nodes.append(lines[-1])
start_end_nodes = np.asarray(start_end_nodes)
start_end_nodes.shape


Out[15]:
(158700, 2)

In [16]:
dists = []
odd_points = []
for i, pt in enumerate(start_end_nodes):
    diffs = start_end_nodes[i+1:, :] - pt[None,:]
    distsq = diffs[:,0]**2 + diffs[:,1]**2
    mask = (distsq < 1) & (distsq > 0)
    dists.extend(distsq[mask])
    if np.any(mask) > 0:
        for j, m in enumerate(mask):
            if m:
                odd_points.append((i,j+i+1))
    if len(dists) > 1000000:
        raise Bob

In [17]:
# We looked at those cases where the distance was <1, so these small numbers show a clear "gap"
# (and are probably symptomatic of data errors due to rounding in the input data.)
min(dists), max(dists)


Out[17]:
(4.87890977618477e-19, 6.5899163296384566e-18)

Look at one example to see that the lines should indeed join.


In [18]:
odd_points[0]


Out[18]:
(50, 110545)

In [19]:
# 50 == start point of 25
# 110545 == end point of 110544//2
chicago.ix[[25, 110544//2]]


Out[19]:
FULLNAME LINEARID MTFCC RTTYP geometry
25 Cedar Glen Dr N 110380338074 S1400 M LINESTRING (364977.3143999526 540150.459265323...
55272 Cedar Glen Dr 1102154911164 S1640 M LINESTRING (364948.2723670056 540150.086583519...

In [20]:
list(chicago.ix[25].geometry.coords)[0], list(chicago.ix[110544//2].geometry.coords)[-1]


Out[20]:
((364977.3143999526, 540150.4592653238),
 (364977.3143999526, 540150.4592653245))

In [21]:
# Unprojected, shows an input rounding error
list(tiger_frame.ix[25].geometry.coords)[0], list(tiger_frame.ix[110544//2].geometry.coords)[-1]


Out[21]:
((-87.554755, 41.52960499999999), (-87.554755, 41.529605))

In [22]:
xc, yc = list(chicago.ix[25].geometry.coords)[0]
print("As lon lat:", proj(xc, yc, inverse=True))

fig, ax = plt.subplots(figsize=(8,8))

lines = open_cp.plot.lines_from_geometry(chicago.geometry)
lc = matplotlib.collections.LineCollection(lines, color="black", linewidth=0.5)
ax.add_collection(lc)

nodes = np.asarray(start_end_nodes)
ax.scatter(start_end_nodes[:,0], start_end_nodes[:,1])

si = 100
ax.set(xlim=[xc - si, xc + si], ylim=[yc - si, yc + si])
None


As lon lat: (-87.55475499999996, 41.52960499999997)

Look at all nodes

Interested in wherether there is always a node at a "junction"... Answer: yes.

(This is very slow to run...)


In [ ]:
nodes, all_nodes = [], []
for geo in chicago.geometry:
    for pt in geo.coords:
        all_nodes.append(pt)
nodes = list(set(all_nodes))
nodes = np.asarray(nodes)
all_nodes = np.asarray(all_nodes)
nodes.shape, all_nodes.shape

In [ ]:
dists = []
odd_points = []
for i, pt in enumerate(nodes):
    diffs = nodes[i+1:, :] - pt[None,:]
    distsq = diffs[:,0]**2 + diffs[:,1]**2
    assert not np.any(distsq==0)
    mask = (distsq < 1) #& (distsq > 0)
    dists.extend(distsq[mask])
    if np.any(mask) > 0:
        for j, m in enumerate(mask):
            if m:
                odd_points.append((i,j+i+1))
    if len(dists) > 1000000:
        raise Bob

In [ ]:
dists = np.asarray(dists)
small = dists[dists < 0.5]
print(min(small), max(small))
large = dists[dists >= 0.5]
print(min(large), max(large))

In [ ]:
xc, yc = list(chicago.ix[25].geometry.coords)[0]
print("As lon lat:", proj(xc, yc, inverse=True))

fig, ax = plt.subplots(figsize=(8,8))

lines = open_cp.plot.lines_from_geometry(chicago.geometry)
lc = matplotlib.collections.LineCollection(lines, color="black", linewidth=0.5)
ax.add_collection(lc)

dx = np.random.random(all_nodes.shape[0]) * 2
dy = np.random.random(all_nodes.shape[0]) * 2
ax.scatter(all_nodes[:,0]+dx, all_nodes[:,1]+dy, alpha=1, marker="+")

si = 100
ax.set(xlim=[xc - si, xc + si], ylim=[yc - si, yc + si])
None

Into our graph class

We discover that we end up with multiple edges between nodes. It's worth checking one of these to see why.


In [23]:
all_nodes = []
for geo in chicago.geometry:
    for pt in geo.coords:
        all_nodes.append(pt)
        
b = open_cp.network.PlanarGraphNodeOneShot(all_nodes)
for geo in chicago.geometry:
    path = list(geo.coords)
    b.add_path(path)
    
graph = b.build()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-f38ada7a4908> in <module>()
      9     b.add_path(path)
     10 
---> 11 graph = b.build()

/nfs/see-fs-02_users/matmdpd/Crime Predict Project/PredictCode/open_cp/network.py in build(self)
    174     def build(self):
    175         vertices = [ (key, x, y) for key, (x,y) in enumerate(self._nodes) ]
--> 176         return PlanarGraph(vertices, self._edges)
    177 
    178 

/nfs/see-fs-02_users/matmdpd/Crime Predict Project/PredictCode/open_cp/network.py in __init__(self, vertices, edges)
    702                 raise ValueError("Keys of vertices should be unique; but {} is repeated".format(key))
    703             verts[key] = (x,y)
--> 704         super().__init__(verts.keys(), edges)
    705         self._vertices = verts
    706         quads = self.as_quads().T

/nfs/see-fs-02_users/matmdpd/Crime Predict Project/PredictCode/open_cp/network.py in __init__(self, vertices, edges, lengths)
    398             e = frozenset((key1, key2))
    399             if e in edges_set:
--> 400                 raise ValueError("Trying to add a 2nd edge from {} to {}".format(key1, key2))
    401             edges_set.add(e)
    402             self._edges.append((key1, key2))

ValueError: Trying to add a 2nd edge from 500 to 501

In [24]:
proj(*all_nodes[28720], inverse=True), proj(*all_nodes[627010], inverse=True)


Out[24]:
((-87.91149899999999, 41.953738999999985), (-87.90249699999998, 42.064))

A bit of visual investigation with QGIS gives the following. We plot these with an offset: they are identical for time. I think this is due to the same physical road having two logical identities ("State Rte 19" and "Irving Park Rd") and so appearing in the data twice.


In [26]:
mask = (chicago["LINEARID"] == "1104469295862") | (chicago["LINEARID"] == "110380350397")
geo = chicago[mask].geometry
geo


Out[26]:
1982     LINESTRING (357103.5344025511 587295.603073253...
33248    LINESTRING (341172.3691466944 586896.633914537...
Name: geometry, dtype: object

In [27]:
fig, ax = plt.subplots(figsize=(12,6))

x = np.asarray(list(geo[33248].coords))
lc = matplotlib.collections.LineCollection([x])
ax.add_collection(lc)

x = np.asarray(list(geo[1982].coords)) + np.array([0,100])
lc = matplotlib.collections.LineCollection([x])
ax.add_collection(lc)

xmin, ymin, xmax, ymax = chicago[mask].total_bounds
d = 500
ax.set(xlim=[xmin-d, xmax+d], ylim=[ymin-d, ymax+d])
None



In [28]:
b.remove_duplicate_edges()
graph = b.build()

In [29]:
fig, ax = plt.subplots(figsize=(12,12))

lc = matplotlib.collections.LineCollection(graph.as_lines(), color="black", linewidth=0.5)
ax.add_collection(lc)

xmin, ymin, xmax, ymax = chicago.total_bounds
xd, yd = xmax - xmin, ymax - ymin
ax.set(xlim=(xmin-xd/20, xmax+xd/20), ylim=(ymin-yd/20, ymax+yd/20))
None



In [30]:
b = graph.dump_bytes()
with open("chicago.graph", "wb") as f:
    f.write(b)

TIGER/Lines edge data


In [31]:
tiger_path = os.path.join("/media", "disk", "TIGER Data")
#iger_path = os.path.join("..", "..", "..", "..", "..", "Data", "TIGER Data")
filename = os.path.join(tiger_path, "tl_2016_17031_edges")
tiger_frame = gpd.GeoDataFrame.from_file(filename)
tiger_frame = tiger_frame.to_crs({"init":"epsg:3528"})
chicago = gpd.GeoDataFrame(tiger_frame["FULLNAME"])
chicago.geometry = tiger_frame.geometry
chicago.crs = tiger_frame.crs
chicago.head()


Out[31]:
FULLNAME geometry
0 None LINESTRING (364405.5379445553 567268.128558744...
1 Great Lakes LINESTRING (363194.8858743247 567861.246061401...
2 None LINESTRING (358543.3791577177 585961.056174848...
3 None LINESTRING (357948.3889643896 588986.831090772...
4 None LINESTRING (363495.2114223191 568083.949034775...

Look at start and end nodes. Again, we find that these do not exactly match.


In [32]:
start_end_nodes = []
for geo in chicago.geometry:
    lines = list(geo.coords)
    start_end_nodes.append(lines[0])
    start_end_nodes.append(lines[-1])
start_end_nodes = np.asarray(start_end_nodes)
start_end_nodes.shape


Out[32]:
(611568, 2)

In [33]:
dists = []
odd_points = []
for i, pt in enumerate(start_end_nodes):
    diffs = start_end_nodes[i+1:, :] - pt[None,:]
    distsq = diffs[:,0]**2 + diffs[:,1]**2
    mask = (distsq < 1) & (distsq > 0)
    dists.extend(distsq[mask])
    for j in np.arange(0, len(mask))[mask]:
        odd_points.append((i,j+i+1))
    if len(dists) > 1000000:
        raise Bob

In [34]:
len(odd_points), min(dists), max(dists)


Out[34]:
(3168, 4.87890977618477e-19, 9.8797922967741592e-18)

In [35]:
odd_points[0]


Out[35]:
(27, 153333)

In [36]:
# 27 == end point of 27//2
# 153333 == end point of 153333//2
chicago.ix[[27//2, 153333//2]]


Out[36]:
FULLNAME geometry
13 Great Lakes LINESTRING (360662.1981091806 580507.343951121...
76666 Great Lakes LINESTRING (360069.0104651943 580245.504242039...

In [37]:
list(chicago.geometry[13].coords)[-1], list(chicago.geometry[76666].coords)[-1]


Out[37]:
((360661.68679858616, 580382.2663500732),
 (360661.68679858505, 580382.2663500739))

Into a graph

Aparently, we don't need to remove duplicates now...??


In [38]:
all_nodes = []
for geo in chicago.geometry:
    for pt in geo.coords:
        all_nodes.append(pt)
        
b = open_cp.network.PlanarGraphNodeOneShot(all_nodes)
for geo in chicago.geometry:
    path = list(geo.coords)
    b.add_path(path)
    
graph = b.build()

In [39]:
fig, ax = plt.subplots(figsize=(12,12))

lc = matplotlib.collections.LineCollection(graph.as_lines(), color="black", linewidth=0.5)
ax.add_collection(lc)

xmin, ymin, xmax, ymax = chicago.total_bounds
xd, yd = xmax - xmin, ymax - ymin
ax.set(xlim=(xmin-xd/20, xmax+xd/20), ylim=(ymin-yd/20, ymax+yd/20))
None


Save to shapefile


In [40]:
import shapely.geometry

geo = []
for (v1,v2) in graph.edges:
    l = shapely.geometry.LineString([graph.vertices[v1], graph.vertices[v2]])
    geo.append(l)

frame = gpd.GeoDataFrame({"edge" : list(range(len(graph.edges)))})
frame.geometry = geo
frame.crs = {"init":"epsg:3528"}
frame.head()


Out[40]:
edge geometry
0 0 LINESTRING (364405.5379445553 567268.128558744...
1 1 LINESTRING (364946.1589606669 566692.977664090...
2 2 LINESTRING (363194.8858743247 567861.246061401...
3 3 LINESTRING (358543.3791577177 585961.056174848...
4 4 LINESTRING (358649.6530192145 585403.210817212...

In [41]:
frame.to_file("roads_to_graph")

Projecting to nearest line segment

In the above diagram, we wish to find $t$. We know that $v = b-a$, that $u\cdot v=0$, and that $x-a = tv + u$. Thus $$ (x-a) \cdot v = t\|v\|^2 \quad\implies\quad t = \|v\|^{-2} (x-a)\cdot v. $$

Rather than (tediously) explain how our code work, we just summarise it:

  • If we can load the rtree Python package, then we put the vertices into an R-Tree
  • This cannot directly do this calculation, but it can (vastly) limit the searching we need to do.
  • Then, for each edge in range (either all edges, or a subset provided using the r-tree) we perform the above calculation, and select the nearest point. This is performed using parallel numpy code.

In [42]:
with open("OS_Yorkshire.graph", "rb") as f:
    graph = open_cp.network.PlanarGraph.from_bytes(f.read())

In [43]:
fig, ax = plt.subplots(figsize=(12,12))

lc = matplotlib.collections.LineCollection(graph.as_lines(), color="black", linewidth=0.5)
ax.add_collection(lc)

xc, yc = 493875.67, 429181.32
si = 1000
ax.set(xlim=[xc - si, xc + si], ylim=[yc - si, yc + si])

xcs = np.random.random(50) * si * 2 - si + xc
ycs = np.random.random(50) * si * 2 - si + yc
ax.scatter(xcs, ycs)
xxcs, yycs = [], []
for x, y in zip(xcs, ycs):
    edge, t = graph.project_point_to_graph(x, y)
    xx, yy = graph.edge_to_coords(*edge, t)
    xxcs.append(xx)
    yycs.append(yy)
ax.scatter(xxcs, yycs)

lines = [((x,y),(xx,yy)) for x,y,xx,yy in zip(xcs, ycs, xxcs, yycs)]
lc = matplotlib.collections.LineCollection(lines, color="blue")
ax.add_collection(lc)


Out[43]:
<matplotlib.collections.LineCollection at 0x7f1f5fad7e80>

In [44]:
with open("chicago.graph", "rb") as f:
    graph = open_cp.network.PlanarGraph.from_bytes(f.read())

In [45]:
fig, ax = plt.subplots(figsize=(12,12))

lc = matplotlib.collections.LineCollection(graph.as_lines(), color="black", linewidth=0.5)
ax.add_collection(lc)

x, y = 355000, 565000
size = 1000
ax.set(xlim=(x-size,x+size), ylim=(y-size,y+size))

xcs = np.random.random(50) * size * 2 - size + x
ycs = np.random.random(50) * size * 2 - size + y
ax.scatter(xcs, ycs)
xxcs, yycs = [], []
for x, y in zip(xcs, ycs):
    edge, t = graph.project_point_to_graph(x, y)
    xx, yy = graph.edge_to_coords(*edge, t)
    xxcs.append(xx)
    yycs.append(yy)
ax.scatter(xxcs, yycs)

lines = [((x,y),(xx,yy)) for x,y,xx,yy in zip(xcs, ycs, xxcs, yycs)]
lc = matplotlib.collections.LineCollection(lines, color="blue")
ax.add_collection(lc)
None



In [ ]: