In [1]:
# Allow to import without installing
import sys
sys.path.insert(0, "..")
I want to understand how buildings and addresses are represented in OSM data.
Required reading: http://wiki.openstreetmap.org/wiki/Addresses
In [2]:
import osmdigest.digest as digest
In [3]:
import os
#filename = os.path.join("//media", "disk", "OSM_Data", "isle-of-wight-latest.osm.xz")
filename = os.path.join("..", "..", "..", "Data", "isle-of-wight-latest.osm.xz")
In [4]:
building_node_ids = []
addr_node_ids = []
for x in digest.parse(filename):
if isinstance(x, digest.Node):
if "building" in x.tags:
building_node_ids.append(x)
if any(key.startswith("addr:") for key in x.tags):
addr_node_ids.append(x)
In [5]:
len(building_node_ids), building_node_ids[:5]
Out[5]:
In [6]:
len(addr_node_ids), addr_node_ids[:5]
Out[6]:
We expected buildings to mostly have a polygonal outline, and so be "ways".
The situation here is reversed: lots of buildings, and fewer addresses. From eyeballing a few ways which have address information, but which are not buildings, we find that the "way" gives the total outline of, say, a school, which may contain a number of buildings, playing fields etc.
In [7]:
building_way_ids = []
addr_way_ids = []
for x in digest.parse(filename):
if isinstance(x, digest.Way):
if "building" in x.tags:
building_way_ids.append(x)
if any(key.startswith("addr:") for key in x.tags):
addr_way_ids.append(x)
In [8]:
len(building_way_ids), building_way_ids[:5]
Out[8]:
In [9]:
len(addr_way_ids), addr_way_ids[:5]
Out[9]:
In [10]:
building_rel_ids = []
addr_rel_ids = []
for x in digest.parse(filename):
if isinstance(x, digest.Relation):
if "building" in x.tags:
building_rel_ids.append(x)
if any(key.startswith("addr:") for key in x.tags):
addr_rel_ids.append(x)
In [11]:
len(building_rel_ids), building_rel_ids[:5]
Out[11]:
In [12]:
len(addr_rel_ids), addr_rel_ids[:5]
Out[12]:
In [13]:
import numpy as np
import pandas as pd
In [14]:
gen = digest.parse(filename)
print(next(gen))
print(next(gen))
possible_address_tags = set()
for x in gen:
for key in x.tags:
if key.startswith("addr:"):
possible_address_tags.add(key)
possible_address_tags
Out[14]:
In [15]:
gen = digest.parse(filename)
osm = next(gen)
bounds = next(gen)
address_data = { key : [] for key in possible_address_tags }
address_data["osm_id"] = []
for x in gen:
addr = {key : x.tags[key] for key in x.tags if key.startswith("addr:")}
if len(addr) > 0:
address_data["osm_id"].append(x.name+"/"+str(x.osm_id))
for key in possible_address_tags:
if key in addr:
address_data[key].append(addr[key])
else:
address_data[key].append(np.nan)
In [16]:
data = pd.DataFrame(address_data)
data = data.set_index("osm_id")
data[:5]
Out[16]:
Let's look at a different country, namely the state of Illinois in the USA.
We'll then load the data from the SQLite database, as this is far more memory efficient. We'll find that we still end up with massive python data structures...
The overall pattern is the same, even though the dataset is much larger.
In [4]:
import osmdigest.sqlite as sq
import os
In [5]:
filename = os.path.join("//tmp", "aaa", "illinois-latest.db")
#filename = os.path.join("..", "..", "..", "Data", "illinois-latest.db")
In [6]:
db = sq.OSM_SQLite(filename)
In [7]:
def iterate_over_tags(iterator):
buildings, addresses = [], []
for element in iterator:
if any(key.startswith("building") for key in element.tags):
buildings.append(element)
if any(key.startswith("addr") for key in element.tags):
addresses.append(element)
return buildings, addresses
In [8]:
building_nodes, address_nodes = iterate_over_tags(db.nodes())
In [9]:
len(building_nodes), building_nodes[:5]
Out[9]:
In [10]:
len(address_nodes), address_nodes[:5]
Out[10]:
In [11]:
building_ways, address_ways = iterate_over_tags(db.ways())
In [12]:
len(building_ways), building_ways[:5]
Out[12]:
In [13]:
len(address_ways), address_ways[:5]
Out[13]:
In [14]:
building_rels, address_rels = iterate_over_tags(db.relations())
In [15]:
len(building_rels), building_rels[:5]
Out[15]:
In [16]:
len(address_rels), address_rels[:5]
Out[16]:
Ultimately, I want:
addr:street and addr:housenumber.We construct a GeoJSON-like data structure, and then import it into geoPandas.
In [62]:
features = []
def make_feature(el, centroid):
return { "properties": {
"street": el.tags["addr:street"],
"housenumber": el.tags["addr:housenumber"],
"osm_id": "{}/{}".format(el.name, el.osm_id)
},
"geometry": { "type": "Point",
"coordinates": centroid } }
for el in db.search_node_tag_keys({"addr:street", "addr:housenumber"}):
features.append(make_feature(el, [el.longitude, el.latitude]))
for el in db.search_way_tag_keys({"addr:street", "addr:housenumber"}):
way = db.complete_way(el)
features.append(make_feature(el, way.centroid()))
for el in db.search_relation_tag_keys({"addr:street", "addr:housenumber"}):
rel = db.complete_relation(el)
features.append(make_feature(el, rel.centroid()))
In [63]:
import geopandas as gpd
In [64]:
frame = gpd.GeoDataFrame.from_features(features)
#frame = frame.set_geometry("centroid")
frame[:5]
Out[64]:
Many of the "housenumber" values are not integers.
In [65]:
unexpected_addresses = frame[~ frame.housenumber.map(lambda x : all(y>='0' and y<='9' for y in x))]
unexpected_addresses.head()
Out[65]:
In [66]:
unexpected_addresses[5:10]
Out[66]:
So let's filter out things which match. (Stand-back, I know regular expressions).
In [67]:
import re
one = re.compile("^\\d+\\s+#\\d+$")
assert one.match("4262 #12") is not None
assert one.match("4 62 #12") is None
two = re.compile("^\\d+[NSEW]\\d+$")
assert two.match("19N479") is not None
assert two.match("19NS479") is None
assert two.match("19 479") is None
three = re.compile("^\\d+\\s*[a-zA-Z]\\.*$")
assert three.match("152A") is not None
assert three.match("152 A") is not None
assert three.match("152 A.") is not None
assert three.match("152Ac") is None
four = re.compile("^\\d+\\s*1/2")
matches = {one, two, three, four}
left = unexpected_addresses[~ unexpected_addresses.housenumber.map(lambda x : any(m.match(x) is not None for m in matches))]
left.head()
Out[67]:
This leaves a lot left over.
What interests me is whether there are any "way"s which "interpolate" addresses, see http://wiki.openstreetmap.org/wiki/Addresses
The answer is: "no". Eyeballing these few in OSM doesn't show anything interesting.
In [70]:
for way in db.search_way_tag_keys({"addr:interpolation"}):
print(way)
Finally, save the data out. We can use any supported fiona driver. Here I use GeoJSON, as it's human readable, and no less space efficient than a Shapefile. A Shapefile can be imported into QGis etc., of course.
In [71]:
import fiona
fiona.supported_drivers
Out[71]:
In [72]:
frame.to_file("illinois_building.json", driver="GeoJSON")
In [ ]: