In [1]:
# Allow to import without installing
import sys
sys.path.insert(0, "..")
These are downloaded from http://download.geofabrik.de/ which offers snapshots of various parts of the planet in a variety of formats.
I have found that "xz" offers better compression than bzip2. Linux users can install the "xz" package, or on Windows, use 7zip. Python itself can easily recompress a file.
The examples below use the data, in uncompressed format, for the Isle of Wight (a small island off the south coast of England, known to me from childhood holidays). See http://download.geofabrik.de/europe/great-britain/england.html
In [2]:
import os, lzma
# Directory holding the downloaded OSM extract (machine-specific; edit to suit).
#basedir = os.path.join("/media/disk", "OSM_Data")
basedir = os.path.join("e:\\", "OSM_Data")
filename = "isle-of-wight-latest.osm.xz"

# Peek at the first four lines of the xz-compressed XML, decoded as UTF-8 text.
with lzma.open(os.path.join(basedir, filename), mode="rt", encoding="utf-8") as f:
    for line_index in range(4):
        print(next(f), end="")
In [3]:
import osmdigest.detail as detail
import datetime
In [4]:
# Walk every element produced by the low-level `detail` parser and record the
# distinct tag keys seen on nodes, ways and relations, sanity-checking the kinds
# of sub-object each element carries along the way.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()
#with detail.Parser(os.path.join(basedir, filename)) as gen:
start = datetime.datetime.now()
with detail.Parser("isle-of-wight-latest.osm") as gen:
    for x in gen:
        if isinstance(x, (detail.OSM, detail.Bounds)):
            print(x)
        elif isinstance(x, detail.Node):
            # Every sub-object of a node is asserted to be a tag.
            for y in x.subobjs:
                assert isinstance(y, detail.Tag)
                possible_node_tags.add(y.key)
        elif isinstance(x, detail.Way):
            # Sub-objects of a way are asserted to be tags or node references.
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_way_tags.add(y.key)
                else:
                    assert isinstance(y, detail.NodeRef)
        elif isinstance(x, detail.Relation):
            # Sub-objects of a relation are asserted to be tags or typed members.
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_relation_tags.add(y.key)
                else:
                    assert isinstance(y, detail.Member)
                    assert y.type in {"way", "node", "relation"}
        else:
            # Every element should match one of the branches above, so reaching
            # this point means the parser produced something unexpected.
            raise Exception("Should not see this: {!r}".format(x))
print("Took {}".format(datetime.datetime.now()-start))
In [5]:
# Number of distinct node tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_node_tags), list(possible_node_tags)[:5])
Out[5]:
In [6]:
# Number of distinct way tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_way_tags), list(possible_way_tags)[:5])
Out[6]:
In [7]:
# Number of distinct relation tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_relation_tags), list(possible_relation_tags)[:5])
Out[7]:
In [8]:
import osmdigest.digest as digest
In [9]:
# Repeat the tag-key survey using `digest.parse_sax`, whose elements expose
# their tags through a `tags` mapping.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()
start = datetime.datetime.now()
for x in digest.parse_sax("isle-of-wight-latest.osm"):
    if isinstance(x, (digest.OSM, digest.Bounds)):
        print(x)
    elif isinstance(x, digest.Node):
        possible_node_tags.update(x.tags.keys())
    elif isinstance(x, digest.Way):
        possible_way_tags.update(x.tags.keys())
    elif isinstance(x, digest.Relation):
        possible_relation_tags.update(x.tags.keys())
print("Took {}".format(datetime.datetime.now() - start))
In [10]:
# Number of distinct node tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_node_tags), list(possible_node_tags)[:5])
Out[10]:
In [11]:
# Number of distinct way tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_way_tags), list(possible_way_tags)[:5])
Out[11]:
In [12]:
# Number of distinct relation tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_relation_tags), list(possible_relation_tags)[:5])
Out[12]:
In [13]:
# Same tag-key survey again, this time driven by `digest.parse`, timed for
# comparison with the other parsing entry points.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()
start = datetime.datetime.now()
for x in digest.parse("isle-of-wight-latest.osm"):
    if isinstance(x, (digest.OSM, digest.Bounds)):
        print(x)
    elif isinstance(x, digest.Node):
        possible_node_tags.update(x.tags.keys())
    elif isinstance(x, digest.Way):
        possible_way_tags.update(x.tags.keys())
    elif isinstance(x, digest.Relation):
        possible_relation_tags.update(x.tags.keys())
print("Took {}".format(datetime.datetime.now() - start))
In [14]:
# Number of distinct node tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_node_tags), list(possible_node_tags)[:5])
Out[14]:
In [15]:
# Number of distinct way tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_way_tags), list(possible_way_tags)[:5])
Out[15]:
In [16]:
# Number of distinct relation tag keys, plus up to five sample keys (set order is arbitrary).
(len(possible_relation_tags), list(possible_relation_tags)[:5])
Out[16]:
In [17]:
class Handler(digest.OSMDataHandler):
    """Callback handler that gathers the distinct tag keys of each element kind.

    The OSM header and bounds objects are simply printed; node, way and
    relation objects contribute the keys of their `tags` mapping to the
    matching set attribute.
    """

    def __init__(self):
        self.possible_node_tags = set()
        self.possible_way_tags = set()
        self.possible_relation_tags = set()

    def start(self, osm):
        """Print the object passed to the `start` callback."""
        print(osm)

    def bounds(self, bounds):
        """Print the object passed to the `bounds` callback."""
        print(bounds)

    def node(self, x):
        """Record every tag key carried by node `x`."""
        self.possible_node_tags.update(x.tags.keys())

    def way(self, x):
        """Record every tag key carried by way `x`."""
        self.possible_way_tags.update(x.tags.keys())

    def relation(self, x):
        """Record every tag key carried by relation `x`."""
        self.possible_relation_tags.update(x.tags.keys())
# Drive the parse through the handler's callbacks and report the elapsed wall-clock time.
start = datetime.datetime.now()
handler = Handler()
digest.parse_callback("isle-of-wight-latest.osm", handler)
print("Took {}".format(datetime.datetime.now() - start))
In [18]:
# Number of distinct node tag keys gathered by the handler, plus up to five samples (arbitrary order).
(len(handler.possible_node_tags), list(handler.possible_node_tags)[:5])
Out[18]:
In [19]:
# Number of distinct way tag keys gathered by the handler, plus up to five samples (arbitrary order).
(len(handler.possible_way_tags), list(handler.possible_way_tags)[:5])
Out[19]:
In [20]:
# Number of distinct relation tag keys gathered by the handler, plus up to five samples (arbitrary order).
(len(handler.possible_relation_tags), list(handler.possible_relation_tags)[:5])
Out[20]:
In [21]:
import osmdigest.utils.cbtogen as cbtogen
In [22]:
class Handler(digest.OSMDataHandler):
    """Callback handler that forwards every object it receives to a delegate.

    Each callback passes its argument, unchanged, to `delegate.notify`.

    Note: this reuses the name `Handler`, replacing the tag-collecting class
    defined in an earlier cell of this notebook.
    """

    def __init__(self, delegate):
        # `delegate` must provide a `notify(obj)` method.
        self.delegate = delegate

    def start(self, osm):
        """Forward the object passed to the `start` callback."""
        self.delegate.notify(osm)

    def bounds(self, bounds):
        """Forward the object passed to the `bounds` callback."""
        self.delegate.notify(bounds)

    def node(self, x):
        """Forward node `x`."""
        self.delegate.notify(x)

    def way(self, x):
        """Forward way `x`."""
        self.delegate.notify(x)

    def relation(self, x):
        """Forward relation `x`."""
        self.delegate.notify(x)
# Hook the forwarding handler up to the callback-to-generator adaptor: the
# adaptor is given a zero-argument function that runs the callback-style parse.
generator = cbtogen.CallbackToGenerator()
handler = Handler(generator)


def func():
    """Run the callback-driven parse, sending each object to `handler`."""
    digest.parse_callback("isle-of-wight-latest.osm", handler)


generator.set_callback_function(func)
# Consume the adaptor as an ordinary iterator and run the same tag-key survey,
# timing the iteration from inside the `with` block.
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()
with generator:
    start = datetime.datetime.now()
    for x in generator:
        if isinstance(x, (digest.OSM, digest.Bounds)):
            print(x)
        elif isinstance(x, digest.Node):
            possible_node_tags.update(x.tags.keys())
        elif isinstance(x, digest.Way):
            possible_way_tags.update(x.tags.keys())
        elif isinstance(x, digest.Relation):
            possible_relation_tags.update(x.tags.keys())
    print("Took {}".format(datetime.datetime.now() - start))
# Distinct tag-key counts for nodes, ways and relations.
(len(possible_node_tags), len(possible_way_tags), len(possible_relation_tags))
Out[22]:
Using 2 threads to convert from a callback to a generator might be fun, but it's not performant. At all.
I suspect what happens is that the inter-thread communication (and whatever context switching Python does, as CPython is essentially single threaded) adds a certain overhead. If we put this overhead at the XML parsing level, then we generate a huge number of temporary objects which are pushed onto the queue only to be removed and essentially ignored (e.g. "character" messages). Those events which aren't ignored are often converted into a much smaller number of OSM-specific objects (e.g. a way with many tags and node references yields only one OSM object but could be 50 XML events).
We'll check our example to see how the bounding box actually reflects the data.
In [23]:
# Take the first two objects the parser yields (presumably the OSM header and
# the declared bounding box) and display them.
gen = digest.parse("isle-of-wight-latest.osm")
osm = next(gen)
bounds = next(gen)
(osm, bounds)
Out[23]:
In [24]:
# Sort every parsed element into nodes / ways / relations by its `name`, and
# track the longitude/latitude extremes actually observed across the nodes.
# The extremes stay None if no node is encountered.
minlon = maxlon = minlat = maxlat = None
nodes, ways, relations = [], [], []
for el in digest.parse("isle-of-wight-latest.osm"):
    if el.name == "way":
        ways.append(el)
    elif el.name == "relation":
        relations.append(el)
    elif el.name == "node":
        nodes.append(el)
        lon, lat = el.longitude, el.latitude
        if minlon is None or lon < minlon:
            minlon = lon
        if maxlon is None or lon > maxlon:
            maxlon = lon
        if minlat is None or lat < minlat:
            minlat = lat
        if maxlat is None or lat > maxlat:
            maxlat = lat
(minlon, maxlon, minlat, maxlat)
Out[24]:
In [25]:
# Ids of the nodes lying inside the declared bounding box (edges inclusive),
# followed by the ids of all nodes and of all ways in the dataset.
node_ids_in_bb = {
    node.osm_id
    for node in nodes
    if bounds.min_longitude <= node.longitude <= bounds.max_longitude
    and bounds.min_latitude <= node.latitude <= bounds.max_latitude
}
node_ids = set(node.osm_id for node in nodes)
way_ids = set(way.osm_id for way in ways)
In [26]:
# Report every way none of whose nodes lies inside the bounding box
for way in ways:
    if all(noderef not in node_ids_in_bb for noderef in way.nodes):
        print(way)
In [27]:
# Report every way that references at least one node absent from the dataset
for way in ways:
    if any(noderef not in node_ids for noderef in way.nodes):
        print(way)
In [28]:
# Re-key the collected relations by their OSM id for direct lookup.
# The guard keeps this cell safe to run more than once: after the first run
# `relations` is already a dict, and iterating it again would yield ids rather
# than relation objects.
if not isinstance(relations, dict):
    relations = {rel.osm_id: rel for rel in relations}
In [29]:
# Print the id of each relation that has at least one member (node, way or
# nested relation) which is not present in the dataset
for rel in relations.values():
    failed = any(
        (member.type == "node" and member.ref not in node_ids)
        or (member.type == "way" and member.ref not in way_ids)
        or (member.type == "relation" and member.ref not in relations)
        for member in rel.members
    )
    if failed:
        print(rel.osm_id)
In [30]:
def relation_has_member_in_bb(rel, _seen=None):
    """Return True if `rel` has some member tied to the bounding box.

    A member counts when it is a node whose id is in `node_ids_in_bb`, a way
    whose id is in `way_ids`, or a nested relation (looked up in `relations`)
    for which this function itself returns True.  Members that refer to
    objects missing from the dataset are skipped.

    Parameters:
        rel: relation object exposing `members`, each with `type` and `ref`.
        _seen: internal set of already-visited relations, used to stop
            runaway recursion when relations refer to each other in a cycle.
            Callers should leave it at its default.

    Returns:
        bool: True as soon as any qualifying member is found, else False.
    """
    if _seen is None:
        _seen = set()
    # A relation reached a second time cannot add anything new; bail out so
    # that cyclic membership does not recurse without bound.
    if id(rel) in _seen:
        return False
    _seen.add(id(rel))
    for member in rel.members:
        if member.type == "node" and member.ref in node_ids_in_bb:
            return True
        elif member.type == "way" and member.ref in way_ids:
            return True
        elif member.type == "relation" and member.ref in relations:
            # Only stop on success: a nested relation with nothing in the box
            # must not prevent the remaining members from being examined.
            if relation_has_member_in_bb(relations[member.ref], _seen):
                return True
    return False
# True when every relation in the dataset passes the bounding-box membership check.
all(map(relation_has_member_in_bb, relations.values()))
Out[30]:
In [ ]: