In [1]:
# Allow to import without installing
import sys
sys.path.insert(0, "..")

Example files

These are downloaded from http://download.geofabrik.de/ which offers snapshots of various parts of the planet in a variety of formats.

I have found that "xz" offers better compression that bzip2. Linux users can install the "xz" package, or on Windows, use 7zip. Python itself can easily recompress a file.

The examples below use the data, in uncompressed format, for the Isle of Wight (a small island off the south coast of England, known to me from childhood holidays). See http://download.geofabrik.de/europe/great-britain/england.html


In [2]:
import os, lzma
#basedir = os.path.join("/media/disk", "OSM_Data")
basedir = os.path.join("e:\\", "OSM_Data")
filename = "isle-of-wight-latest.osm.xz"
with lzma.open(os.path.join(basedir, filename), mode="rt", encoding="utf-8") as f:
    print(next(f), end="")
    print(next(f), end="")
    print(next(f), end="")
    print(next(f), end="")


<?xml version='1.0' encoding='UTF-8'?>
<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-04-25T20:43:28Z">
	<bounds minlat="50.50555" minlon="-1.659074" maxlat="50.80102" maxlon="-1.0313699"/>
	<node id="195206" lat="50.6275781" lon="-1.1730057" version="10" timestamp="2016-03-29T12:53:40Z" changeset="38143882" uid="3099236" user="iwhs"/>

Look at the generated data


In [3]:
import osmdigest.detail as detail
import datetime

In [4]:
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

#with detail.Parser(os.path.join(basedir, filename)) as gen:
start = datetime.datetime.now()
with detail.Parser("isle-of-wight-latest.osm") as gen:
    for x in gen:
        if isinstance(x, detail.OSM) or isinstance(x, detail.Bounds):
            print(x)
        elif isinstance(x, detail.Node):
            for y in x.subobjs:
                assert isinstance(y, detail.Tag)
                possible_node_tags.add(y.key)
        elif isinstance(x, detail.Way):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_way_tags.add(y.key)
                else:
                    assert isinstance(y, detail.NodeRef)
        elif isinstance(x, detail.Relation):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_relation_tags.add(y.key)
                else:
                    assert isinstance(y, detail.Member)
                    assert y.type in {"way", "node", "relation"}
        else:
            raise Exception("Should see this")
print("Took {}".format(datetime.datetime.now()-start))


OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:02:55.461164

In [5]:
len(possible_node_tags), list(possible_node_tags)[:5]


Out[5]:
(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])

In [6]:
len(possible_way_tags), list(possible_way_tags)[:5]


Out[6]:
(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])

In [7]:
len(possible_relation_tags), list(possible_relation_tags)[:5]


Out[7]:
(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Parse the data in a reduced way


In [8]:
import osmdigest.digest as digest

In [9]:
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

start = datetime.datetime.now()
for x in digest.parse_sax("isle-of-wight-latest.osm"):
    if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
        print(x)
    elif isinstance(x, digest.Node):
        for key in x.tags.keys():
            possible_node_tags.add(key)
    elif isinstance(x, digest.Way):
        for key in x.tags.keys():
            possible_way_tags.add(key)
    elif isinstance(x, digest.Relation):
        for key in x.tags.keys():
            possible_relation_tags.add(key)
print("Took {}".format(datetime.datetime.now()-start))


OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:02:39.068315

In [10]:
len(possible_node_tags), list(possible_node_tags)[:5]


Out[10]:
(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])

In [11]:
len(possible_way_tags), list(possible_way_tags)[:5]


Out[11]:
(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])

In [12]:
len(possible_relation_tags), list(possible_relation_tags)[:5]


Out[12]:
(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Use xml.etree instead

A different Python standard library xml parser.


In [13]:
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

start = datetime.datetime.now()
for x in digest.parse("isle-of-wight-latest.osm"):
    if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
        print(x)
    elif isinstance(x, digest.Node):
        for key in x.tags.keys():
            possible_node_tags.add(key)
    elif isinstance(x, digest.Way):
        for key in x.tags.keys():
            possible_way_tags.add(key)
    elif isinstance(x, digest.Relation):
        for key in x.tags.keys():
            possible_relation_tags.add(key)
print("Took {}".format(datetime.datetime.now()-start))


OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:12.496971

In [14]:
len(possible_node_tags), list(possible_node_tags)[:5]


Out[14]:
(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])

In [15]:
len(possible_way_tags), list(possible_way_tags)[:5]


Out[15]:
(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])

In [16]:
len(possible_relation_tags), list(possible_relation_tags)[:5]


Out[16]:
(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Via a callback

Just to show that it's not the SAX library itself which is the bottleneck.


In [17]:
class Handler(digest.OSMDataHandler):
    def __init__(self):
        self.possible_node_tags = set()
        self.possible_way_tags = set()
        self.possible_relation_tags = set()

    def start(self, osm):
        print(osm)
        
    def bounds(self, bounds):
        print(bounds)

    def node(self, x):
        for key in x.tags.keys():
            self.possible_node_tags.add(key)

    def way(self, x):
        for key in x.tags.keys():
            self.possible_way_tags.add(key)

    def relation(self, x):
        for key in x.tags.keys():
            self.possible_relation_tags.add(key)

start = datetime.datetime.now()
handler = Handler()
digest.parse_callback("isle-of-wight-latest.osm", handler)
print("Took {}".format(datetime.datetime.now()-start))


OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:11.229552

In [18]:
len(handler.possible_node_tags), list(handler.possible_node_tags)[:5]


Out[18]:
(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])

In [19]:
len(handler.possible_way_tags), list(handler.possible_way_tags)[:5]


Out[19]:
(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])

In [20]:
len(handler.possible_relation_tags), list(handler.possible_relation_tags)[:5]


Out[20]:
(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Convert the callback to a generator at the OSM data level

This works fairly well.


In [21]:
import osmdigest.utils.cbtogen as cbtogen

In [22]:
class Handler(digest.OSMDataHandler):
    def __init__(self, delegate):
        self.delegate = delegate
        
    def start(self, osm):
        self.delegate.notify(osm)
        
    def bounds(self, bounds):
        self.delegate.notify(bounds)

    def node(self, x):
        self.delegate.notify(x)

    def way(self, x):
        self.delegate.notify(x)

    def relation(self, x):
        self.delegate.notify(x)
        
generator = cbtogen.CallbackToGenerator()
handler = Handler(generator)
def func():
    digest.parse_callback("isle-of-wight-latest.osm", handler)
generator.set_callback_function(func)

possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

with generator:
    start = datetime.datetime.now()
    for x in generator:
        if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
            print(x)
        elif isinstance(x, digest.Node):
            for key in x.tags.keys():
                possible_node_tags.add(key)
        elif isinstance(x, digest.Way):
            for key in x.tags.keys():
                possible_way_tags.add(key)
        elif isinstance(x, digest.Relation):
            for key in x.tags.keys():
                possible_relation_tags.add(key)
    print("Took {}".format(datetime.datetime.now()-start))
    
len(possible_node_tags), len(possible_way_tags), len(possible_relation_tags)


OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:16.574323
Out[22]:
(335, 484, 151)

Conclusion

Using 2 threads to convert from a callback to a generator might be fun, but it's not performant. At all.

I suspect what happens is that the inter-thread communication (and whatever context switching Python does, as CPYthon is essentially single threaded) adds a certain overhead. It we put this overhead at the XML parsing level, then we generate a huge number of temporary objects which are pushed onto the queue only to be removed and essentially ignored (e.g. "character" messages). Those events which aren't ignored are often converted into a much smaller number of OSM specific objects (e.g. a way with many tags and node references yields only osm object but could be 50 XML events).

Check bounding box

We'll check our example to see how the bounding box actually reflects the data.

  • Quite a few nodes fall outside of the bounding box
  • However, almost all ways feature at least one node in the bounding box,
  • and for every way, all nodes can be found in the dataset.
  • Conversely, there are a few relations which feature nodes / ways not in the dataset
  • but all relations in the dataset have some node or way or sub-relation in the dataset

In [23]:
gen = digest.parse("isle-of-wight-latest.osm")
osm, bounds = next(gen), next(gen)

osm, bounds


Out[23]:
(OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28),
 Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102])

In [24]:
minlon, maxlon, minlat, maxlat = None, None, None, None
nodes, ways, relations = [], [], []
for el in digest.parse("isle-of-wight-latest.osm"):
    if el.name == "node":
        nodes.append(el)
        if minlon is None or el.longitude < minlon:
            minlon = el.longitude
        if maxlon is None or el.longitude > maxlon:
            maxlon = el.longitude
        if minlat is None or el.latitude < minlat:
            minlat = el.latitude
        if maxlat is None or el.latitude > maxlat:
            maxlat = el.latitude
    elif el.name == "way":
        ways.append(el)
    elif el.name == "relation":
        relations.append(el)
        
minlon, maxlon, minlat, maxlat


Out[24]:
(-5.4136531, -0.9118864, 43.3548941, 50.8959907)

In [25]:
node_ids_in_bb = { node.osm_id for node in nodes if
    node.longitude >= bounds.min_longitude and node.longitude <= bounds.max_longitude
    and node.latitude >= bounds.min_latitude and node.latitude <= bounds.max_latitude }
node_ids = { node.osm_id for node in nodes }
way_ids = { way.osm_id for way in ways }

In [26]:
# Find ways which feature no node in the bounding box
for way in ways:
    if not any(noderef in node_ids_in_bb for noderef in way.nodes):
        print(way)


Way(10157044 ->  [13718394, 13718395, 13718398, 13718404, 13718407, 13718412] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(10157047 ->  [256790402, 13718604, 13718617, 13718621, 13718629, 2101289190, 13718635, 1691252673, 13718647, 13718651, 2101289129, 2101289124, 2101289121, 1691252669, 13718664, 2101289120, 2101289117, 2101289114, 1691252671, 2101289112, 2101289105, 13718674, 256790387, 256790385, 2101289076, 2101289059, 2101289053, 2101289050, 2101289045, 13718692, 2101289044, 2101289029, 2101289030, 2101289017, 2101289018, 13718700, 2101289011, 13718705, 2101289000, 13718710, 13718712, 13718718, 13718723, 2106275306, 13718733, 2106275303, 2106275301, 2106275300, 2106275298, 2106275295, 2106275294, 2106275292, 2106275290, 3101656908, 2106275287, 3101656839, 2106275284, 2106275281, 2106275279, 2106275275, 2106275270, 2106275265] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(39498695 ->  [13717968, 2101289492, 2101289491, 2101289490, 2101289488, 13717971, 13717975, 2101289486, 13717981] {'source': 'PGS', 'natural': 'coastline'})
Way(39498700 ->  [13717981, 2101289485, 2101289484, 2101289482, 2101289481, 13717989, 310253725, 13718001, 13718006, 13718033, 2101289473, 2101289471, 2101289470, 13718036, 13718037, 13718050, 13718056, 13718061, 2101289467, 2101289465, 2101289464, 2101289461, 2101289463, 2101289462, 13718073, 2101289460, 13718082, 13718085, 2101289459, 13718109, 13718125, 13718147, 13718150, 13718178, 13718191, 13718225, 13718239, 13718247, 13718275, 13718284, 13718293, 13718303, 13718308, 13718315, 13718333, 13718346, 13718353, 13718358, 13718363, 13718369, 13718373, 13718385, 13718392, 13718394] {'source': 'OS_OpenData_StreetView', 'natural': 'coastline'})
Way(199346431 ->  [13718412, 13718418, 13718419, 13718424, 13718427, 13718432, 2101289398, 13718436, 2101289401, 2101289403, 2101289405, 2101289407, 2101289404, 2101289406, 13718441, 2101289402, 13718446, 13718451, 13718456, 310253759, 13718459, 13718464, 13718469, 1691252687, 13718472, 13718476, 33318734, 33318735, 1691252697, 1691252695, 33318736, 321789287, 1691252702, 321789288, 33318737, 1691252700, 33318738, 600200813, 600200815, 600200817, 600200819, 246254949, 246254950, 1691252688, 1691252690, 1691252693, 13718498, 13718504, 1691252689, 13718513, 13718526, 13718539, 1691252680, 13718557, 13718564, 13718570, 13718571, 13718575, 13718578, 2101289351, 13718588, 2101289231, 2101289226, 13718591, 2101289209, 1691252675, 1691252678, 4633590097, 2101289211, 256790402] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(200662663 ->  [2106275265, 2106275264, 2106275263, 13718813, 2106275262] {'source': 'PGS', 'natural': 'coastline'})
Way(200662665 ->  [2106275262, 2106275261, 2106275258, 2106275248, 2106275242, 2106275238, 2106275236, 2106275234] {'source': 'OS_OpenData_StreetView', 'natural': 'coastline'})
Way(222281730 ->  [13713007, 13713009, 13713012, 13713015, 13713026, 13713028, 13713031, 1241434785, 13713038, 13713048, 13713050, 13713051, 13713053, 13713066, 13713075, 13713079, 13713080, 13713081, 13713082, 13713084, 13713085, 3804317987, 13713106, 13713123, 13713126, 13713133, 13713136, 13713144, 13713154, 13713166, 13713188, 1241434535, 13713189, 13713190, 287362645, 3804318018, 4277941860, 4277941859, 1241434798] {'source': 'PGS', 'natural': 'coastline'})
Way(263265337 ->  [1241434798, 13717968] {})

In [27]:
# Check that all nodes in each way are in the dataset
for way in ways:
    if not all(noderef in node_ids for noderef in way.nodes):
        print(way)

In [28]:
relations = {rel.osm_id : rel for rel in relations}

In [29]:
# Find relations which feature a member not in the dataset
for rel in relations.values():
    failed = False
    for member in rel.members:
        if member.type == "node" and member.ref not in node_ids:
            failed = True
        elif member.type == "way" and member.ref not in way_ids:
            failed = True
        elif member.type == "relation" and member.ref not in relations:
            failed = True
    if failed:
        print(rel.osm_id)


67578
151304
168529
192009
311462
311463
1254604
1561822
1959386
2705259
3746024
3820651
3999378

In [30]:
def relation_has_member_in_bb(rel):
    for member in rel.members:
        if member.type == "node" and member.ref in node_ids_in_bb:
            return True
        elif member.type == "way" and member.ref in way_ids:
            return True
        elif member.type == "relation" and member.ref in relations:
            return relation_has_member_in_bb(relations[member.ref])
    return False

all(relation_has_member_in_bb(rel) for rel in relations.values())


Out[30]:
True

In [ ]: