In [1]:

    
# Allow to import without installing
import sys
sys.path.insert(0, "..")

Example files

These are downloaded from http://download.geofabrik.de/ which offers snapshots of various parts of the planet in a variety of formats.

I have found that "xz" offers better compression that bzip2. Linux users can install the "xz" package, or on Windows, use 7zip. Python itself can easily recompress a file.

The examples below use the data, in uncompressed format, for the Isle of Wight (a small island off the south coast of England, known to me from childhood holidays). See http://download.geofabrik.de/europe/great-britain/england.html



In [2]:

    
import os, lzma
#basedir = os.path.join("/media/disk", "OSM_Data")
basedir = os.path.join("e:\\", "OSM_Data")
filename = "isle-of-wight-latest.osm.xz"
with lzma.open(os.path.join(basedir, filename), mode="rt", encoding="utf-8") as f:
    print(next(f), end="")
    print(next(f), end="")
    print(next(f), end="")
    print(next(f), end="")









    



<?xml version='1.0' encoding='UTF-8'?>
<osm version="0.6" generator="osmconvert 0.8.5" timestamp="2017-04-25T20:43:28Z">
	<bounds minlat="50.50555" minlon="-1.659074" maxlat="50.80102" maxlon="-1.0313699"/>
	<node id="195206" lat="50.6275781" lon="-1.1730057" version="10" timestamp="2016-03-29T12:53:40Z" changeset="38143882" uid="3099236" user="iwhs"/>

Look at the generated data



In [3]:

    
import osmdigest.detail as detail
import datetime



In [4]:

    
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

#with detail.Parser(os.path.join(basedir, filename)) as gen:
start = datetime.datetime.now()
with detail.Parser("isle-of-wight-latest.osm") as gen:
    for x in gen:
        if isinstance(x, detail.OSM) or isinstance(x, detail.Bounds):
            print(x)
        elif isinstance(x, detail.Node):
            for y in x.subobjs:
                assert isinstance(y, detail.Tag)
                possible_node_tags.add(y.key)
        elif isinstance(x, detail.Way):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_way_tags.add(y.key)
                else:
                    assert isinstance(y, detail.NodeRef)
        elif isinstance(x, detail.Relation):
            for y in x.subobjs:
                if isinstance(y, detail.Tag):
                    possible_relation_tags.add(y.key)
                else:
                    assert isinstance(y, detail.Member)
                    assert y.type in {"way", "node", "relation"}
        else:
            raise Exception("Should see this")
print("Took {}".format(datetime.datetime.now()-start))









    



OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:02:55.461164



In [5]:

    
len(possible_node_tags), list(possible_node_tags)[:5]









    Out[5]:





(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])



In [6]:

    
len(possible_way_tags), list(possible_way_tags)[:5]









    Out[6]:





(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])



In [7]:

    
len(possible_relation_tags), list(possible_relation_tags)[:5]









    Out[7]:





(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Parse the data in a reduced way



In [8]:

    
import osmdigest.digest as digest



In [9]:

    
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

start = datetime.datetime.now()
for x in digest.parse_sax("isle-of-wight-latest.osm"):
    if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
        print(x)
    elif isinstance(x, digest.Node):
        for key in x.tags.keys():
            possible_node_tags.add(key)
    elif isinstance(x, digest.Way):
        for key in x.tags.keys():
            possible_way_tags.add(key)
    elif isinstance(x, digest.Relation):
        for key in x.tags.keys():
            possible_relation_tags.add(key)
print("Took {}".format(datetime.datetime.now()-start))









    



OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:02:39.068315



In [10]:

    
len(possible_node_tags), list(possible_node_tags)[:5]









    Out[10]:





(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])



In [11]:

    
len(possible_way_tags), list(possible_way_tags)[:5]









    Out[11]:





(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])



In [12]:

    
len(possible_relation_tags), list(possible_relation_tags)[:5]









    Out[12]:





(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Use `xml.etree` instead

A different Python standard library xml parser.



In [13]:

    
possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

start = datetime.datetime.now()
for x in digest.parse("isle-of-wight-latest.osm"):
    if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
        print(x)
    elif isinstance(x, digest.Node):
        for key in x.tags.keys():
            possible_node_tags.add(key)
    elif isinstance(x, digest.Way):
        for key in x.tags.keys():
            possible_way_tags.add(key)
    elif isinstance(x, digest.Relation):
        for key in x.tags.keys():
            possible_relation_tags.add(key)
print("Took {}".format(datetime.datetime.now()-start))









    



OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:12.496971



In [14]:

    
len(possible_node_tags), list(possible_node_tags)[:5]









    Out[14]:





(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])



In [15]:

    
len(possible_way_tags), list(possible_way_tags)[:5]









    Out[15]:





(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])



In [16]:

    
len(possible_relation_tags), list(possible_relation_tags)[:5]









    Out[16]:





(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Via a callback

Just to show that it's not the SAX library itself which is the bottleneck.



In [17]:

    
class Handler(digest.OSMDataHandler):
    def __init__(self):
        self.possible_node_tags = set()
        self.possible_way_tags = set()
        self.possible_relation_tags = set()

    def start(self, osm):
        print(osm)
        
    def bounds(self, bounds):
        print(bounds)

    def node(self, x):
        for key in x.tags.keys():
            self.possible_node_tags.add(key)

    def way(self, x):
        for key in x.tags.keys():
            self.possible_way_tags.add(key)

    def relation(self, x):
        for key in x.tags.keys():
            self.possible_relation_tags.add(key)

start = datetime.datetime.now()
handler = Handler()
digest.parse_callback("isle-of-wight-latest.osm", handler)
print("Took {}".format(datetime.datetime.now()-start))









    



OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:11.229552



In [18]:

    
len(handler.possible_node_tags), list(handler.possible_node_tags)[:5]









    Out[18]:





(335, ['note', 'religion', 'alt_name', 'condition', 'recycling:glass'])



In [19]:

    
len(handler.possible_way_tags), list(handler.possible_way_tags)[:5]









    Out[19]:





(484, ['note', 'electrified', 'condition', 'bridge:name', 'source'])



In [20]:

    
len(handler.possible_relation_tags), list(handler.possible_relation_tags)[:5]









    Out[20]:





(151, ['note', 'religion', 'name:es', 'alt_name', 'political_division'])

Convert the callback to a generator at the OSM data level

This works fairly well.



In [21]:

    
import osmdigest.utils.cbtogen as cbtogen



In [22]:

    
class Handler(digest.OSMDataHandler):
    def __init__(self, delegate):
        self.delegate = delegate
        
    def start(self, osm):
        self.delegate.notify(osm)
        
    def bounds(self, bounds):
        self.delegate.notify(bounds)

    def node(self, x):
        self.delegate.notify(x)

    def way(self, x):
        self.delegate.notify(x)

    def relation(self, x):
        self.delegate.notify(x)
        
generator = cbtogen.CallbackToGenerator()
handler = Handler(generator)
def func():
    digest.parse_callback("isle-of-wight-latest.osm", handler)
generator.set_callback_function(func)

possible_node_tags = set()
possible_way_tags = set()
possible_relation_tags = set()

with generator:
    start = datetime.datetime.now()
    for x in generator:
        if isinstance(x, digest.OSM) or isinstance(x, digest.Bounds):
            print(x)
        elif isinstance(x, digest.Node):
            for key in x.tags.keys():
                possible_node_tags.add(key)
        elif isinstance(x, digest.Way):
            for key in x.tags.keys():
                possible_way_tags.add(key)
        elif isinstance(x, digest.Relation):
            for key in x.tags.keys():
                possible_relation_tags.add(key)
    print("Took {}".format(datetime.datetime.now()-start))
    
len(possible_node_tags), len(possible_way_tags), len(possible_relation_tags)









    



OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28)
Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102]
Took 0:00:16.574323






    Out[22]:





(335, 484, 151)

Conclusion

Using 2 threads to convert from a callback to a generator might be fun, but it's not performant. At all.

I suspect what happens is that the inter-thread communication (and whatever context switching Python does, as CPYthon is essentially single threaded) adds a certain overhead. It we put this overhead at the XML parsing level, then we generate a huge number of temporary objects which are pushed onto the queue only to be removed and essentially ignored (e.g. "character" messages). Those events which aren't ignored are often converted into a much smaller number of OSM specific objects (e.g. a way with many tags and node references yields only osm object but could be 50 XML events).

Check bounding box

We'll check our example to see how the bounding box actually reflects the data.

Quite a few nodes fall outside of the bounding box
However, almost all ways feature at least one node in the bounding box,
and for every way, all nodes can be found in the dataset.
Conversely, there are a few relations which feature nodes / ways not in the dataset
but all relations in the dataset have some node or way or sub-relation in the dataset



In [23]:

    
gen = digest.parse("isle-of-wight-latest.osm")
osm, bounds = next(gen), next(gen)

osm, bounds









    Out[23]:





(OSM(version=0.6, generator=osmconvert 0.8.5, timestamp=2017-04-25 20:43:28),
 Bounds(longitude:[-1.659074,-1.0313699], latitude:[50.50555,50.80102])



In [24]:

    
minlon, maxlon, minlat, maxlat = None, None, None, None
nodes, ways, relations = [], [], []
for el in digest.parse("isle-of-wight-latest.osm"):
    if el.name == "node":
        nodes.append(el)
        if minlon is None or el.longitude < minlon:
            minlon = el.longitude
        if maxlon is None or el.longitude > maxlon:
            maxlon = el.longitude
        if minlat is None or el.latitude < minlat:
            minlat = el.latitude
        if maxlat is None or el.latitude > maxlat:
            maxlat = el.latitude
    elif el.name == "way":
        ways.append(el)
    elif el.name == "relation":
        relations.append(el)
        
minlon, maxlon, minlat, maxlat









    Out[24]:





(-5.4136531, -0.9118864, 43.3548941, 50.8959907)



In [25]:

    
node_ids_in_bb = { node.osm_id for node in nodes if
    node.longitude >= bounds.min_longitude and node.longitude <= bounds.max_longitude
    and node.latitude >= bounds.min_latitude and node.latitude <= bounds.max_latitude }
node_ids = { node.osm_id for node in nodes }
way_ids = { way.osm_id for way in ways }



In [26]:

    
# Find ways which feature no node in the bounding box
for way in ways:
    if not any(noderef in node_ids_in_bb for noderef in way.nodes):
        print(way)









    



Way(10157044 ->  [13718394, 13718395, 13718398, 13718404, 13718407, 13718412] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(10157047 ->  [256790402, 13718604, 13718617, 13718621, 13718629, 2101289190, 13718635, 1691252673, 13718647, 13718651, 2101289129, 2101289124, 2101289121, 1691252669, 13718664, 2101289120, 2101289117, 2101289114, 1691252671, 2101289112, 2101289105, 13718674, 256790387, 256790385, 2101289076, 2101289059, 2101289053, 2101289050, 2101289045, 13718692, 2101289044, 2101289029, 2101289030, 2101289017, 2101289018, 13718700, 2101289011, 13718705, 2101289000, 13718710, 13718712, 13718718, 13718723, 2106275306, 13718733, 2106275303, 2106275301, 2106275300, 2106275298, 2106275295, 2106275294, 2106275292, 2106275290, 3101656908, 2106275287, 3101656839, 2106275284, 2106275281, 2106275279, 2106275275, 2106275270, 2106275265] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(39498695 ->  [13717968, 2101289492, 2101289491, 2101289490, 2101289488, 13717971, 13717975, 2101289486, 13717981] {'source': 'PGS', 'natural': 'coastline'})
Way(39498700 ->  [13717981, 2101289485, 2101289484, 2101289482, 2101289481, 13717989, 310253725, 13718001, 13718006, 13718033, 2101289473, 2101289471, 2101289470, 13718036, 13718037, 13718050, 13718056, 13718061, 2101289467, 2101289465, 2101289464, 2101289461, 2101289463, 2101289462, 13718073, 2101289460, 13718082, 13718085, 2101289459, 13718109, 13718125, 13718147, 13718150, 13718178, 13718191, 13718225, 13718239, 13718247, 13718275, 13718284, 13718293, 13718303, 13718308, 13718315, 13718333, 13718346, 13718353, 13718358, 13718363, 13718369, 13718373, 13718385, 13718392, 13718394] {'source': 'OS_OpenData_StreetView', 'natural': 'coastline'})
Way(199346431 ->  [13718412, 13718418, 13718419, 13718424, 13718427, 13718432, 2101289398, 13718436, 2101289401, 2101289403, 2101289405, 2101289407, 2101289404, 2101289406, 13718441, 2101289402, 13718446, 13718451, 13718456, 310253759, 13718459, 13718464, 13718469, 1691252687, 13718472, 13718476, 33318734, 33318735, 1691252697, 1691252695, 33318736, 321789287, 1691252702, 321789288, 33318737, 1691252700, 33318738, 600200813, 600200815, 600200817, 600200819, 246254949, 246254950, 1691252688, 1691252690, 1691252693, 13718498, 13718504, 1691252689, 13718513, 13718526, 13718539, 1691252680, 13718557, 13718564, 13718570, 13718571, 13718575, 13718578, 2101289351, 13718588, 2101289231, 2101289226, 13718591, 2101289209, 1691252675, 1691252678, 4633590097, 2101289211, 256790402] {'source': 'PGS & Bing', 'natural': 'coastline'})
Way(200662663 ->  [2106275265, 2106275264, 2106275263, 13718813, 2106275262] {'source': 'PGS', 'natural': 'coastline'})
Way(200662665 ->  [2106275262, 2106275261, 2106275258, 2106275248, 2106275242, 2106275238, 2106275236, 2106275234] {'source': 'OS_OpenData_StreetView', 'natural': 'coastline'})
Way(222281730 ->  [13713007, 13713009, 13713012, 13713015, 13713026, 13713028, 13713031, 1241434785, 13713038, 13713048, 13713050, 13713051, 13713053, 13713066, 13713075, 13713079, 13713080, 13713081, 13713082, 13713084, 13713085, 3804317987, 13713106, 13713123, 13713126, 13713133, 13713136, 13713144, 13713154, 13713166, 13713188, 1241434535, 13713189, 13713190, 287362645, 3804318018, 4277941860, 4277941859, 1241434798] {'source': 'PGS', 'natural': 'coastline'})
Way(263265337 ->  [1241434798, 13717968] {})



In [27]:

    
# Check that all nodes in each way are in the dataset
for way in ways:
    if not all(noderef in node_ids for noderef in way.nodes):
        print(way)



In [28]:

    
relations = {rel.osm_id : rel for rel in relations}



In [29]:

    
# Find relations which feature a member not in the dataset
for rel in relations.values():
    failed = False
    for member in rel.members:
        if member.type == "node" and member.ref not in node_ids:
            failed = True
        elif member.type == "way" and member.ref not in way_ids:
            failed = True
        elif member.type == "relation" and member.ref not in relations:
            failed = True
    if failed:
        print(rel.osm_id)



In [30]:

    
def relation_has_member_in_bb(rel):
    for member in rel.members:
        if member.type == "node" and member.ref in node_ids_in_bb:
            return True
        elif member.type == "way" and member.ref in way_ids:
            return True
        elif member.type == "relation" and member.ref in relations:
            return relation_has_member_in_bb(relations[member.ref])
    return False

all(relation_has_member_in_bb(rel) for rel in relations.values())









    Out[30]:





True



In [ ]: