In [9]:
#!/usr/bin/python
#coding=utf-8

OSM文件分类捡取工具。

by openthings@163.com, 2016-05-04.

将osm文件按照tag分类,并转为不同的文件,以方便后续的处理。

  • 每一个tag对象转为独立的一行(去掉换行符),以便Spark读入。
  • 采用递归方式处理,占用内存较少,可以处理大型文件。

后续工作:

每一tag对象数据转为dict并保存为json到一行。
每一tag对象数据转为wkt格式。


In [2]:
import os
import lxml
from lxml import etree
import xmltodict, sys, gc
from pymongo import MongoClient

gc.enable() #Enable Garbadge Collection

client = MongoClient()
db = client.re
streetsDB = db.streets

hwTypes = ['motorway', 'trunk', 'primary', 'secondary', 'tertiary', 'pedestrian', 'unclassified', 'service']

递归方式读取osm的xml结构数据。

http://www.ibm.com/developerworks/xml/library/x-hiperfparse/


In [39]:
def process_element(elem):
    print("element:",str(elem.attrib))
    if (elem.tag=="node"): 
        fnode.write((etree.tostring(elem).decode('utf-8'))+"\r\n")
    elif (elem.tag=="way"): 
        fway.write((etree.tostring(elem).decode('utf-8'))+"\r\n")
    elif (elem.tag=="relation"): 
        frelation.write((etree.tostring(elem)).decode('utf-8')+"\r\n")
    data = etree.tostring(elem)
    
    #data = etree.tostring(elem)
    #data = xmltodict.parse(data)
    #print(data.decode('ascii'))
    #print(str(elem))

快速迭代处理,func为迭代的element处理函数。


In [165]:
from pprint import *

def fast_iter(context, func, file, maxline):
    print('Process XML...')
    placement = 0
    try:
        for event, elem in context:
            placement += 1
            if (maxline > 0):
                if (placement >= maxline): break
            print(placement,"elem: ")

            #print("element",str(elem.attrib)) 
            data = etree.tostring(elem)
            print(data)
            
            global data2
            data2 = xmltodict.parse(data)
            pprint(data2)

            #if (file):
            #    file.write(str(elem.attrib) + "\n")
            #else:
            #    print("file is null.")
            #func(elem)
                           
            elem.clear()
            #while elem.getprevious() is not None:
            #   del elem.getparent()[0]
    except Exception as ex:
        print("Error:",ex)
        
    del context

将指定tag的对象提取,写入json文件。

osmfile:输入的\.osm文件 *
tagname:'node','way','relation'


In [166]:
def process_tag(osmfile, tagname, maxline):
    filename_tag = osmfile + "_" + tagname + ".json"
    print("Filename output: ",filename_tag)
    ftag = open(filename_tag,"w+")
    context = etree.iterparse(osmfile, tag = tagname)
    fast_iter(context,process_element,ftag,maxline)
    ftag.close()

In [167]:
osmfile = '../data/muenchen.osm'

#process_tag(osmfile,'node',5)
process_tag(osmfile,'way',2)
#process_tag(osmfile,'relation',0)


Filename output:  ../data/muenchen.osm_way.json
Process XML...
1 elem: 
b'<way id="24665462" visible="true" version="5" changeset="31566605" timestamp="2015-05-29T16:01:23Z" user="zarl" uid="29003">\n  <nd ref="21585827"/>\n  <nd ref="3556567936"/>\n  <nd ref="2475797158"/>\n  <nd ref="268098186"/>\n  <nd ref="268101008"/>\n  <nd ref="268098217"/>\n  <nd ref="268098189"/>\n  <tag k="bicycle" v="yes"/>\n  <tag k="highway" v="footway"/>\n  <tag k="lit" v="yes"/>\n  <tag k="smoothness" v="good"/>\n  <tag k="surface" v="asphalt"/>\n </way>\n '
OrderedDict([('way',
              OrderedDict([('@id', '24665462'),
                           ('@visible', 'true'),
                           ('@version', '5'),
                           ('@changeset', '31566605'),
                           ('@timestamp', '2015-05-29T16:01:23Z'),
                           ('@user', 'zarl'),
                           ('@uid', '29003'),
                           ('nd',
                            [OrderedDict([('@ref', '21585827')]),
                             OrderedDict([('@ref', '3556567936')]),
                             OrderedDict([('@ref', '2475797158')]),
                             OrderedDict([('@ref', '268098186')]),
                             OrderedDict([('@ref', '268101008')]),
                             OrderedDict([('@ref', '268098217')]),
                             OrderedDict([('@ref', '268098189')])]),
                           ('tag',
                            [OrderedDict([('@k', 'bicycle'), ('@v', 'yes')]),
                             OrderedDict([('@k', 'highway'),
                                          ('@v', 'footway')]),
                             OrderedDict([('@k', 'lit'), ('@v', 'yes')]),
                             OrderedDict([('@k', 'smoothness'),
                                          ('@v', 'good')]),
                             OrderedDict([('@k', 'surface'),
                                          ('@v', 'asphalt')])])]))])

In [152]:
pprint(data2)


OrderedDict([('way',
              OrderedDict([('@id', '24665462'),
                           ('@visible', 'true'),
                           ('@version', '5'),
                           ('@changeset', '31566605'),
                           ('@timestamp', '2015-05-29T16:01:23Z'),
                           ('@user', 'zarl'),
                           ('@uid', '29003'),
                           ('nd',
                            [OrderedDict([('@ref', '21585827')]),
                             OrderedDict([('@ref', '3556567936')]),
                             OrderedDict([('@ref', '2475797158')]),
                             OrderedDict([('@ref', '268098186')]),
                             OrderedDict([('@ref', '268101008')]),
                             OrderedDict([('@ref', '268098217')]),
                             OrderedDict([('@ref', '268098189')])]),
                           ('tag',
                            [OrderedDict([('@k', 'bicycle'), ('@v', 'yes')]),
                             OrderedDict([('@k', 'highway'),
                                          ('@v', 'footway')]),
                             OrderedDict([('@k', 'lit'), ('@v', 'yes')]),
                             OrderedDict([('@k', 'smoothness'),
                                          ('@v', 'good')]),
                             OrderedDict([('@k', 'surface'),
                                          ('@v', 'asphalt')])])]))])

In [164]:
for i in data2["way"]["nd"]:
    print("nd=",i["@ref"])


nd= 21585827
nd= 3556567936
nd= 2475797158
nd= 268098186
nd= 268101008
nd= 268098217
nd= 268098189

In [163]:
for i in data2["way"]["tag"]:
    print(i["@k"],"=",i["@v"])


bicycle = yes
highway = footway
lit = yes
smoothness = good
surface = asphalt

In [171]:
import json
jsonStr = json.dumps(data2)
pprint(jsonStr)


('{"way": {"@id": "24665462", "@visible": "true", "@version": "5", '
 '"@changeset": "31566605", "@timestamp": "2015-05-29T16:01:23Z", "@user": '
 '"zarl", "@uid": "29003", "nd": [{"@ref": "21585827"}, {"@ref": '
 '"3556567936"}, {"@ref": "2475797158"}, {"@ref": "268098186"}, {"@ref": '
 '"268101008"}, {"@ref": "268098217"}, {"@ref": "268098189"}], "tag": [{"@k": '
 '"bicycle", "@v": "yes"}, {"@k": "highway", "@v": "footway"}, {"@k": "lit", '
 '"@v": "yes"}, {"@k": "smoothness", "@v": "good"}, {"@k": "surface", "@v": '
 '"asphalt"}]}}')

In [182]:
jsonobj = json.loads(jsonStr)
pprint(jsonobj)


{'way': {'@changeset': '31566605',
         '@id': '24665462',
         '@timestamp': '2015-05-29T16:01:23Z',
         '@uid': '29003',
         '@user': 'zarl',
         '@version': '5',
         '@visible': 'true',
         'nd': [{'@ref': '21585827'},
                {'@ref': '3556567936'},
                {'@ref': '2475797158'},
                {'@ref': '268098186'},
                {'@ref': '268101008'},
                {'@ref': '268098217'},
                {'@ref': '268098189'}],
         'tag': [{'@k': 'bicycle', '@v': 'yes'},
                 {'@k': 'highway', '@v': 'footway'},
                 {'@k': 'lit', '@v': 'yes'},
                 {'@k': 'smoothness', '@v': 'good'},
                 {'@k': 'surface', '@v': 'asphalt'}]}}

In [186]:
jsonobj["way"]["tag"]


Out[186]:
[{'@k': 'bicycle', '@v': 'yes'},
 {'@k': 'highway', '@v': 'footway'},
 {'@k': 'lit', '@v': 'yes'},
 {'@k': 'smoothness', '@v': 'good'},
 {'@k': 'surface', '@v': 'asphalt'}]

In [ ]: