In [36]:
import os
import lxml
from lxml import etree
import xmltodict, sys, gc
from pymongo import MongoClient
#help(lxml.etree._Element)

快速遍历 XML 文档,采用迭代器模式。

# Stream every <Title> element of `infile` and write its text to `out`, one per line.
# `infile` and `out` are not defined in this snippet; they must already exist.
# NOTE(review): on Python 3, formatting the encoded bytes with '%s' produces the
# "b'...'" repr instead of the text itself — confirm the target Python version
# and the mode `out` was opened in.
context = etree.iterparse(infile, events=('end',), tag='Title')  
for event, elem in context:  
       out.write('%s\n' % elem.text.encode('utf-8'))

迭代读取 OSM 文件,写入 MongoDB(当前代码实际将 node、way、relation 元素分别写入文本文件,MongoDB 写入部分已停用)。


In [1]:
# Old default-encoding workaround, left disabled:
# reload(sys)
# sys.setdefaultencoding("utf-8")

# Handles to the `streets` collection inside the `re` database.
client = MongoClient()
db = client["re"]
streetsDB = db["streets"]

# `highway` tag values of interest; only referenced by the disabled MongoDB
# filtering code further down in this cell.
hwTypes = [
    'motorway',
    'trunk',
    'primary',
    'secondary',
    'tertiary',
    'pedestrian',
    'unclassified',
    'service',
]

# Enable garbage collection
gc.enable()

# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# Author: Liza Daly
def fast_iter(context, func):
    """Apply ``func`` to every element yielded by ``context``, then discard it.

    Args:
        context: Iterable of ``(event, element)`` pairs, e.g. the object
            returned by ``etree.iterparse``.
        func: Callable invoked once per element, before the element is cleared.

    Returns:
        None.
    """
    print('Process...')
    for _event, elem in context:
        func(elem)
        # Drop the element's own content now that func has seen it.
        elem.clear()
        # Delete the siblings in front of this element: they have already
        # been handled and would otherwise stay attached to the parent.
        while elem.getprevious() is not None:
            parent = elem.getparent()
            if parent is None:
                # A parentless element that still reports a previous sibling
                # has nothing to delete from; stop instead of failing.
                break
            del parent[0]
    # Release this function's reference to the parser context.
    del context

# Output files, one per element type handled by process_element(), which
# writes to these module-level handles. Mode "w+" truncates an existing file.
fnode = open("../data/fnode.txt","w+")
fway = open("../data/fway.txt","w+")
frelation = open("../data/frelation.txt","w+")

def process_element(elem):
    """Append the serialized XML of ``elem`` to the output file matching its tag.

    ``node``, ``way`` and ``relation`` elements are written to the
    module-level file objects ``fnode``, ``fway`` and ``frelation``
    respectively; elements with any other tag are ignored.

    Args:
        elem: Element to serialize with ``etree.tostring``.
    """
    if elem.tag == "node":
        sink = fnode
    elif elem.tag == "way":
        sink = fway
    elif elem.tag == "relation":
        sink = frelation
    else:
        return
    # Serialize exactly once; every record is terminated with "\r\n".
    sink.write(etree.tostring(elem).decode('utf-8') + "\r\n")
# Disabled code: the contents of the bare triple-quoted string below are never
# executed. It holds the former MongoDB path, which parsed the serialized
# element with xmltodict and inserted ways whose `highway` tag value is listed
# in hwTypes into streetsDB.
'''
    data = xmltodict.parse(data)
    keys = data['way'].keys()
    if 'tag' in keys:
        if isinstance(data['way']['tag'], dict):
                    if data['way']['tag']['@k'] == 'highway':
                        if data['way']['tag']['@v'] in hwTypes:
                            streetsDB.insert(data)
        else:
            for y in data['way']['tag']:
                if y['@k'] == 'highway':
                    if y['@v'] in hwTypes:
                        streetsDB.insert(data)
                        break

    del data
    del keys
    gc.collect()
'''
#
# Stream ../data/muenchen.osm, handing every way/node/relation element to
# process_element. To export a single element type, pass e.g. tag='node'.
context = etree.iterparse('../data/muenchen.osm', tag=['way','node','relation'] )
try:
    fast_iter(context, process_element)
finally:
    # Close the output files even if parsing fails part-way, so buffered
    # data is flushed and the handles are not leaked.
    fnode.close()
    fway.close()
    frelation.close()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-5f8795d80dfe> in <module>()
      3 #sys.setdefaultencoding("utf-8")
      4 
----> 5 client = MongoClient()
      6 db = client.re
      7 streetsDB = db.streets

NameError: name 'MongoClient' is not defined
class TitleTarget(object):
    """Parser target that collects the text of every ``Title`` element.

    The parser calls ``start``/``end``/``data`` while reading the document;
    the list returned by ``close`` is what the parse call hands back.
    """

    def __init__(self):
        self.text = []
        # Initialised here so data() is safe even before the first start().
        self.is_title = False

    def start(self, tag, attrib):
        """Remember whether the element that just opened is a ``Title``."""
        self.is_title = tag == 'Title'

    def end(self, tag):
        """Stop collecting as soon as any element closes.

        Text that follows a closing tag belongs to the parent element, so it
        must not be recorded as title text.
        """
        self.is_title = False

    def data(self, data):
        """Store a chunk of character data, UTF-8 encoded, while inside a ``Title``."""
        if self.is_title:
            self.text.append(data.encode('utf-8'))

    def close(self):
        """Return the collected title chunks (a list of UTF-8 encoded strings)."""
        return self.text


parser = etree.XMLParser(target = TitleTarget())

# This and most other samples read in the Google copyright data
infile = 'copyright.xml'

results = etree.parse(infile, parser)

# When iterated over, 'results' will contain the output from
# target parser's close() method.
# The chunks are encoded bytes, so join and write them in binary mode; the
# with-block closes the file even if the write fails.
with open('titles.txt', 'wb') as out:
    out.write(b'\n'.join(results))