# In [2]:
from lxml import etree
from lxml.etree import tostring
from itertools import chain
# In [3]:
# Parse the simplewiki pages-articles XML dump and keep its root element.
# The path is a raw string so the backslash is taken literally: '\s' is not a
# valid escape sequence in an ordinary string literal and triggers a warning
# on current Python versions.
xml = etree.parse(r'C:\simplewiki-latest-pages-articles.xml')
root = xml.getroot()
# In [5]:
def stringify_children(node):
    """Return the text of *node* with its child elements serialised as markup.

    The result is ``node.text``, followed by the serialisation of every
    direct child (each one including its own tail text), followed by
    ``node.tail``. Pieces that are ``None`` or empty are dropped.

    Args:
        node: An lxml element.

    Returns:
        The concatenated text, or ``''`` when the element carries no text.
    """
    parts = [node.text]
    # tostring() already emits the child's inner text and, by default, its
    # tail, so adding c.text / c.tail separately would duplicate them.
    # encoding='unicode' makes it return text instead of bytes, which is what
    # ''.join() requires on Python 3.
    parts.extend(tostring(child, encoding='unicode') for child in node)
    # The node's own tail is kept so existing callers see the same trailing
    # text as before.
    parts.append(node.tail)
    return ''.join(filter(None, parts))
# Write every <page> of the dump to its own UTF-8 text file, named after a
# sanitised form of the page title cut to at most 30 characters.
MW_NS = "{http://www.mediawiki.org/xml/export-0.10/}"
OUT_DIR = "C:/Users/illus_000/Documents/GitHub/korchevatel-babushkina/corpus/simplewiki/"
# Characters replaced by '_' in file names. Besides the original set
# (: / * ? \) this includes < > |, which Windows also rejects in file names.
UNSAFE_CHARS = u':/*?\\<>|'
MAX_TITLE_LEN = 30

for page in root.findall(MW_NS + "page"):
    title = page.find(MW_NS + "title")
    text = page.find(MW_NS + "revision/" + MW_NS + "text")
    if title is None or text is None:
        # find() returns None for a missing element; skip such pages instead
        # of failing with AttributeError part-way through the export.
        continue

    # Double quotes are removed outright; every other unsafe character is
    # replaced by an underscore.
    u_title = stringify_children(title).strip().replace(u'"', '')
    for unsafe in UNSAFE_CHARS:
        u_title = u_title.replace(unsafe, u'_')
    # Slicing is a no-op for shorter titles, so no length check is needed.
    # NOTE: titles sharing the same first 30 characters map to the same file,
    # and the later page overwrites the earlier one.
    u_title = u_title[:MAX_TITLE_LEN]

    with open(OUT_DIR + u_title + ".txt", 'wb') as f:
        u_text = stringify_children(text).encode('utf-8')
        f.write(u_text)