notebook.community

Edit and run



In [2]:

    
from lxml import etree
from lxml.etree import tostring
from itertools import chain



In [3]:

    
# Parse the whole Simple English Wikipedia dump into memory and keep its root
# element for the export loop further down.
# The path is a raw string: in a normal literal "\s" is an invalid escape
# sequence, which newer Python versions warn about and will eventually reject.
xml = etree.parse(r'C:\simplewiki-latest-pages-articles.xml')
root = xml.getroot()



In [5]:

    
def stringify_children(node):
    """Return the content of ``node`` as a single text string.

    The result is the node's leading text, followed by the serialized markup
    of every child element (each child's trailing text included), followed by
    the node's own tail text.

    Parameters
    ----------
    node : element
        An lxml element, or any object exposing ``text``, ``tail`` and
        iteration over its child elements.

    Returns
    -------
    str
        The concatenated content; an empty string when the node has no text,
        no children and no tail.
    """
    parts = [node.text]
    # tostring() already emits a child's text and its tail, so adding c.text
    # and c.tail separately would duplicate them. encoding='unicode' makes it
    # return text rather than bytes, which ''.join() cannot mix with str.
    parts.extend(tostring(child, encoding='unicode') for child in node)
    # node.tail is the text that follows the node's closing tag; it is kept so
    # that callers see the same output as before for nodes without children.
    parts.append(node.tail)
    # text/tail are None when absent; filter() drops those (and empty strings).
    return ''.join(filter(None, parts))

# XML namespace prefix used for the element names looked up below.
MW_NS = "{http://www.mediawiki.org/xml/export-0.10/}"
# Directory that receives one .txt file per page; open() does not create it.
OUTPUT_DIR = "C:/Users/illus_000/Documents/GitHub/korchevatel-babushkina/corpus/simplewiki/"
# Maximum number of title characters used in a file name.
MAX_TITLE_LEN = 30

# Lower-cased file stems already written during this run. Truncation makes
# different titles share a stem, and names differing only in case are the
# same file on Windows, so a later page would silently overwrite an earlier one.
used_stems = set()

for page in root.findall(MW_NS + "page"):
    title = page.find(MW_NS + "title")
    text = page.find(MW_NS + "revision/" + MW_NS + "text")
    if title is None or text is None:
        # find() returns None for a missing element; such a page cannot be
        # named or has nothing to write, so skip it instead of crashing.
        continue

    # Build a file-system friendly name: drop double quotes and replace the
    # other characters that are not allowed in file names with underscores.
    u_title = stringify_children(title).strip().replace(u'"', u'')
    for forbidden in u':/*?\\':
        u_title = u_title.replace(forbidden, u'_')
    u_title = u_title[:MAX_TITLE_LEN]

    # The first page with a given stem keeps the plain name; later collisions
    # get a numeric suffix (_2, _3, ...) so that no page is lost.
    stem = u_title
    suffix = 1
    while stem.lower() in used_stems:
        suffix += 1
        stem = u_title + u'_' + str(suffix)
    used_stems.add(stem.lower())

    # Encode explicitly and write bytes so the result does not depend on the
    # platform's default text encoding.
    with open(OUTPUT_DIR + stem + ".txt", 'wb') as f:
        u_text = stringify_children(text).encode('utf-8')
        f.write(u_text)