In [197]:
import sys
import os
import os.path
import fnmatch
from bs4 import BeautifulSoup as soup
from bs4 import Comment
from lxml.html.soupparser import fromstring
from lxml import etree
from lxml.etree import tostring
from collections import OrderedDict
import re
from io import StringIO, BytesIO
from copy import deepcopy
import xml.dom.minidom as minidom

In [198]:
def cleanAtts(soup, listAtt, name):
    """Strip all attributes from matching elements, keeping the elements themselves.

    soup    -- parsed document (anything exposing ``find_all(name, attrs)``).
    listAtt -- iterable of ``rend`` values to look for.
    name    -- tag name of the elements to inspect.

    Every ``name`` element whose ``rend`` attribute equals one of the values in
    ``listAtt`` has its attribute dictionary emptied in place. Returns None.
    """
    for rendValue in listAtt:
        selector = {"rend": rendValue}
        for element in soup.find_all(name, selector):
            element.attrs.clear()

In [199]:
def divRend(soup, listAtt, name):
    """Rename <div rend="..."> elements to `name` and drop all their attributes.

    soup    -- parsed document (anything exposing ``find_all(name, attrs)``).
    listAtt -- iterable of ``rend`` values selecting the <div> elements.
    name    -- new tag name given to every selected <div>.

    The document is modified in place. Returns None.
    """
    for rendValue in listAtt:
        selected = soup.find_all('div', {"rend": rendValue})
        for element in selected:
            element.attrs.clear()
            element.name = name

In [200]:
def elemExtract(elems, soup=None):
    """Remove from the document every element whose tag name is listed in `elems`.

    elems -- iterable of tag names to remove.
    soup  -- document to clean (anything exposing ``find_all(name)``). When
             omitted, the module-level ``xmlSoup`` is used, which is what
             existing callers rely on; passing the document explicitly avoids
             depending on that hidden notebook state.

    The document is modified in place. Returns None.
    """
    target = xmlSoup if soup is None else soup
    for tagName in elems:
        # Plain loop instead of a list comprehension used only for its side effects.
        for node in target.find_all(tagName):
            node.extract()

In [201]:
# Tags that are unwrapped (replaced by their own children) inside converted blocks.
# NOTE(review): a later cell rebinds this module-level name to ['p', 'hi'] before
# quoting() is first called, so at run time <hi> is presumably unwrapped here as
# well -- confirm which of the two lists is intended for quotations.
inv_tags=['p']
def quoting(soup,listVerse,att):
    """Turn <div rend="..."> blocks into `att` elements and report each rend found.

    soup      -- parsed document (anything exposing ``find_all(name, attrs)``).
    listVerse -- iterable of ``rend`` values identifying the <div> to convert.
    att       -- tag name given to the converted elements.

    For every matching <div>, the descendants named in the module-level
    ``inv_tags`` are unwrapped, all attributes are dropped and the element is
    renamed to `att`. One message is printed per ``rend`` value that actually
    matched at least one element. Returns `soup` (modified in place).
    """
    for verse in listVerse:
        # The flag is local to each rend value: previously it was never reset, so
        # after the first hit every following rend was reported as present.
        found=False
        for a in soup.find_all('div',{"rend" : verse}):
            for tag in inv_tags:
                for match in a.find_all(tag):
                    match.unwrap()
            a.attrs.clear()
            a.name=att
            found=True
        if found:
            print("+|+|+|+ Il y a quote avec le rend : "+verse)
    return soup

In [202]:
# Tags that are unwrapped (replaced by their own children) inside converted blocks.
inv_tags=['p', 'hi']


def _unwrapTags(node, tagNames):
    """Replace every descendant of `node` named in `tagNames` by its own children."""
    for tagName in tagNames:
        for match in node.find_all(tagName):
            match.unwrap()


def _toDedication(soup, node, lineSep):
    """Rewrite `node` in place as <div type="dedication"><epigraph>text</epigraph></div>.

    The text of `node` is flattened: runs of newlines are collapsed to one, then
    each remaining newline is replaced by `lineSep`.
    """
    _unwrapTags(node, inv_tags)
    collapsed = re.sub(r"\n{2,}", "\n", node.text)
    epigraph = soup.new_tag("epigraph")
    epigraph.string = re.sub(r"\n", lineSep, collapsed)
    node.clear()
    node.name = "div"
    node["type"] = "dedication"
    node.append(epigraph)


def dedication(soup):
    """Convert epigraph/dedication blocks into <div type="dedication"> elements.

    soup -- parsed document exposing ``find_all(name, attrs)`` and ``new_tag(name)``.

    Three kinds of source elements are handled, in this order:
    <quote rend="epigraphe">, <div n="Epigraphe"> and <div n="Dédicace">.
    A message is printed for each kind found at least once.
    Returns `soup` (modified in place).
    """
    # (tag name, attribute filter, newline replacement, message printed on a hit)
    # NOTE(review): the first two rules join lines with " ," while the third uses
    # ", "; the former looks like a typo but is kept as-is to preserve the output.
    rules = (
        ('quote', {"rend" : "epigraphe"}, " ,", "+.+.+.+ Il y a dédicace"),
        ('div', {"n" : "Epigraphe"}, " ,", "+.+.+.+ Il y a dédicace2"),
        ('div', {"n" : "Dédicace"}, ", ", "+.+.+.+ Il y a dédicace3"),
    )
    for tagName, attrs, lineSep, message in rules:
        found = soup.find_all(tagName, attrs)
        for node in found:
            _toDedication(soup, node, lineSep)
        if found:
            print(message)
    return soup

In [ ]:
# Lookup tables driving the clean-up. All of them are plain sets of strings.

# `rend` values considered purely presentational (listed alphabetically).
rends = {
    'amanuensisautosmallcaps',
    'blocktext', 'blocktextpblocktext', 'blocktextpbrblocktext',
    'blocktextpcblocktext', 'bold',
    'c', 'calibre', 'calibrecalibre', 'calibretitpartl', 'captionmatter',
    'captionpc', 'captionpcbr', 'chap', 'chapn', 'chapno', 'chapter',
    'dev', 'divautreappen', 'divprefpre',
    'encdef',
    'frontmatter',
    'identauteuridentcenterc', 'identtitidentcenterc', 'illustypeimagetext',
    'linei',
    'niv',
    'ov',
    'pagecopyright', 'part', 'pblancblocktext', 'pbrblocktext', 'pc', 'pcbr',
    'pd', 'pdblocktext', 'pgmonospacedpgheader', 'pindent', 'pindentinverse',
    'pre',
    'realspc',
    'schap', 'sepetoile', 'sl', 'startlinepictos', 'subtitlechapter',
    'subtitlefrontmatter',
    't', 'titchapl', 'titchapltitcenter', 'titchapltitjustify',
    'titchapltitleft', 'titfblkc', 'titlblkidenteditidentcenter',
    'titlechapter', 'titlefrontmatter', 'titlesection', 'titlinetitcenter',
    'titlinetitcenterchapno', 'titpartl', 'titpartltitleft',
    'txtcourantjustif',
}

# `rend` values of dedication-like blocks.
salutes = {'dedicace', 'indentdedicaces'}

# `rend` values of prose quotations.
quotes = {
    'blockquote',
    'cita',
    'citation',
    'poetrypoetryintfigureadvertisementfigureadvertisement',
}

# `rend` values of verse quotations.
verses = {
    'citastroplg',
    'poem',
    'poemstanza',
    'poetrycontainerpoetrystanza',
    'poetrypoetryintcalibrestropint',
    'poetrypoetryintstropstrop',
    'poetrystrop',
    'poetrystropcentre',
}

# Tag names to delete.
toRem = {
    'meta',
    'dc:contributor', 'dc:description', 'dc:language', 'dc:identifier',
    'dc:rights', 'dc:subjects',
    'graphic',
    '?xml-model',
}

# Words used to classify a section title (chapter / book / part / preface).
listChap = {"chapitre", "Chapitre", "CHAPITRE"} | set("0123456789")
listBook = {"livre", "Livre", "LIVRE"}
listPart = {"Partie", "partie", "PARTIE"}
listPreface = {
    "Préface", "Preface", "PREFACE", "PRÉFACE",
    "Préliminaire", "PRELIMINAIRE", "PRÉLIMINAIRE",
}

# Substrings of section titles marking sections to delete.
sectToDel = {
    "propos de cette édition numérique",
    "propos de cette édition électronique",
    "START: FULL LICENSE",
    "Œuvres de ",
    "Page de titre",
    "Page de Titre",
    "Table des Matières",
    "Table des matières",
    "TABLE DES MATIÈRES",
    "TABLE",
    "Page de Copyright",
    "Page de copyright",
    "Copyright",
    "Achevé de numériser",
    "Couverture",
    "Du même auteur",
}

In [ ]:
ns = {'dc': 'http://purl.org/dc/elements/1.1/'}

# Directories used by the pipeline.
# NOTE(review): machine-specific absolute paths; change BASE_DIR to run elsewhere.
BASE_DIR = '/home/odysseus/Bureau/Bureau/ANR/testsCode'
INPUT_DIR = os.path.join(BASE_DIR, 'output')     # TEI files to clean
FINAL_DIR = os.path.join(BASE_DIR, 'final')      # cleaned files; a file present here is skipped
STATS_PATH = os.path.join(BASE_DIR, 'stats.txt')

# Elements removed outright near the end of the clean-up: (tag name, attribute filter).
# Some entries can no longer match at that point (the element was renamed or its
# attributes were cleared earlier); they are kept so the removal set is unchanged.
ELEMENTS_TO_DROP = (
    ('meta', {}), ('table', {}), ('foreign', {}),
    ('dc:contributor', {}), ('dc:description', {}), ('dc:publisher', {}),
    ('dc:language', {}), ('dc:identifier', {}), ('dc:rights', {}),
    ('dc:subject', {}), ('opf:meta', {}), ('graphic', {}), ('?xml-model', {}),
    ('div', {"rend":"illustypeimage"}),
    ('div', {"rend":"som"}),
    ('div', {"rend":"realspc"}),
    ('ref', {"xml:id":"tdm"}),
    ('ref', {"xml:id":"ete_1_minotaure"}),
    ('div', {"rend":"pblanc"}),
    ('div', {"rend":"realspcc"}),
    ('seg', {"rend":"realspcc"}),
    ('div', {"rend":"vertspc"}),
    ('div', {"rend":"notes"}),
    ('div', {"rend":"defnotes"}),
)

os.makedirs(FINAL_DIR, exist_ok=True)

# The `with` block guarantees the stats file is flushed and closed even when a
# document makes the loop raise.
with open(STATS_PATH, 'w') as stats:
    for fileTemp in fnmatch.filter(os.listdir(INPUT_DIR), '*.xml'):
        fileTemp=fileTemp.replace("/",":")
        print("\n"+fileTemp)
        stats.write("\n"+fileTemp+"\n")

        # Skip documents that already have a cleaned version.
        finalPath = os.path.join(FINAL_DIR, fileTemp)
        if os.path.isfile(finalPath):
            continue

        with open(os.path.join(INPUT_DIR, fileTemp)) as teiFile:
            teiSource = teiFile.read()

        # `xmlSoup` stays a module-level name: elemExtract() reads it by default.
        xmlSoup = soup(teiSource, 'html.parser')
        for element in xmlSoup(text=lambda text: isinstance(text, Comment)):
            element.extract()

        # --- Dublin Core clean-up ---
        for a in xmlSoup.find_all("dc:creator"):
            a.name="author"
            for key in ("opf:file-as", "xmlns:dc", "xmlns:opf", "opf:role"):
                del a[key]
        for a in xmlSoup.find_all("dc:date"):
            a.name="date"
            cutDate=a.string
            # Keep only the first four characters; `.string` is None when the
            # element does not hold a single text node.
            if cutDate:
                a.string=cutDate[:4]
            del a["xmlns:dc"]
        for a in xmlSoup.find_all("dc:subject"):
            a.name="subject"
            a.attrs.clear()
        for a in xmlSoup.find_all("dc:title"):
            a.name="title"
            del a["xmlns:dc"]
        for a in xmlSoup.find_all("dc:publisher"):
            a.attrs.clear()
            a.name="publisher"

        # --- italics: every <hi>/<emph> becomes <hi rend="italic"> ---
        for a in xmlSoup.find_all("hi"):
            a.attrs.clear()
            a["rend"]="italic"
        for a in xmlSoup.find_all("emph"):
            a.name="hi"
            a.attrs.clear()
            a["rend"]="italic"

        # --- strip attributes from <p> ---
        for a in xmlSoup.find_all('p'):
            a.attrs.clear()

        # --- quotations (temporary tag names, renamed after the lxml rebuild) ---
        quoting(xmlSoup,quotes,"quotecita")
        quoting(xmlSoup,verses,"quoteverse")

        # --- clean <head> ---
        for a in xmlSoup.find_all('head'):
            a.attrs.clear()
            if "À propos de cette édition numérique" in a.text:
                a.extract()

        # --- clean presentational div/quote/seg/dfn ---
        divRend(xmlSoup,rends,"p")
        cleanAtts(xmlSoup,rends,"div")
        cleanAtts(xmlSoup,rends,"quote")
        cleanAtts(xmlSoup,rends,"seg")
        cleanAtts(xmlSoup,rends,"dfn")
        dedication(xmlSoup)
        quoting(xmlSoup,salutes,"salute")

        for a in xmlSoup.find_all('div',{"rend" : "letter"}):
            a.attrs.clear()
            a.name="q"
            a["type"]="letter"

        # --- delete unwanted sections ---
        # The `n` requirement belongs in the attribute filter: passed as a third
        # positional argument it was taken as `recursive` and ignored, so a section
        # without `n` raised KeyError below.
        for a in xmlSoup.find_all('div',{"type" : "section", "n" : True}):
            sectionTitle=a["n"].replace(u'\xa0', ' ')
            if any(wToDel in sectionTitle for wToDel in sectToDel) or sectionTitle == "À propos":
                a.extract()
            if sectionTitle == "Fin":
                print("attention, il y a un chapitre appelé Fin supprimé")
                stats.write("attention, il y a un chapitre appelé Fin supprimé"+"\n")
                a.extract()

        # --- classify the remaining sections from the words of their title ---
        for a in xmlSoup.find_all('div',{"type" : "section", "n" : True}):
            sectionTitle=a["n"].replace(u'\xa0', ' ')
            wordsInTitle=set(sectionTitle.split(" "))
            if wordsInTitle.intersection(listBook):
                tagName, sectionType = "book", "book"
                print("+---+---+ Il y a des livres")
                stats.write("+---+---+ Il y a des livres"+"\n")
            elif wordsInTitle.intersection(listPart):
                tagName, sectionType = "part", "part"
                print("+-+-+-+ Il y a des parties")
                stats.write("+---+---+ Il y a des parties"+"\n")
            elif wordsInTitle.intersection(listPreface):
                tagName, sectionType = "div", "preface"
                print("+p+p+p+ Il y a une préface")
                stats.write("+---+---+ Il y a une préface"+"\n")
            elif "Avertissement" in wordsInTitle:
                tagName, sectionType = "div", "avertissement"
                print("+p+p+p+ Il y a un avertissement")
                stats.write("+---+---+ Il y a un avertissement"+"\n")
            else:
                tagName, sectionType = "chapter", "chapter"
            a.attrs.clear()
            a.name=tagName
            a["type"]=sectionType
            a["title"]=sectionTitle

        # --- <div type="chapter"> carrying a <head>: title taken from the head ---
        for a in xmlSoup.find_all('div',{"type" : "chapter"}):
            headTag=a.find('head')
            if headTag is None:
                continue
            a.name="chapter"
            chapTitle=headTag.string
            if chapTitle is None:
                a.extract()
            else:
                for bookWord in listBook:
                    if bookWord in chapTitle:
                        a.name="book"
                a.attrs.clear()
                a["title"]=chapTitle

        # --- notes: each reference becomes a <note> holding the text of one target ---
        invalid_tags = ['hi', 'ref','div']
        nbNotes=0
        for a in xmlSoup.find_all('ref',{"rend" : "renvoi"}):
            nbNotes+=1
            elemTarg=xmlSoup.find('div', {"rend":"notecnt"})
            if elemTarg is not None:
                a.name="note"
                a.attrs.clear()
                for tag in invalid_tags:
                    for match in elemTarg.find_all(tag):
                        match.unwrap()
                a.string=elemTarg.text
                elemTarg.extract()
            else:
                # Fallback: first <dfn> containing a back-reference.
                for dfn in xmlSoup.find_all('dfn'):
                    if len(dfn.find_all("ref",{"rend":"notenumrenvret"}))>0:
                        a.name="note"
                        a.attrs.clear()
                        for tag in inv_tags:
                            for match in dfn.find_all(tag):
                                match.unwrap()
                        # The first character of the text is dropped.
                        a.string=dfn.text[1:]
                        dfn.extract()
                        break

        for a in xmlSoup.find_all('ref',{"rend" : "apnb"}):
            nbNotes+=1
            elemTarg2=xmlSoup.find('div', {"rend":"ntb"})
            if elemTarg2 is not None:
                a.name="note"
                a.attrs.clear()
                for tag in inv_tags:
                    for match in elemTarg2.find_all(tag):
                        match.unwrap()
                a.string=re.sub("\n"," ",elemTarg2.text)
                elemTarg2.extract()

        for a in xmlSoup.find_all('ref',{"rend" : "pginternal"}):
            nbNotes+=1
            # NOTE(review): the first <note> in the document is used, and that can
            # be a reference already renamed to <note> above -- confirm intent.
            elemTarg2=xmlSoup.find('note')
            if elemTarg2 is not None:
                a.name="note"
                a.attrs.clear()
                for tag in inv_tags:
                    for match in elemTarg2.find_all(tag):
                        match.unwrap()
                a.string=re.sub("\n"," ",elemTarg2.text)
                elemTarg2.extract()

        if nbNotes>0:
            print("+#+#+#+ Il y a "+str(nbNotes)+" notes à rattacher")
            stats.write("+#+#+#+ Il y a "+str(nbNotes)+" notes à rattacher\n")

        # --- removals ---
        elemExtract(toRem)
        for tagName, attrs in ELEMENTS_TO_DROP:
            for x in xmlSoup.find_all(tagName, attrs):
                x.extract()

        # --- re-parse with lxml; default namespaces are turned into prefixed ones
        #     so that un-prefixed element names can be searched directly ---
        xmlText=str(xmlSoup)
        xmlText=xmlText.replace("xmlns=\"http://www.tei-c.org/ns/1.0\"","xmlns:tei=\"http://www.tei-c.org/ns/1.0\"")
        xmlText=xmlText.replace("xmlns=\"http://www.idpf.org/2007/opf\"","xmlns:idpf=\"http://www.idpf.org/2007/opf\"")

        myparser = etree.XMLParser(remove_blank_text=True)
        tree   = etree.parse(StringIO(xmlText), parser=myparser)
        root=tree.getroot()

        # --- skeleton of the output document ---
        tei= etree.Element('TEI')
        teiHeader=etree.Element('teiHeader')
        text=etree.Element('text')
        back=etree.Element('back')
        body=etree.Element('body')
        front=etree.Element('front')
        fileDesc=etree.Element('fileDesc')

        # Metadata read once from the cleaned source.
        titleText=tree.find('.//title',root.nsmap).text
        authorText=tree.find('.//author',root.nsmap).text
        dateText=tree.find('.//date',root.nsmap).text

        titleStmt=etree.Element('titleStmt')
        title=etree.Element('title')
        title.text=titleText
        author=etree.Element('author',
                         attrib=OrderedDict([
                            ("key",""),
                            ("name",authorText),
                            ("from",dateText),
                            ("to",dateText)]))
        edition=etree.Element('edition', attrib={"n":""})
        editor=etree.Element('editor',attrib=OrderedDict([("name",""),("where","")]))
        titleStmt.append(title)
        titleStmt.append(author)
        titleStmt.append(edition)
        titleStmt.append(editor)

        publicationStmt=etree.Element('publicationStmt')
        date1=etree.Element('date', attrib={"when": dateText, "type": "created"})
        date2=etree.Element('date', attrib={"when": dateText, "type": "issued"})
        publicationStmt.append(date1)
        publicationStmt.append(date2)

        editionStmt=etree.Element('editionStmt')
        canon=""
        keywords=etree.Element('keywords')
        for a in tree.findall(".//subject",root.nsmap):
            # `.text` is None for an empty <subject>.
            if "canonique" in (a.text or ""):
                canon="canonique"
            else:
                canon="non-canonique"
                keywords.append(etree.Element("term"))
        # NOTE(review): `canon` is computed but never written; the attribute below
        # carries the literal "canon". Also "non-canonique" contains "canonique",
        # so the test above cannot tell them apart. Left unchanged -- confirm intent.
        profileDesc=etree.Element('profileDesc', attrib=OrderedDict([("type","genre"),("tag","canon")]))
        textClass=etree.Element('textClass')

        textClass.append(keywords)
        profileDesc.append(textClass)
        editionStmt.append(profileDesc)

        fileDesc.append(titleStmt)
        fileDesc.append(publicationStmt)
        fileDesc.append(editionStmt)

        # --- front matter ---
        titlePage=etree.Element('titlePage')
        docAuthor=etree.Element('docAuthor')
        docTitle=etree.Element('docTitle')
        titlePart1=etree.Element('titlePart',attrib={"main":title.text})
        titlePart2=etree.Element('titlePart',attrib={"sub":""})
        docTitle.append(titlePart1)
        docTitle.append(titlePart2)
        titlePage.append(docAuthor)
        titlePage.append(docTitle)

        divDed=etree.Element('div', attrib={"type":"dedication"})
        checkDed=False
        firstSalute=tree.find('.//salute',root.nsmap)
        if firstSalute is not None:
            salute=etree.Element('salute')
            salute.text=firstSalute.text
            divDed.append(salute)
            checkDed=True
        firstEpigraph=tree.find('.//epigraph',root.nsmap)
        if firstEpigraph is not None:
            epigraph=etree.Element('epigraph')
            epigraph.text=firstEpigraph.text
            divDed.append(epigraph)
            checkDed=True
        if checkDed:
            titlePage.append(divDed)

        # A copy of the first preface / foreword is placed on the title page.
        firstPreface=tree.find('.//div[@type=\'preface\']',root.nsmap)
        if firstPreface is not None:
            print("Déplacement de la la préface")
            titlePage.append(deepcopy(firstPreface))

        firstForeword=tree.find('.//div[@type=\'avertissement\']',root.nsmap)
        if firstForeword is not None:
            print("Déplacement de l'avertissement")
            titlePage.append(deepcopy(firstForeword))

        front.append(titlePage)

        # --- body: nest chapters under parts / books ---
        head=etree.Element('head')

        # Distinct names for the element lists: they used to rebind the word sets
        # listBook / listPart / listChap used for title classification above.
        bookElems=tree.findall(".//book",root.nsmap)
        partElems=tree.findall(".//part",root.nsmap)
        if len(partElems)>0:
            nbPart=0
            print("Il y a "+str(len(partElems))+" parties")
            stats.write("Il y a "+str(len(partElems))+" parties\n")
            for part in partElems:
                nbPart+=1
                nbChapInPart=0
                # Absorb the run of <chapter> siblings that directly follows the part.
                for chap in part.itersiblings(preceding=False):
                    if chap.tag == "chapter":
                        part.append(chap)
                    else:
                        break
                    nbChapInPart+=1
                if nbChapInPart>0:
                    print("Il y a "+str(nbChapInPart)+" chapitres dans la partie "+str(nbPart))
                    stats.write("Il y a "+str(nbChapInPart)+" chapitres dans la partie "+str(nbPart)+"\n")
                else:
                    print("Nombre de chapitres internes indéterminé (texte sale)")
                    stats.write("Nombre de chapitres internes indéterminé (texte sale)\n")
                body.append(part)
        # NOTE(review): parts have already been moved into <body> at this point, so
        # a book can no longer absorb them as siblings -- confirm the intended order.
        if len(bookElems)>0:
            print("Il y a "+str(len(bookElems))+" livres")
            stats.write("Il y a "+str(len(bookElems))+" livres\n")
            nbBook=0
            for book in bookElems:
                nbBook+=1
                nbChapInBook=0
                for chap in book.itersiblings(preceding=False):
                    if chap.tag == "chapter" or chap.tag == "part":
                        book.append(chap)
                        # Count only absorbed elements (the counter used to include
                        # the sibling that ends the run).
                        nbChapInBook+=1
                    else:
                        break
                if nbChapInBook>0:
                    print("Il y a "+str(nbChapInBook)+" chapitres dans le livre "+str(nbBook))
                    stats.write("Il y a "+str(nbChapInBook)+" chapitres dans le livre "+str(nbBook)+"\n")
                else:
                    print("Nombre de chapitres internes indéterminé (texte sale)")
                    stats.write("Nombre de chapitres internes indéterminé (texte sale)\n")
                body.append(book)
        else :
            chapElems=tree.findall(".//chapter",root.nsmap)
            for chap in chapElems:
                body.append(chap)
            if len(chapElems)>0:
                print("Il y a "+str(len(chapElems))+" chapitres")
                stats.write("Il y a "+str(len(chapElems))+" chapitres\n")
            for sect in tree.findall(".//UndefinedSection",root.nsmap):
                body.append(sect)

        # An empty <head/> is appended at the end of <body>.
        body.append(head)

        text.append(front)
        text.append(body)
        text.append(back)
        tei.append(teiHeader)
        tei.append(text)
        teiHeader.append(fileDesc)

        # --- give the temporary quotation tags their final names ---
        for quoteCit in tei.findall(".//quotecita", root.nsmap):
            quoteCit.tag="quote"
        for quoteVer in tei.findall(".//quoteverse",root.nsmap):
            # Every descendant is moved into a new <q>; <l> lines become <q> too.
            parag=etree.Element('q')
            for element in quoteVer.iterdescendants():
                if element.tag=="l":
                    element.tag="q"
                parag.append(element)
            quoteVer.tag="quote"
            quoteVer.append(parag)
            quoteVer.text=""

        for bad in tei.xpath('.//p/*[contains(.,"Gutenberg")]'):
            bad.getparent().remove(bad)
            print("XXXXXXXXXXXXXXXXXXXXXXXXX je rentre dans la condition Gutenberg XXXXXXXXXXXXXXXXXXXXXXXXXXX")

        # Structural tags used during processing all end up as <div>.
        for latestChap in tei.findall(".//chapter", root.nsmap):
            latestChap.tag="div"
        for latestBook in tei.findall(".//book", root.nsmap):
            latestBook.tag="div"
        for latestPart in tei.findall(".//part", root.nsmap):
            latestPart.tag="div"

        # --- serialisation and textual clean-up ---
        final=etree.tostring(tei, pretty_print=True,encoding = "unicode")
        final = re.sub(r'ns[0-9]+:', '', final)
        final= final.replace("&#10;","")
        final= final.replace("●","")
        final= final.replace("■","")
        final= final.replace("◗","")
        final= re.sub(r'\n<dfn>', '', final)
        final= re.sub(r'<dfn>', '', final)
        final= re.sub('</dfn>', '', final)

        # Whitespace-only paragraphs become <lb/>. fullmatch is required: match()
        # also accepted any paragraph that merely started with whitespace.
        pattern=re.compile(r"\s+")
        soupFinal = soup(final, "xml")
        for p in soupFinal.find_all('p'):
            if p.string and pattern.fullmatch(p.string):
                newtag = soupFinal.new_tag('lb')
                p.replace_with(newtag)

        # Well-formedness check, done before the output file is created so that a
        # failure does not leave an empty file behind.
        minidom.parseString(final)
        # Written to the directory the skip check looks at; UTF-8 matches the XML
        # declaration emitted by prettify().
        with open(finalPath, 'w', encoding='utf-8') as f:
            f.write(soupFinal.prettify())
print("Fin du nettoyage XML")


2014_Saussure-Horace-Bénédict-de_L'Ascension-du-Mont-Blanc.xml
Il y a 16 chapitres

2011_Chandernagor-Françoise_Les-enfants-d'Alexandrie.xml
Il y a 50 chapitres

2013_Stolz-Madame-de_Les-Poches-de-mon-oncle.xml
Il y a 6 chapitres

1966_San-Antonio_Faut-être-logique.xml
Il y a 1 chapitres

2014_Volodine-Antoine_Terminus-radieux.xml
+-+-+-+ Il y a des parties
+-+-+-+ Il y a des parties
+-+-+-+ Il y a des parties
+-+-+-+ Il y a des parties
Il y a 4 parties
Il y a 8 chapitres dans la partie 1
Il y a 7 chapitres dans la partie 2
Il y a 7 chapitres dans la partie 3
Il y a 27 chapitres dans la partie 4

1920_Toulet-Paul-Jean_La-jeune-fille-verte-roman.xml
Il y a 13 chapitres

1874_Achard-Amédée_Envers-et-contre-tous.xml
Il y a 33 chapitres

1996_Daeninckx-Didier_Nazis-dans-le-métro.xml
Il y a 22 chapitres

1919_Audoux-Marguerite_L'atelier-de-Marie-Claire.xml
attention, il y a un chapitre appelé Fin supprimé
Il y a 18 chapitres

2013_Scholl-Aurélien_Les-Gens-tarés.xml
+p+p+p+ Il y a une préface
Déplacement de la la préface
Il y a 53 chapitres

1898_Ivoi-Paul-d'_La-Capitaine-Nilia.xml

In [ ]: