In [197]:
import sys
import os
import os.path
import fnmatch
from bs4 import BeautifulSoup as soup
from bs4 import Comment
from lxml.html.soupparser import fromstring
from lxml import etree
from lxml.etree import tostring
from collections import OrderedDict
import re
from io import StringIO, BytesIO
from copy import deepcopy
import xml.dom.minidom as minidom
In [198]:
def cleanAtts(soup, listAtt, name):
    """Strip every attribute from each `name` element whose @rend is in `listAtt`.

    Mutates `soup` in place; nothing is returned.
    """
    for rend_value in listAtt:
        matching = soup.find_all(name, {"rend": rend_value})
        for elem in matching:
            elem.attrs.clear()
In [199]:
def divRend(soup, listAtt, name):
    """Retag <div rend=...> elements.

    Every <div> whose @rend is in `listAtt` has its attributes cleared and
    its tag renamed to `name`.  Mutates `soup` in place.
    """
    for rend_value in listAtt:
        for elem in soup.find_all('div', {"rend": rend_value}):
            elem.attrs.clear()
            elem.name = name
In [200]:
def elemExtract(elems, target=None):
    """Remove every occurrence of each tag name in `elems` from a soup.

    :param elems: iterable of tag names to delete.
    :param target: soup to clean; defaults to the module-level ``xmlSoup``
        for backward compatibility with the original, which read that
        global implicitly.  Pass it explicitly to avoid the hidden
        dependency.
    """
    if target is None:
        target = xmlSoup
    for elem in elems:
        # Plain loop instead of a throwaway list comprehension: extract()
        # is called only for its side effect.
        for node in target.findAll(elem):
            node.extract()
In [201]:
inv_tags = ['p']  # NOTE(review): re-bound to ['p', 'hi'] in a later cell; `quoting` reads this global at call time.
def quoting(soup, listVerse, att):
    """Convert <div rend=...> elements into `att` elements.

    For each rend value in `listVerse`, every matching <div> has its
    `inv_tags` wrappers unwrapped, its attributes cleared and its tag
    renamed to `att`.  A diagnostic is printed for each rend value that
    actually matched.

    :returns: the (mutated) soup.
    """
    for verse in listVerse:
        # Bug fix: the flag was never reset between rend values, so after
        # the first match the diagnostic printed for every later value
        # even when nothing matched.  Reset it per iteration.
        found = False
        for a in soup.find_all('div', {"rend": verse}):
            for tag in inv_tags:
                for match in a.findAll(tag):
                    match.replaceWithChildren()
            a.attrs.clear()
            a.name = att
            found = True
        if found:
            print("+|+|+|+ Il y a quote avec le rend : "+verse)
    return soup
In [202]:
inv_tags = ['p', 'hi']
def dedication(soup):
    """Normalise dedication/epigraph containers.

    Each matching element is emptied and rebuilt as
    ``<div type="dedication"><epigraph>TEXT</epigraph></div>`` where TEXT
    is the element's text with blank-line runs collapsed and remaining
    newlines replaced by a separator.  Prints one diagnostic per selector
    that matched.

    :returns: the (mutated) soup.
    """

    def _to_dedication(a, sep):
        # One-line purpose: rebuild `a` as a dedication <div> wrapping an
        # <epigraph> that carries its flattened text.
        for tag in inv_tags:
            for match in a.findAll(tag):
                match.attrs.clear()
                match.replaceWithChildren()
        newTag = soup.new_tag("epigraph")
        collapsed = re.sub(r"\n{2,}", "\n", a.text)
        newTag.string = re.sub(r"\n", sep, collapsed)
        a.clear()
        a.name = "div"
        a["type"] = "dedication"
        a.append(newTag)

    # The three original copy-pasted loops, table-driven.  The separator
    # differs in the third case (" ," vs ", ") — preserved as in the
    # original; NOTE(review): presumably the first two were meant to be
    # ", " as well — confirm before unifying.
    cases = [
        ('quote', {"rend": "epigraphe"}, " ,", "+.+.+.+ Il y a dédicace"),
        ('div', {"n": "Epigraphe"}, " ,", "+.+.+.+ Il y a dédicace2"),
        ('div', {"n": "Dédicace"}, ", ", "+.+.+.+ Il y a dédicace3"),
    ]
    for tag_name, attrs, sep, message in cases:
        found = False
        for a in soup.find_all(tag_name, attrs):
            _to_dedication(a, sep)
            found = True
        if found:
            print(message)
    return soup
In [ ]:
# Rendition class names produced by the EPUB->XML conversion whose carrier
# elements get normalised (attributes cleared / retagged) downstream.
rends={'pindent','blocktextpblocktext','frontmatter',
'captionmatter','realspc',
'calibre','calibrecalibre','sepetoile','chapter',
'titlechapter','pcbr','calibretitpartl','titpartl',
'titlesection','part','chapn','schap','dev','pre',
'pagecopyright','subtitlechapter','pindentinverse','pc',
'titchapl','linei','niv','chapno','titchapltitcenter','identauteuridentcenterc',
'titfblkc','identtitidentcenterc','titlblkidenteditidentcenter','startlinepictos',
'titchapltitleft','titchapltitjustify','encdef','c','titpartltitleft','txtcourantjustif',
'divprefpre','chap','sl','divautreappen','blocktextpcblocktext','t','amanuensisautosmallcaps',
'subtitlefrontmatter','titlefrontmatter','illustypeimagetext','captionpc','captionpcbr',
'pblancblocktext','pbrblocktext','blocktext','pdblocktext','blocktextpbrblocktext',
'ov','bold','titlinetitcenterchapno','titlinetitcenter','pgmonospacedpgheader','pd'}
# rend values that mark salutations/dedications (converted via quoting()).
salutes={'dedicace','indentdedicaces'}
# rend values converted to <quotecita> (prose citations).
quotes={'cita','citation',
'poetrypoetryintfigureadvertisementfigureadvertisement','blockquote'}
# rend values converted to <quoteverse> (verse/poetry).
# NOTE(review): 'poetrycontainerpoetrystanza' is listed twice — harmless in
# a set literal, but one entry can be dropped.
verses={'poetrypoetryintcalibrestropint','citastroplg',
'poetrycontainerpoetrystanza','poemstanza',
'poetrycontainerpoetrystanza','poetrypoetryintstropstrop',
'poem','poetrystrop','poetrystropcentre'}
# Tag names removed wholesale from the document (see elemExtract()).
toRem={'meta','dc:contributor','dc:description',
'dc:language','dc:identifier','dc:rights','dc:subjects',
'graphic','?xml-model'}
# Section-title keywords used to classify divisions.
# NOTE(review): "Chapitre" appears twice — duplicate is a no-op in a set.
listChap={"chapitre","Chapitre","Chapitre","CHAPITRE",
"1","2","3","4","5","6","7","8","9","0"}
listBook={"livre","Livre","LIVRE"}
listPart={"Partie","partie","PARTIE"}
listPreface={"Préface","Preface","PREFACE","PRÉFACE","Préliminaire","PRELIMINAIRE","PRÉLIMINAIRE"}
# Section titles (substring match) that flag a section for deletion —
# boilerplate/front-matter from the e-book source.
sectToDel={"propos de cette édition numérique","propos de cette édition électronique","START: FULL LICENSE",
"Œuvres de ","Page de titre","Page de Titre","Table des Matières","Table des matières",
"TABLE DES MATIÈRES","TABLE","Page de Copyright","Page de copyright","Copyright","Achevé de numériser",
"Couverture","Du même auteur"}
In [ ]:
# Dublin Core namespace map (used by lxml find() calls further down).
ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
# NOTE(review): hardcoded absolute local path — make configurable.
stats = open('/home/odysseus/Bureau/Bureau/ANR/testsCode/stats.txt','w')
# NOTE(review): indentation was lost when this notebook was exported to
# text.  Everything down to the end of this cell logically belongs inside
# this loop (and, below, inside the `os.path.isfile(...) == False` guard);
# restore the indentation before executing.
for idx, fileTemp in enumerate(fnmatch.filter(os.listdir('/home/odysseus/Bureau/Bureau/ANR/testsCode/output/'), '*.xml')):
fileTemp=fileTemp.replace("/",":")
tei = open('/home/odysseus/Bureau/Bureau/ANR/testsCode/output/'+fileTemp).read()
print("\n"+fileTemp)
stats.write("\n"+fileTemp+"\n")
# Skip files already written to final/ by a previous run.
if os.path.isfile('/home/odysseus/Bureau/Bureau/ANR/testsCode/final/'+fileTemp)== False:
# html.parser lowercases tag names; XML comments are stripped just below.
xmlSoup = soup(tei, 'html.parser')
for element in xmlSoup(text=lambda text: isinstance(text, Comment)):
element.extract()
# Dublin Core cleanup: rename dc:* elements to plain names and drop
# their namespace/OPF attributes.
for a in xmlSoup.find_all("dc:creator"):
a.name="author"
del a["opf:file-as"]
del a["xmlns:dc"]
del a["xmlns:opf"]
del a["opf:role"]
for a in xmlSoup.find_all("dc:date"):
a.name="date"
# Keep only the year: first four characters of the date string.
cutDate=a.string
a.string=cutDate[:4]
del a["xmlns:dc"]
for a in xmlSoup.find_all("dc:subject"):
a.name="subject"
a.attrs.clear()
for a in xmlSoup.find_all("dc:title"):
a.name="title"
del a["xmlns:dc"]
for a in xmlSoup.find_all("dc:publisher"):
a.attrs.clear()
a.name="publisher"
# Italics: normalise every <hi>/<emph> to a bare rend="italic" <hi>.
for a in xmlSoup.find_all("hi"):
a.attrs.clear()
a["rend"]="italic"
for a in xmlSoup.find_all("emph"):
a.name="hi"
a.attrs.clear()
a["rend"]="italic"
# Strip all attributes from <p> elements.
for a in xmlSoup.find_all('p'):
a.attrs.clear()
a.name="p"
# Quotes and citations: retag rend-classified divs to intermediate
# <quotecita>/<quoteverse> tags (converted to TEI <quote> at the end).
quoting(xmlSoup,quotes,"quotecita")
quoting(xmlSoup,verses,"quoteverse")
# Clean <head> elements; drop the e-book boilerplate head entirely.
for a in xmlSoup.find_all('head'):
a.attrs.clear()
test=a.text
if "À propos de cette édition numérique" in test:
a.extract()
# Normalise the rend-classified <div>s and clear leftover rend attributes
# on other carriers.
divRend(xmlSoup,rends,"p")
cleanAtts(xmlSoup,rends,"div")
cleanAtts(xmlSoup,rends,"quote")
cleanAtts(xmlSoup,rends,"seg")
cleanAtts(xmlSoup,rends,"dfn")
dedication(xmlSoup)
quoting(xmlSoup,salutes,"salute")
# Letters become <q type="letter">.
for a in xmlSoup.find_all('div',{"rend" : "letter"}):
a.attrs.clear()
a.name="q"
a["type"]="letter"
# NOTE(review): these re-bind the module-level sets from the config cell
# (listChap here drops the duplicate "Chapitre") — redundant re-definitions
# inside the loop; consider removing.
listChap={"chapitre","Chapitre","CHAPITRE","1","2","3","4","5","6","7","8","9","0"}
listBook={"livre","Livre","LIVRE"}
listPart={"Partie","partie","PARTIE"}
# Classify <div type="section"> elements by their @n title into
# book / part / preface / avertissement / chapter.
# NOTE(review): `check` is assigned but never read afterwards.
check=False
# NOTE(review): in bs4, find_all(name, attrs, recursive) — the third
# positional dict {"n":True} lands in the `recursive` parameter (truthy,
# so recursive=True) and does NOT filter on @n; a section div lacking @n
# would raise KeyError on a["n"].  Probably intended
# find_all('div', {"type":"section","n":True}) — confirm.
for a in xmlSoup.find_all('div',{"type" : "section"},{"n":True}):
sectionTitle=a["n"]
# Replace non-breaking spaces so keyword splitting works.
sectionTitle=sectionTitle.replace(u'\xa0', ' ').encode('utf-8')
wordsInTitle=sectionTitle.decode().split(" ")
# Drop boilerplate sections whose title contains a blacklist phrase.
for wToDel in sectToDel:
if wToDel in sectionTitle.decode('utf-8'):
a.extract()
if str(sectionTitle,'utf-8')== "À propos" :
a.extract()
if str(sectionTitle,'utf-8')== "Fin" :
print("attention, il y a un chapitre appelé Fin supprimé")
stats.write("attention, il y a un chapitre appelé Fin supprimé"+"\n")
a.extract()
# Second pass over the surviving sections: retag by title keywords.
for a in xmlSoup.find_all('div',{"type" : "section"},{"n":True}):
sectionTitle=a["n"]
sectionTitle=sectionTitle.replace(u'\xa0', ' ').encode('utf-8')
wordsInTitle=sectionTitle.decode().split(" ")
if len(set(wordsInTitle).intersection(set(listBook)))>0:
a.attrs.clear()
a.name="book"
a["type"]="book"
a["title"]=sectionTitle.decode("utf-8")
print("+---+---+ Il y a des livres")
stats.write("+---+---+ Il y a des livres"+"\n")
elif len(set(wordsInTitle).intersection(set(listPart)))>0:
a.attrs.clear()
a.name="part"
a["type"]="part"
a["title"]=sectionTitle.decode("utf-8")
print("+-+-+-+ Il y a des parties")
stats.write("+---+---+ Il y a des parties"+"\n")
elif len(set(wordsInTitle).intersection(set(listPreface)))>0:
print("+p+p+p+ Il y a une préface")
a.attrs.clear()
a.name="div"
a["type"]="preface"
a["title"]=sectionTitle.decode("utf-8")
stats.write("+---+---+ Il y a une préface"+"\n")
elif "Avertissement" in wordsInTitle :
print("+p+p+p+ Il y a un avertissement")
stats.write("+---+---+ Il y a un avertissement"+"\n")
a.attrs.clear()
a.name="div"
a["type"]="avertissement"
a["title"]=sectionTitle.decode("utf-8")
else :
# Default: everything unclassified is a chapter.
a.attrs.clear()
a.name="chapter"
a["type"]="chapter"
a["title"]=sectionTitle.decode("utf-8")
# print("+++ Il y a des chapitres")
# Chapters whose <head> title mentions a book keyword are promoted to books.
for a in xmlSoup.find_all('div',{"type" : "chapter"}):
if a.findChild('head'):
a.name="chapter"
title=a.find('head').string
if (title==None):
a.extract()
else:
for book in listBook:
if book in title:
a.name="book"
a.attrs.clear()
a["title"]=title
# Footnote reattachment: turn <ref> call markers into inline <note>
# elements carrying the text of their (extracted) note body.
# NOTE(review): `invalid_tags` is used only in the first branch below;
# the other branches use the module-level `inv_tags` (['p','hi']) —
# confirm the mix is intentional.
invalid_tags = ['hi', 'ref','div']
nbNotes=0
for a in xmlSoup.find_all('ref',{"rend" : "renvoi"}):
nbNotes+=1
# Notes may live in a <div rend="notecnt"> ... the first such div is
# consumed per ref (document order pairing).
if xmlSoup.find('div', {"rend":"notecnt"}):
elemTarg=xmlSoup.find('div', {"rend":"notecnt"})
a.name="note"
a.attrs.clear()
for tag in invalid_tags:
for match in elemTarg.findAll(tag):
match.replaceWithChildren()
a.string=elemTarg.text
elemTarg.extract()
# ... or in <dfn> wrappers containing a back-reference marker.
elif xmlSoup.findAll('dfn'):
listDfns=xmlSoup.findAll('dfn')
for dfn in listDfns:
if len(dfn.findAll("ref",{"rend":"notenumrenvret"}))>0:
a.name="note"
a.attrs.clear()
for tag in inv_tags:
for match in dfn.findAll(tag):
match.replaceWithChildren()
# [1:] drops the leading note-number character.
a.string=dfn.text[1:]
dfn.extract()
break
# Variant: apnb refs paired with <div rend="ntb"> note bodies.
for a in xmlSoup.find_all('ref',{"rend" : "apnb"}):
nbNotes+=1
for elemTarg2 in xmlSoup.findAll('div', {"rend":"ntb"}):
# print("il rentre dans la 3e condition")
a.name="note"
a.attrs.clear()
for tag in inv_tags:
for match in elemTarg2.findAll(tag):
match.replaceWithChildren()
a.string=re.sub("\n"," ",elemTarg2.text)
# print(a.string)
elemTarg2.extract()
break
# Variant: pginternal refs paired with existing <note> bodies.
for a in xmlSoup.find_all('ref',{"rend" : "pginternal"}):
nbNotes+=1
for elemTarg2 in xmlSoup.findAll('note'):
# print("il rentre dans la 3e condition")
a.name="note"
a.attrs.clear()
for tag in inv_tags:
for match in elemTarg2.findAll(tag):
match.replaceWithChildren()
a.string=re.sub("\n"," ",elemTarg2.text)
# print(a.string)
elemTarg2.extract()
break
if nbNotes>0:
print("+#+#+#+ Il y a "+str(nbNotes)+" notes à rattacher")
stats.write("+#+#+#+ Il y a "+str(nbNotes)+" notes à rattacher\n")
# Wholesale removals of unwanted elements.
# NOTE(review): several lines are redundant — elemExtract(toRem) already
# removes meta/dc:*/graphic/?xml-model, and the illustypeimage line
# appears twice.
elemExtract(toRem)
[x.extract() for x in xmlSoup.findAll('meta')]
[x.extract() for x in xmlSoup.findAll('table')]
[x.extract() for x in xmlSoup.findAll('foreign')]
[x.extract() for x in xmlSoup.findAll('dc:contributor')]
[x.extract() for x in xmlSoup.findAll('dc:description')]
[x.extract() for x in xmlSoup.findAll('dc:publisher')]
[x.extract() for x in xmlSoup.findAll('dc:language')]
[x.extract() for x in xmlSoup.findAll('dc:identifier')]
[x.extract() for x in xmlSoup.findAll('dc:rights')]
[x.extract() for x in xmlSoup.findAll('dc:subject')]
[x.extract() for x in xmlSoup.findAll('opf:meta')]
[x.extract() for x in xmlSoup.findAll('graphic')]
[x.extract() for x in xmlSoup.findAll('?xml-model')]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"illustypeimage"})]
[x.extract() for x in xmlSoup.findAll('div',{'rend':"som"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"illustypeimage"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"realspc"})]
[x.extract() for x in xmlSoup.findAll('ref', {"xml:id":"tdm"})]
[x.extract() for x in xmlSoup.findAll('ref', {"xml:id":"ete_1_minotaure"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"pblanc"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"realspcc"})]
[x.extract() for x in xmlSoup.findAll('seg', {"rend":"realspcc"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"vertspc"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"notes"})]
[x.extract() for x in xmlSoup.findAll('div', {"rend":"defnotes"})]
# Re-parse the cleaned soup with lxml to build the TEI skeleton.
# Prefix the default namespaces so lxml keeps them out of tag names.
root=str(xmlSoup)
root=root.replace("xmlns=\"http://www.tei-c.org/ns/1.0\"","xmlns:tei=\"http://www.tei-c.org/ns/1.0\"")
root=root.replace("xmlns=\"http://www.idpf.org/2007/opf\"","xmlns:idpf=\"http://www.idpf.org/2007/opf\"")
myparser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(StringIO(root), parser=myparser)
root=tree.getroot()
tei= etree.Element('TEI')
# print(etree.tostring(tree, pretty_print=True))
# TEI scaffolding: teiHeader + text(front/body/back).
teiHeader=etree.Element('teiHeader')
text=etree.Element('text')
back=etree.Element('back')
body=etree.Element('body')
front=etree.Element('front')
fileDesc=etree.Element('fileDesc')
titleStmt=etree.Element('titleStmt')
title=etree.Element('title')
title.text=tree.find('.//title',root.nsmap).text
# Author with the publication year used as both "from" and "to".
author=etree.Element('author',
attrib=OrderedDict([ \
("key",""), \
("name",tree.find('.//author',root.nsmap).text),\
("from",tree.find('.//date',root.nsmap).text),\
("to",tree.find('.//date',root.nsmap).text)]))
attEdition = {"n":""}
edition=etree.Element('edition', attrib=attEdition)
editor=etree.Element('editor',attrib=OrderedDict([("name",""),("where","")]))
titleStmt.append(title)
titleStmt.append(author)
titleStmt.append(edition)
titleStmt.append(editor)
publicationStmt=etree.Element('publicationStmt')
myattributes2 = {"when": tree.find('.//date',root.nsmap).text,
"type": "issued"}
myattributes1 = {"when": tree.find('.//date',root.nsmap).text,
"type": "created"}
date1=etree.Element('date', attrib=myattributes1)
date2=etree.Element('date', attrib=myattributes2)
publicationStmt.append(date1)
publicationStmt.append(date2)
editionStmt=etree.Element('editionStmt')
# Derive "canonique"/"non-canonique" from the subject keywords.
# NOTE(review): only the last subject's verdict survives, and the <term>
# elements are appended empty (no text) — confirm intent.
canon=""
listSubjects=tree.findall(".//subject",root.nsmap)
keywords=etree.Element('keywords')
for a in listSubjects:
if "canonique" in a.text:
canon="canonique"
else:
canon="non-canonique"
term=etree.Element("term")
keywords.append(term)
# NOTE(review): attProfDesc (which uses the computed `canon`) is never
# used — profileDesc below hardcodes the literal string "canon".
attProfDesc = {"type":"","tag":canon}
profileDesc=etree.Element('profileDesc', attrib=OrderedDict([("type","genre"),("tag","canon")]))
textClass=etree.Element('textClass')
textClass.append(keywords)
profileDesc.append(textClass)
editionStmt.append(profileDesc)
fileDesc.append(titleStmt)
fileDesc.append(publicationStmt)
fileDesc.append(editionStmt)
# Title page scaffolding for the <front>.
titlePage=etree.Element('titlePage')
docAuthor=etree.Element('docAuthor')
docTitle=etree.Element('docTitle')
attTitPart1={"main":title.text}
attTitPart2={"sub":""}
titlePart1=etree.Element('titlePart',attrib=attTitPart1)
titlePart2=etree.Element('titlePart',attrib=attTitPart2)
docTitle.append(titlePart1)
docTitle.append(titlePart2)
titlePage.append(docAuthor)
titlePage.append(docTitle)
# Move dedication material (salute/epigraph) into the title page.
attDed={"type":"dedication"}
divDed=etree.Element('div', attrib=attDed)
checkDed=False
if tree.findall('.//salute',root.nsmap):
salute=etree.Element('salute')
salute.text=tree.find('.//salute',root.nsmap).text
divDed.append(salute)
checkDed=True
if tree.findall('.//epigraph',root.nsmap):
epigraph=etree.Element('epigraph')
epigraph.text=tree.find('.//epigraph',root.nsmap).text
divDed.append(epigraph)
checkDed=True
if (checkDed==True):
titlePage.append(divDed)
# Move preface/avertissement divisions into the title page.
# NOTE(review): in both branches `divPref` is created then immediately
# overwritten by the deepcopy — the etree.Element line is a dead store.
if tree.findall('.//div[@type=\'preface\']',root.nsmap):
attPref={"type":"preface"}
divPref=etree.Element('div', attrib=attPref)
print("Déplacement de la la préface")
divPref=deepcopy(tree.find('.//div[@type=\'preface\']',root.nsmap))
titlePage.append(divPref)
if tree.findall('.//div[@type=\'avertissement\']',root.nsmap):
attPref={"type":"preface"}
divPref=etree.Element('div', attrib=attPref)
print("Déplacement de l'avertissement")
divPref=deepcopy(tree.find('.//div[@type=\'avertissement\']',root.nsmap))
titlePage.append(divPref)
front.append(titlePage)
front.append(titlePage)
head=etree.Element('head')
listBooks=tree.findall(".//subject",root.nsmap)
book=etree.Element('div',
attrib=OrderedDict(
[("type","book"),("title",""),("level","1")]))
listBook=tree.findall(".//book",root.nsmap)
listPart=tree.findall(".//part",root.nsmap)
if len(listPart)>0:
nbPart=0
print("Il y a "+str(len(listPart))+" parties")
stats.write("Il y a "+str(len(listPart))+" parties\n")
for part in listPart:
nbPart+=1
nbChapInPart=0
for chap in part.itersiblings(preceding=False):
if chap.tag != "part" and chap.tag == "chapter":
part.append(chap)
else:
break
nbChapInPart+=1
if nbChapInPart>0:
print("Il y a "+str(nbChapInPart)+" chapitres dans la partie "+str(nbPart))
stats.write("Il y a "+str(nbChapInPart)+" chapitres dans la partie "+str(nbPart)+"\n")
else:
print("Nombre de chapitres internes indéterminé (texte sale)")
stats.write("Nombre de chapitres internes indéterminé (texte sale)\n")
body.append(part)
if len(listBook)>0:
print("Il y a "+str(len(listBook))+" livres")
stats.write("Il y a "+str(len(listBook))+" livres\n")
nbBook=0
for book in listBook:
nbBook+=1
nbChapInBook=0
for chap in book.itersiblings(preceding=False):
nbChapInBook+=1
if chap.tag != "book" and (chap.tag == "chapter" or chap.tag == "part"):
book.append(chap)
else:
break
if nbChapInBook>0:
print("Il y a "+str(nbChapInBook)+" chapitres dans la partie "+str(nbBook))
stats.write("Il y a "+str(nbChapInBook)+" chapitres dans la partie "+str(nbBook)+"\n")
else:
print("Nombre de chapitres internes indéterminé (texte sale)")
stats.write("Il y a "+str(nbChapInBook)+" chapitres dans la partie "+str(nbBook)+"\n")
body.append(book)
else :
listChap=tree.findall(".//chapter",root.nsmap)
for chap in listChap:
body.append(chap)
if len(listChap)>0:
print("Il y a "+str(len(listChap))+" chapitres")
stats.write("Il y a "+str(len(listChap))+" chapitres\n")
listSect=tree.findall(".//UndefinedSection",root.nsmap)
for sect in listSect:
body.append(sect)
body.append(head)
# print(etree.tostring(body, pretty_print=True,encoding = "unicode"))
# Final TEI assembly.
text.append(front)
text.append(body)
text.append(back)
tei.append(teiHeader)
tei.append(text)
teiHeader.append(fileDesc)
# print(etree.tostring(tei, pretty_print=True,encoding = "unicode"))
# Convert the intermediate quote tags to TEI <quote>.
for quoteCit in tei.findall(".//quotecita", root.nsmap):
# parag=etree.Element('p')
# quoteTest=etree.Element('quote')
# parag.text=etree.tostring(quoteCit, pretty_print=True)
# quoteTest.append(parag)
quoteCit.tag="quote"
# print(quoteCit.tag)
listVerse=tei.findall(".//quoteverse",root.nsmap)
for quoteVer in listVerse:
parag=etree.Element('q')
# NOTE(review): iterdescendants() returns a generator, which is always
# truthy — this `if` never filters anything.
if quoteVer.iterdescendants():
subChildren=quoteVer.iterdescendants()
for element in subChildren:
if element.tag=="l":
element.tag="q"
parag.append(element)
quoteVer.tag="quote"
quoteVer.append(parag)
quoteVer.text=""
# Drop Gutenberg boilerplate paragraphs.
for bad in tei.xpath('.//p/*[contains(.,"Gutenberg")]'):
bad.getparent().remove(bad)
print("XXXXXXXXXXXXXXXXXXXXXXXXX je rentre dans la condition Gutenberg XXXXXXXXXXXXXXXXXXXXXXXXXXX")
# Demote the intermediate chapter/book/part tags back to TEI <div>.
for latestChap in tei.findall(".//chapter", root.nsmap):
latestChap.tag="div"
for latestBook in tei.findall(".//book", root.nsmap):
latestBook.tag="div"
for latestPart in tei.findall(".//part", root.nsmap):
latestPart.tag="div"
# Serialize and post-process the text.
final=str(etree.tostring(tei, pretty_print=True,encoding = "unicode"))
# Strip lxml's autogenerated nsN: prefixes.
final = re.sub(r'ns[0-9]+:', '', final)
# NOTE(review): the search string below renders as a plain space in this
# export — if it really is U+0020 this deletes every space in the
# document; it was presumably a non-breaking space (U+00A0).  Confirm.
final= final.replace(" ","")
final= final.replace("●","")
final= final.replace("■","")
final= final.replace("◗","")
final= re.sub(r'\n<dfn>', '', final)
final= re.sub(r'<dfn>', '', final)
final= re.sub('</dfn>', '', final)
# Replace whitespace-only <p> elements with <lb/>.
pattern=re.compile("\s+")
soupFinal = soup(final, "xml")
for p in soupFinal.find_all('p'):
if p.string and pattern.match(p.string):
newtag = soupFinal.new_tag('lb')
p.replace_with(newtag)
f = open('./final/'+fileTemp, 'w')
# NOTE(review): `reparsed` is never used; this parseString call only
# serves as a well-formedness check (it raises on invalid XML).
reparsed = minidom.parseString(final)
f.write(soupFinal.prettify())
f.close()
# NOTE(review): with indentation restored, stats.close() and the final
# print belong after the file loop, at top level.
stats.close()
print("Fin du nettoyage XML")
In [ ]: