Taking a closer look at the ways in which the texts are structured, we found (a) "paragraphs" to be a promising candidate for segmentation, provided that we do not understand "paragraph" in the typographical sense but as a section of the text that is introduced by a pilcrow sign ("¶"). As second and third criteria for segmentation, we also use (b) daggers and (c) two consecutive capital letters where no pilcrow sign is nearby. So this is what we try here...
After a revision of the results, we also add breaks at characteristic sentence openings and at numbering words (see the segmentation rules below).
First, we parse the resolved files that we find:
In [18]:
import os
import lxml
from lxml import etree

resolved_dir = "./data/processing/10000_resolved"
# we create a dictionary with our editions:
resolved = {os.path.basename(file).split(os.extsep)[0]:
                etree.parse(resolved_dir + "/" + file)
            for file in sorted(os.listdir(resolved_dir))}
# and a list of available editions for quick lookup:
editions = list(resolved.keys())

# For now, hard-code the languages of the editions:
language = {}
language['azp1549'] = "pt"
language['azp1552'] = "pt"
language['azp1556'] = "es"
language['azp1573'] = "la"

print("Parsed {} resolved files: {}".format(len(resolved), editions))
Next, we add a switch that lets us specify whether we want to overwrite any result files that already exist:
In [19]:
import ipywidgets as widgets
from ipywidgets import interact # for interactively en-/disabling overwrite
def ow_seg(overwrite_segmented):
    global overwrite_seg
    overwrite_seg = overwrite_segmented

# if segmented files already exist, let the user decide via a checkbox;
# otherwise, overwrite by default
if os.listdir('./data/processing/12000_segmented_paragraphs/'):
    overwrite_seg = False
    interact(ow_seg, overwrite_segmented=True)
else:
    overwrite_seg = True
In [20]:
print('Overwrite segmented files?: {}'.format(overwrite_seg))
Next, to gather some diagnostic information, we count milestone and div elements for all editions:
In [21]:
import re
nsmap = {"tei": "http://www.tei-c.org/ns/1.0"}
string_doc = {}
string_reverse_doc = {}
find_divs = etree.XPath("//tei:body/tei:div[@type = 'chapter'][not(@n = '0')]", namespaces=nsmap)
find_ps = etree.XPath("//tei:body/tei:div[not(@n = '0')]//tei:p", namespaces=nsmap)
find_ms = etree.XPath("//tei:body//tei:milestone", namespaces=nsmap)
find_body = etree.XPath("//tei:body", namespaces=nsmap)
# since python negative look*behind* assertions have to be fixed-length,
# we reverse the document and use negative look*ahead* assertions instead...
find_lone_daggers = re.compile(r'reggad#(?!.{0,100}¶)')   # daggers not preceded by a pilcrow within 100 characters
find_lone_ms = re.compile(r'derohcnanu#(?!.{0,100}¶)')    # unanchored milestones not preceded by a pilcrow within 100 characters
find_lone_caps = re.compile(r'[A-Z]{2}\b(?!.{0,100}¶)')   # two capital letters not preceded by a pilcrow within 100 characters
ct_divs = {}
ct_ps = {}
ct_pilcrows = {}
ct_ms = {}
ct_total_daggers = {}
ct_lone_daggers = {}
ct_lone_ms = {}
ct_lone_caps = {}
for ed in resolved:
    ct_divs[ed] = len(find_divs(resolved[ed]))
    ct_ps[ed] = len(find_ps(resolved[ed]))
    ct_ms[ed] = len(find_ms(resolved[ed]))
    string_doc[ed] = etree.tostring(find_body(resolved[ed])[0], encoding='utf-8', method='xml').decode('utf-8')
    string_reverse_doc[ed] = string_doc[ed][::-1]
    ct_pilcrows[ed] = string_doc[ed].count('¶')
    ct_total_daggers[ed] = string_doc[ed].count('#dagger')
    ct_lone_daggers[ed] = len(find_lone_daggers.findall(string_reverse_doc[ed]))
    ct_lone_ms[ed] = len(find_lone_ms.findall(string_reverse_doc[ed]))
    ct_lone_caps[ed] = len(find_lone_caps.findall(string_reverse_doc[ed]))

print("number of top-level divs[not(@n = '0')]: {}".format(ct_divs))
print("number of typographical paragraphs (<tei:p>): {}".format(ct_ps))
print("number of pilcrow signs: {}".format(ct_pilcrows))
print("number of milestones: {}".format(ct_ms))
print("number of total daggers: {}".format(ct_total_daggers))
print("number of standalone daggers: {}".format(ct_lone_daggers))
print("number of standalone unanchored milestones: {}".format(ct_lone_ms))
print("number of standalone capital bigrams: {}".format(ct_lone_caps))
After some experiments with XPath and lxml's iter() method (see the appendices on the milestone segmentation approach), we take a third approach to segmenting the texts: (a) We flatten the whole text and replace the breakpoints we have identified with key strings; (b) we split the text at these key strings. ([c] We save our results.)
Here are the rules we use for segmentation: (a) add breaks at the breakpoints identified above; (b) then, from these, remove breaks where they would be redundant.
To flatten the texts, we recursively extract the text, children, and tail text properties, and insert ++div_xy-- and ++break-- key strings where div breaks and breakpoints occur (a minimal toy sketch follows below).
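To make the flatten-and-split idea concrete before the real implementation, here is a minimal sketch on an invented toy string (the key strings match the ones used below; the sample text is made up):

import re

toy = "++div_1--First chapter text. ++div_2--Second chapter. ++break--Another segment."
for chunk in re.split(r'\+\+div_', toy)[1:]:
    n, text = chunk.split('--', 1)
    print(n, [s.strip() for s in re.split(r'\+\+break--', text)])
# 1 ['First chapter text.']
# 2 ['Second chapter.', 'Another segment.']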
In [22]:
beginnings = { "pt": ["Se", "Mas\s+se", "E\s+se", "O\s+q[uv]e", "Os\s+q[uv]e", "Diz", "E\s+a\s+reza", "Dissemos",
"Acrecenamos", "Acreceto[uv]se"],
"es": ["S[uv]mm?ario", "Preg[uv]ntas", "De\s+los\s+pecc?ados", "Diximos", "Anadiose", "Anadimos",
"Sig[uv]ese\s+tambien", "Acrecentose", "Allegase", "(Donde|De\s+lo\s+q[uv]al)\s+inferimos",
"De\s+donde\s+inferimos", "Desto\s+inferimos", "Desta\s+resol[uv]cion\s+inferimos",
"Pares?cenos", "Si", "Ante\s+de\s+los\s+q[uv]ales\s+a[uv]isamos",
"En\s+otro\s+gercero"],
"la": ["Dixi", "Seq[uv]it[uv]r", "Pro\s+f[uv]ndam[eẽ]n?to", "Ex\s+(his|q[uv]ib[uv]s|q[uv]o)\s+infert[uv]r",
"Ex\s+pr(ae|æ)dictis", "Ex\s+his\s+pr(ae|æ)missis", "Et\s+conseq[uv]enter", "Adijcimus", "Admoneo",
"Accedit", "(Ex\s+q[uv]o|[UV]nde)\s+infer(im[uv]s|t[uv]r)", "[UV]nde\s+seq[uv]it[uv]r", "Addo", "Ante\s+quor[uv]m",
"Videtur", "Prior\s+cas[uv]s\s+est", "Posterior\s+cas[uv]s\s+est",
"S[uv]per\s+alio\s+vero\s+tertio"]
}
numbers = ["primum", "secundum", "tertium", "quartum", "quintum", "sextum", "septimum", "octa[uv]um", "nonum", "decimum", "[uv]ndecimum",
"prima", "prime[iy]?r[ao]", "se[cg]und[ao]", "terti[ao]", "terce[iy]?r[ao]", "quart[ao]",
"quint[ao]", "sext[ao]", "septim[ao]", "octa[uv][ao]", "non[ao]", "decim[ao]",
"[uv]ndecim[ao]", "duodecim[ao]",
"[cijlvxCIJLVX]+"
]
numbers_caps = ["Primum", "Secundum", "Tertium", "Quartum", "Quintum", "Sextum", "Septimum", "Octa[uv]um", "Nonum", "Decimum", "[UV]ndecimum", "D[uv]odecimum",
"Prima", "Prime[iy]?r[ao]", "Se[cg]und[ao]", "Terti[ao]", "Terce[iy]?r[ao]", "Quart[ao]",
"Quint[ao]", "Sext[ao]", "Septim[ao]", "Octa[uv][ao]", "Non[ao]", "Decim[ao]",
"[UV]ndecim[ao]", "D[uv]odecim[ao]",
"[CIJLVX]+"
]
prefixes = ["Ho\.?\s+", "O\.?\s+", "El\.?\s+", "Lo\.?\s+", "A\.?\s+", "Ad\s+", "La\.?\s+", "Dela\s+",
"Decim[ao]", "Vigesim[ao]", "Trigesim[ao]"]
suffixes = ["mente", "decimo", " infertur"]
rex_all_num = [ [ num for num in numbers_caps ], # all numbers
[ num + suf for num in numbers_caps for suf in suffixes ], # all numbers plus all suffixes
[ pref + num for num in numbers for pref in prefixes ], # all prefixes plus all numbers
[ pref + num + suf for num in numbers for pref in prefixes for suf in suffixes ] # all prefixes plus all numbers plus all suffixes
]
num_rex = sum(rex_all_num, [])
def flatten(element: lxml.etree._Element):
    """Recursively flatten an element into a string, inserting temporary markers."""
    t = ""
    # dagger milestones
    if element.get("rendition") == "#dagger":
        t += "†"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # asterisk milestones (additions in the 1556 ed.) - create a temporary marker
    elif element.get("rendition") == "#asterisk":
        t += "*"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # unanchored milestones - create a temporary marker
    elif element.get("rendition") == "#unanchored":
        t += "‡"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # summaries lists
    elif element.get("type") == "summaries":
        t += "++break--"
        if element.text:
            t += element.text.replace("\n", " ")
        if len(element):
            t += " ".join(flatten(child) for child in element)
        if element.tail:
            t += element.tail.replace("\n", " ")
    # headings (except for summaries headings)
    elif etree.QName(element).localname == "head" and element.getparent().get("type") != "summaries":
        if element.text:
            t += element.text.replace("\n", " ")
        if len(element):
            t += " ".join(flatten(child) for child in element)
        t += "++break--"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # horizontal space followed by "Circa"
    elif etree.QName(element).localname == "space" and element.tail and element.tail.strip()[:5] == "Circa":
        t += "++break--"
        t += element.tail.replace("\n", " ")
    # paragraphs - wrap in temporary <p>...</p> markers
    elif etree.QName(element).localname == "p":
        t += "<p>"
        if element.text:
            t += element.text.replace("\n", " ")
        if len(element):
            t += " ".join(flatten(child) for child in element)
        if element.tail:
            t += element.tail.replace("\n", " ")
        t += "</p>"
    # everything else: just text, children, and tail
    else:
        if element.text:
            t += element.text.replace("\n", " ")
        if len(element):
            t += " ".join(flatten(child) for child in element)
        if element.tail:
            t += element.tail.replace("\n", " ")
    return t
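# A minimal illustration of flatten() on an invented toy element
# (hypothetical input, shown here only to document the marker logic):
#   toy = etree.fromstring('<p xmlns="http://www.tei-c.org/ns/1.0">A <milestone rendition="#dagger"/>b.</p>')
#   flatten(toy)  ->  '<p>A †b.</p>'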
xp_divs = etree.XPath("(//tei:body/tei:div[@type = 'chapter'][not(@n = '0')])", namespaces=nsmap)
divs = {}
flattened = {}
lera = {}
for ed in resolved:
    divs[ed] = xp_divs(resolved[ed])
    t = "".join("++div_" + str(div.get("n")) + "--" + flatten(div) for div in divs[ed])
    # Add breaks ...
    ttemp1 = re.sub(r'<p>', r'\n++break--<p>', t)    # ... where paragraphs begin
    ttemp2 = re.sub(r'¶', '++break--¶', ttemp1)      # ... where pilcrow signs are
    ttemp3 = re.sub(r'([:\.\?\]])\s+([A-Z])(?!([CIJLVX]+|.)?\.)(?![^†‡*]{0,80}[:\.\?\]][^a-z]*[A-Z])(?=.{0,80}[†‡*])',
                    r'\1 ++break-- \2', ttemp2)      # ... where sentences begin with punctuation, whitespace,
                                                     #     and a capital letter (not immediately followed by an
                                                     #     abbreviation period), and a milestone follows within
                                                     #     80 characters (that contain no punctuation character)
    ttemp4 = ttemp3
    for rex in beginnings[language[ed]]:             # ... at characteristic sentence openings
        ttemp4 = re.sub(r'([:\.\?\]])\s+(' + rex + r'\s+)', r'\1 ++break-- \2', ttemp4)
    ttemp5 = ttemp4
    for rex in num_rex:                              # ... at numbering words
        ttemp5 = re.sub(r'([:\.\?\]])\s+(' + rex + r'\.?\s+)', r'\1 ++break-- \2', ttemp5)
    ttemp6 = re.sub(r'\b([A-Z]{2}\s*[a-z])', r'++break-- \1', ttemp5)  # ... at two capital letters
    ttemp7 = ttemp6[::-1]                            # reverse the string
    ttemp8 = re.sub(r'([†‡*])(?!.{0,100}--kaerb)', r'\1--kaerb++', ttemp7)  # ... at daggers without sentence
                                                     #     boundaries, i.e. not covered above
    # Eliminate breaks ...
    ttemp9 = re.sub(r'--kaerb\+\+\s*(?=\.\s*(bil|pac|[a-z])\sni\s)', '', ttemp8)  # ... preceded by " in (lib|cap|[a-z])."
    ttemp10 = re.sub(r'--kaerb\+\+\s*(?=\.\s*[SP]\s+)', '', ttemp9)               # ... preceded by " S." or " P."
    ttemp11 = re.sub(r'--kaerb\+\+\s*(?=[.¶†‡&* ]+--kaerb\+\+)', '', ttemp10)     # ... that are redundant
    ttemp12 = re.sub(r'--kaerb\+\+\s*(?=--\d+_vid\+\+)', '', ttemp11)             # ... preceded by a "div break"
    ttemp13 = re.sub(r'--kaerb\+\+\s*(?=(\.[cijlvx]+|\.(o[LH]|A)|[^\.?\]]){1,100}(¶|>p<))',
                     '', ttemp12)                    # ... preceded within 100 chars by ¶ or <p>
    ttemp14 = re.sub(r'--kaerb\+\+\s*(?=.{0,40}(acriC\s*--kaerb\+\+))',
                     '', ttemp13)                    # ... preceded within 40 chars by "++break--Circa"
    ttemp15 = re.sub(r'--kaerb\+\+\s*(?=[†‡*]?\s*\.?[CIJLVXcijlvx]+\s*[†‡*]?\s*--kaerb\+\+)',
                     '', ttemp14)                    # ... preceded only by a roman numeral
    ttemp16 = ttemp15[::-1]                          # re-reverse, i.e. restore the original reading direction
    ttemp17 = re.sub(r'\+\+break--\s*(?=([A-Za-z0-9]+\.\s+)+\+\+(break|div_))',
                     '', ttemp16)                    # ... followed only by words with periods
    ttemp18 = re.sub(r'\+\+break--\s*(?=\+\+div_)', '', ttemp17)  # ... followed by a "div break"
    # Eliminate temporary markers
    ttemp19 = re.sub(r'‡', '', ttemp18)              # unanchored milestones
    ttemp20 = re.sub(r'</?p>', '', ttemp19)          # paragraph markers
    # Concatenate everything and do a final removal of redundant breaks.
    flattened[ed] = re.sub(r'\+\+break--\s*\+\+break--', '++break--', " ".join(ttemp20.strip().split()))
    lera[ed] = re.sub(r'\+\+break--', r'<milestone type="lera-segment"/>', flattened[ed])
    lera[ed] = re.sub(r'\+\+div_([0-9]+)--', r'</div><div type="chapter" n="\1">', lera[ed])
    # escape ampersands for XML; [6:] drops the leading "</div>"
    lera[ed] = '<root>' + re.sub(r'&', '&amp;', lera[ed])[6:] + '</div></root>'
Check whether the results make sense:
In [23]:
for ed in editions:
    print("number of divs/milestones in {}: {}/{}".format(ed,
                                                          str(lera[ed].count('<div')),
                                                          str(lera[ed].count('<milestone type="lera-segment"'))))
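As an additional smoke test (not part of the pipeline itself), the reassembled strings should now be well-formed XML; a minimal sketch:

for ed in editions:
    etree.fromstring(lera[ed].encode('utf-8'))  # raises XMLSyntaxError if malformed
print("All editions parse as well-formed XML.")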
Let's save this so that we can more easily check whether the break marks are in the right places...
In [24]:
import glob

if overwrite_seg:
    for ed in editions:
        with open('./data/processing/12000_segmented_paragraphs/' + ed + '.xml', 'w', encoding='utf-8') as txt_file:
            txt_file.write(lera[ed])
else:
    print("Files present, no overwriting requested.")

flattened_files = glob.glob('./data/processing/12000_segmented_paragraphs/*.xml')
print("Flattened files: {}".format(flattened_files))
Now we split our long string into actual segments (and we do this for all our editions).
In [25]:
import glob

# First, re-load the files (so that they can be manually tweaked in between)
fEd = []
flattened = {}
for filename in glob.glob("./data/processing/12000_segmented_paragraphs/*.xml"):
    e = os.path.basename(filename)[:-4]
    fEd.append(e)
    if e in editions:
        with open(filename, encoding='utf-8') as file:
            flattened[e] = file.read()
        print("File {} read.".format(filename))
for i in set(editions) ^ set(fEd):
    print("Check for problems with this edition: {}".format(i))
In [26]:
import re

segmented = {}
key_prb = {}
for ed in editions:
    segmented[ed] = {}
    key_prb[ed] = []
    body = flattened[ed][6:-7]    # strip the enclosing <root> ... </root>
    for div in re.split('<div', body):
        i = 0
        dlabel = div[div.find('n="') + 3:div.find('">')]
        content = div[div.find('">') + 2:div.find('</div>')]
        for seg in re.split(r'<milestone type="lera-segment"/>', content):
            # re.split consumes the milestone markers, so only whitespace normalization remains
            mscontent = " ".join(seg.strip().split())
            if len(mscontent) > 0:
                segmented[ed].update({dlabel.zfill(2) + '_' + str(i).zfill(3): mscontent})
                i += 1
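The keys combine a zero-padded chapter number and a zero-padded running counter within the chapter, e.g. 02_003. A quick hypothetical lookup (the key is only an example and may not exist in every edition):

print(segmented['azp1552'].get('02_000', 'no such segment'))  # first segment of chapter 2, if present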
Report how many segments we have found:
In [27]:
for ed in editions:
    print("number of segments in {}: {}".format(ed, str(len(segmented[ed]))))
Now we save our first intermediate results, the segmented editions:
In [28]:
import csv

if overwrite_seg:
    for ed in segmented:
        with open('./data/processing/12000_segmented_paragraphs/' + ed + '_seg.csv', 'w', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file, lineterminator="\n")
            for key, value in segmented[ed].items():
                writer.writerow([key, value])
else:
    print("Files present, no overwriting requested.")

segmented_files = glob.glob('./data/processing/12000_segmented_paragraphs/*.csv')
print("Segmented files: {}".format(segmented_files))
In [34]:
segmented['azp1552'].keys()
Out[34]:
In [35]:
segmented['azp1556'].keys()
Out[35]: