Cross-Lingual Text Alignment

Block seminar, degree programme "Digitale Methodik in den Geistes- und Kulturwissenschaften" (18.1.2020, 8.2.2020, 15.2.2020)

In [1]:
import os
import sys
import glob
import re
import locale
locale.setlocale(locale.LC_ALL, '')  # Use '' for auto, or force e.g. to 'en_US.UTF-8'
from collections import OrderedDict
from decimal import Decimal
from functools import partial
from itertools import chain
import ctypes
import numpy as np
import csv
import json
import lxml
from lxml import etree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline, FeatureUnion
import tabulate
from IPython.display import HTML, display
import nltk
import nltk.translate.gale_church
import bleualign.gale_church  # from Rico Sennrich's Bleualign: https://github.com/rsennrich/Bleualign
# import _align from gale-church  # from Li Ling Tan's https://github.com/alvations/gachalign

# Freeling
aux_dir = "\\auxiliary_files"
nb_dir = os.path.split(os.getcwd())[0] + "\\" + os.path.split(os.getcwd())[1] + aux_dir
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from auxiliary_files import pyfreeling
pyfreeling.util_init_locale("default")

import networkx as nx
import matplotlib.pyplot as plt

In [ ]:
import os

in_dir = "./data/constitutions/"

# we create a dictionary with our constitutions:
sources = {}

for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(os.path.join(in_dir, file), encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available constitutions for quick lookup:
constitutions = list(sources.keys())

print ("{} files read:".format(len(constitutions)))
print (constitutions)

In [ ]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in constitutions:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

# boundary = index of the first Burma sentence (assumes the Burma file is the
# last one in the sorted corpus, so its sentences form the final block)
boundary = len(sentences) - nos['1948_-_BU_-_Burma_-_constitution_of_burma']
print("Corpus has {} sentences.".format(len(sentences)))
print("1948_-_BU_-_Burma_-_constitution_of_burma has {}.\n".format(nos['1948_-_BU_-_Burma_-_constitution_of_burma']))

print("Its first 3 sentences are:\n{}".format([sentences['1948_-_BU_-_Burma_-_constitution_of_burma_0'],\
                                              sentences['1948_-_BU_-_Burma_-_constitution_of_burma_1'],\
                                              sentences['1948_-_BU_-_Burma_-_constitution_of_burma_2']]))

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', strip_accents='unicode', stop_words=["the", "of", "and"])
dfm = vectorizer.fit_transform(sentences.values())

print(dfm.shape)
print(type(dfm))
print(dfm.toarray())
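Each column of the document-feature matrix corresponds to one vocabulary token. A small sketch to map columns back to tokens (on scikit-learn >= 1.0 the accessor is get_feature_names_out() instead of get_feature_names()):

In [ ]:
vocab = vectorizer.get_feature_names()  # get_feature_names_out() on newer scikit-learn
print(vocab[:10])

# Tokens counted in the first sentence of the corpus:
row = dfm[0].toarray().ravel()
print([(vocab[j], int(row[j])) for j in row.nonzero()[0]])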

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

target = dfm[boundary:]    # the Burma sentences
sources = dfm[:boundary]   # note: this shadows the `sources` dict from above
print(target.shape)
print(sources.shape)

simils = cosine_similarity(target, sources)
print(simils.shape)
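As a sanity check, any entry of the similarity matrix can be recomputed by hand: cosine similarity is the dot product of two count vectors divided by the product of their norms. A sketch for the entry simils[0, 0]:

In [ ]:
from numpy.linalg import norm

a = dfm[boundary].toarray().ravel()  # first target (Burma) sentence
b = dfm[0].toarray().ravel()         # first source sentence
denom = norm(a) * norm(b)
print(a @ b / denom if denom else 0.0)  # should match simils[0, 0]
print(simils[0, 0])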

In [ ]:
import numpy as np

np.argmax(simils, axis=1)
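The bare argmax only yields column indices into the source rows. To make the result readable, the indices can be mapped back to sentence keys; a minimal sketch, assuming the sentences dictionary from above is still in scope (its insertion order matches the matrix rows):

In [ ]:
keys = list(sentences.keys())
source_keys, target_keys = keys[:boundary], keys[boundary:]
best = np.argmax(simils, axis=1)

# Show the best-matching source sentence for the first few Burma sentences:
for i, t_key in enumerate(target_keys[:5]):
    print("{} -> {} (cosine {:.2f})".format(t_key, source_keys[best[i]], simils[i, best[i]]))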

Manuales (cross-lingual alignment)


In [ ]:
import os

in_dir = "./data/manual/"

# we create a dictionary with our manuales:
sources = {}

for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(os.path.join(in_dir, file), encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available manuales for quick lookup:
manuales = list(sources.keys())

print ("{} files read:".format(len(manuales)))
print (manuales)

In [ ]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in manuales:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

print("Corpus has {} sentences.".format(len(sentences)))
print("azp1552_ch17 {}.\n".format(nos['azp1552_ch17']))

print("Its first 3 sentences are:\n{}".format([sentences['azp1552_ch17_2'],\
                                              sentences['azp1552_ch17_3'],\
                                              sentences['azp1552_ch17_4']]))
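Note that sent_tokenize defaults to the English Punkt model. For the Portuguese and Spanish of the manuales, a language-specific model may segment more plausibly; a sketch (NLTK ships Punkt models for both languages):

In [ ]:
# Re-tokenize the (Portuguese) 1552 edition with the Portuguese Punkt model:
t_pt = tokenize.sent_tokenize(sources['azp1552_ch17'], language='portuguese')
print(len(t_pt))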

LXML


In [2]:
import lxml
from lxml import etree
import glob
import re
import os

in_dir = "./data/manual/"

sources = glob.glob(in_dir + '*.xml')

parsed = {os.path.basename(file).split(os.extsep)[0]: etree.parse(file)
          for file in sorted(sources)}

manuales = list(parsed.keys())
print(manuales)

nsmap = {"tei": "http://www.tei-c.org/ns/1.0"}

def flatten(element):
    # Serialize a TEI element to plain text, replacing milestones with markers.
    t = ""
    # Dagger milestones
    if element.get("rendition") == "#dagger":
        t += "†"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # Asterisk milestones (additions in the 1556 ed.) - create temporary marker
    elif element.get("rendition") == "#asterisk":
        t += "*"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # Unanchored milestones - create temporary marker
    elif element.get("rendition") == "#unanchored":
        t += "‡"
        if element.tail:
            t += element.tail.replace("\n", " ")
    # All other elements (including <expan> and <corr>): own text first,
    # then all children recursively, then the tail text.
    else:
        if element.text:
            t += element.text.replace("\n", " ")
        if len(element):
            t += " ".join(flatten(child) for child in element)
        if element.tail:
            t += element.tail.replace("\n", " ")

    return t

xp_divs = etree.XPath("(//tei:body/tei:div[@type = 'chapter'][not(@n = '0')])", namespaces = nsmap)

divs = {}
text = {}

for ed in manuales:
    t1 = ""
    divs[ed] = xp_divs(parsed[ed])
    t1  = "".join("++div--" + re.sub('\s+', ' ', '<p>' + flatten(div)) for div in divs[ed])
    t2  = re.sub(r'¶', '++break--¶',                       t1)       # where pilcrow signs are
    t3  = re.sub(r'([:\.\?\]])\s+([A-Z])(?!([CIJLVX]+|.)?\.)(?![^†‡*]{0,80}[:\.\?\]][^a-z]*[A-Z])(?=.{0,80}[†‡*])',
                     r'\1 ++break-- \2',                   t2)       # sentences beginning
                                                                     # with punctuation, whitespace, and a
                                                                     # capital letter (not immediately followed by
                                                                     # an abbreviation period)
                                                                     # and a milestone follows within 80 characters
                                                                     # (that do not contain a punctuation character)
    t4  = re.sub(r'\b([A-Z]{2}\s*[a-z])', r'++break-- \1', t3)       # two capital letters
    t5  = t4[::-1]                                                   # reverse the string
    t6  = re.sub(r'([†‡*])(?!.{0,100}--kaerb)', r'\1--kaerb++', t5)  # daggers without sentence boundaries, i.e. not covered above
    t7  = t6[::-1]                                                   # reverse the string
    t8  = re.sub(r'‡', '',                                 t7)       # Eliminate temporary markers: unanchored milestones

    # Concat everything and do a final removal of redundant breaks.
    t9 = re.sub(r'\+\+break--\s*\+\+break--', '++break--', " ".join(t8.strip().split()))
    
    t10 = re.sub(r'\+\+break--', r'<milestone type="lera-segment"/>', t9)
    t11 = re.sub(r'\+\+div--', r'</div><div type="chapter">', t10)
    text[ed] = '<root>' + re.sub(r'&', '&amp;', t11)[6:] + '</div></root>'  # [6:] drops the leading '</div>' left over from the first '++div--'


print("text['azp1552_ch17'] is:\n{}...".format(text['azp1552_ch17'][:400]))


['azp1552_ch17', 'azp1556_ch17', 'azp1573_ch17']
text['azp1552_ch17'] is:
<root><div type="chapter"><p> <milestone type="lera-segment"/>¶ Do ſeptimo mandamento. N ão furtaras. Capit. xvi j̈. <milestone type="lera-segment"/> PEra fundamento † das preguntas de ſte mãdam ẽto mandamento di ʒemos. Ho pri meyro que ha hi furto m ẽtal, ⁊ fur to real. Ho m ẽtal he võtade vontade de co meter ho real. Eho real he ſeg ũdo segundo Paulo l. 1. ff. đ fur. .§. 1. In ſtit. de obliga...

In [8]:
sentences = {}
nos = {}
for ed in manuales:
    sentences[ed] = {}
    segments = text[ed].split('<milestone type="lera-segment"/>')
    nos[ed] = len(segments)
    for i, s in enumerate(segments):
        sentences[ed][ed + '_' + str(i)] = s.strip()

print("Corpus has {} sentences.".format(len(sentences)))
print("azp1552_ch17 has {}.\n".format(nos['azp1552_ch17']))

print("Its first 5 sentences are:\n{}".format([sentences['azp1552_ch17']['azp1552_ch17_0'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_1'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_2'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_3'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_4']]))


Corpus has 3 editions.
azp1552_ch17 has 387 segments.

Its first 5 segments are:
['<root><div type="chapter"><p>', '¶ Do ſeptimo mandamento. N ão furtaras. Capit. xvi j̈.', 'PEra fundamento † das preguntas de ſte mãdam ẽto mandamento di ʒemos. Ho pri meyro que ha hi furto m ẽtal, ⁊ fur to real. Ho m ẽtal he võtade vontade de co meter ho real. Eho real he ſeg ũdo segundo Paulo l. 1. ff. đ fur. .§. 1. In ſtit. de obligat. qu æ ex de li. na ſc. contrata ç ã, ou tratam ẽto engano ſa do alheo cõtra contra võtade vontade do ſe ñor, pera auer a ꝓpriedade, ou po ſ ſi ſ ſam, ou ho v ſo della. Di ſ ſemos (c õtrata ç ã) porq̃ porque ſem ella n ão ha furto real in. d. author="bragagnolo" timestamp="20190408T191343+0200" comment="check" : ainda q̃ que ho ha metal. Di ſ ſemos (do alheo) por q̃ ho tratam ẽto do ſeu, em q̃nto quanto ho he, ou cõ com re ʒ ão cree q̃ que he ſeu, n ão he furto. l. Inter o ẽs .§. Recte. ff. de furt. Acrec ẽtamos ( cõtra contra a võtade vontade do ſe ñor) porq̃ porque ſendo cõ com ſeu c õ ſ ẽ tim ẽto, n ão he furto in d. §. Re cte.. Di ſ ſemos (engano ſo) porq̃ porque ſe ſe fa ʒ por ʒ õbar, ou polo fa ʒer mais e ſperto: t ã pouco he furto. Di ſ ſemos ( ꝑa para auer a ꝓpriedade ou po ſ ſi ſ ſam) porq̃ porque aba ſta q̃rer querer auer alg ũa cou ſa \uebd1 ſtas ꝑa para q̃ que ſeja furto, ſeg ũdo segundo todos.', '† Ho. ij. q̃ que por e ſte mãdam ẽto mandamento, como ẽ em outra ꝑte di ſ ſemos in additio. rep. e Q ñ de c õ ſecr. d. 1. n. 231., nã não ſom ẽte ſe defende o q̃ que ſecretam ẽte ſe toma ao ꝓ ximo cõtra contra ſuav õtade ( q̃ que ꝓpriam ẽte propriamente ſe chama fur to) mas ainda tudo q̃nto quanto ſe toma mal, ⁊mal ſe tẽ tem: ⁊ todo ho d ãno q̃ que inju ſtam ẽte ſe daa: ou porfor ça ou por leys inju ſtas, ou por outras v ſurpa ç ões il licitas c. penale. 14. q. 5. ⁊ tãb ẽ tambem toda võtade vontade deliberada \uebd1 de tomar, reter, d ãnar, ⁊ v ſurpar illicitam ẽte cõtra contra a v õta \uebd1 de ſeu dono, porq̃ porque como acima in c. li. n. 8., ⁊ em outra ꝑte in dicta ad diti. n. 233. di ſ ſemos, os pecados da võtade vontade, palaura, ⁊ obra de hũa huma me ſma q̃lidade ſ ã: ainda q̃ que os da soo v õ tade n ão obrig ã a re ſtitui ç ã, como os da obra ⁊ palaura.', '† Ho. iij. que apouquidade ⁊ indelibera ç ã e ſcu ſ ã de mortal, a ſ ſi ne ſta, como em toda outra mate ria, ſegundo acima ho di ſ ſemos in. d. c. 11. n. 4. : polo qual quẽ quem furta hũa huma ma ça ã, ainda que ſeja com animo de furtar, n ão pecca mais de venialmente, ſe n ão te ue enren ç ã de furtar cou ſa notauel, nẽ nem de dar d ã no notauel, ſe podera: doutra maneira ſi, ſeg ũdo segundo S. Tho. 2. S æc. q. 66. art. 6. Ant. 2. ꝑ. t. 1. c. 15. §. 1. et Syl. in ro ſa aurea. ca ſu. 38. porque ni ſto n ão tam ſomente ſe tem re ſpeyto ao que ſe toma, mas aa enten ç ão ⁊ v õ tade do que furta: ſegundo S. Hieronymo in c. fin. 14. q. 4. , ao menos quanto ao foro da con ſciencia, como em outra parte ho di ſ ſemos in repet. d. c. fin. . E ſcu ſao porem ainda de venial a ignorancia prouauel, de n ão ſaber q̃ que a cou ſa era alhea, ⁊ a ſua grande nece ſ ſidade, a juy ʒo de bõ bom var ão c. Si quis ꝓ pter nece ſ ſitatem de furt, vbi Pan. &amp; alii : ⁊ tambẽ tambem quãdo quando cree cõ com cau ſa prouauel, que ho ſe ñor da cou ſa ho auera por b ẽ, l. inter o ẽs .§. Recte. ff de furt.']

Save the segments as CSV files (one key,text row per segment).


In [9]:
import csv

for ed in manuales:
    with open('./data/manual/' + ed + '_seg.csv', 'w', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, lineterminator="\n")
        for key, value in sentences[ed].items():
            writer.writerow([key, value])
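With the segments saved, a possible next step (and the reason gale_church is imported at the top) is length-based alignment across editions. A minimal sketch using NLTK's Gale-Church implementation on character lengths, assuming the per-edition sentences dictionary from above is still in scope:

In [ ]:
import nltk.translate.gale_church as gale_church

# Align the 1552 and 1556 segments by character length (sketch):
src_lens = [len(s) for s in sentences['azp1552_ch17'].values()]
tgt_lens = [len(s) for s in sentences['azp1556_ch17'].values()]
pairs = gale_church.align_blocks(src_lens, tgt_lens)
print(pairs[:10])  # list of (source_index, target_index) tuples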