Torsten Hiltmann, Collation of texts with important variations in spelling using a dictionary of regulated spellings for normalisation.
When collating texts with important variations in spelling (e.g. in Middle French), this small script enables you to normalise your texts for the collation with the help of a dictionary of regulated spellings.
Torsten Hiltmann, Ronald Haentjes Dekker
In [5]:
import csv
import re
from collatex import *
# Create the normalisation dictionary from a two-column, comma-separated CSV
# file (here: ../nTexte/Dictionnary_neu.csv). The first column ('Original')
# holds the strings as found in the text, the second column ('Normalised') the
# strings you want to replace them with. No whitespace behind the strings.
# Because the field names are passed explicitly, the first line of the file is
# read as data, not as a header.
Normfrench = {}
# newline='' is what the csv module asks for when a file object is handed to a
# reader. The explicit encoding keeps accented characters intact regardless of
# the platform default; 'utf-8-sig' also swallows a leading byte-order mark.
# NOTE(review): assumes the CSV is stored as UTF-8 - adjust if yours is not.
with open('../nTexte/Dictionnary_neu.csv', newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'Normalised'], dialect='excel')
    for row in reader:
        # A line with a single column yields None for 'Normalised'; skip it so
        # that no spelling is ever mapped to None.
        if row['Normalised'] is None:
            continue
        Normfrench[row['Original']] = row['Normalised']
# Read in the witnesses from your file system (here: ../nTexte/).
from collatex.core_classes import WordPunctuationTokenizer
tokenizer = WordPunctuationTokenizer()
# Each file is opened in a `with` block so the handle is closed right after
# reading, and decoded explicitly instead of with the platform default.
# NOTE(review): assumes the witnesses are stored as UTF-8 - adjust if not.
with open("../nTexte/BNF25186.txt", encoding="utf-8-sig") as witness_file:
    witness_BNF25186 = witness_file.read()
with open("../nTexte/BNF1968.txt", encoding="utf-8-sig") as witness_file:
    witness_BNF1968 = witness_file.read()
with open("../nTexte/Bresslau1994.txt", encoding="utf-8-sig") as witness_file:
    witness_Bresslau1994 = witness_file.read()
# Tokenize a witness and normalise every token by replacing spellings found as
# keys in the dictionary with the corresponding values.
def tokennormalizer(witness, *, dictionary=None, tokenize=None):
    """Split *witness* into tokens and attach a normalised form to each one.

    Args:
        witness: Full text of one witness.
        dictionary: Mapping from original spelling to normalised spelling.
            Defaults to the module-level ``Normfrench``.
        tokenize: Callable that turns the text into an iterable of token
            strings. Defaults to ``tokenizer.tokenize`` of the module-level
            tokenizer.

    Returns:
        A list with one dict per token: 't' is the token string exactly as the
        tokenizer produced it, 'n' is its dictionary replacement, or the token
        without trailing whitespace when the dictionary has no entry for it.
    """
    # The module-level objects are resolved at call time so existing
    # single-argument calls keep working unchanged.
    if dictionary is None:
        dictionary = Normfrench
    if tokenize is None:
        tokenize = tokenizer.tokenize

    tokens = []
    for token_string in tokenize(witness):
        # Dictionary keys carry no trailing whitespace, so strip it from the
        # token before the lookup.
        lookup_key = re.sub(r'\s+$', "", token_string)
        normalised = dictionary.get(lookup_key)
        # Treat a missing entry and an entry without a value alike: fall back
        # to the stripped token itself.
        if normalised is None:
            normalised = lookup_key
        tokens.append({'t': token_string, 'n': normalised})
    return tokens
# Collate: tokenize and normalise each witness, then wrap the token lists in
# the structure handed to collate(): one dict per witness with an "id" and its
# "tokens".
tokens_a = tokennormalizer(witness_BNF25186)
tokens_b = tokennormalizer(witness_BNF1968)
tokens_c = tokennormalizer(witness_Bresslau1994)
witness_a = {"id": "BNF25186", "tokens": tokens_a}
witness_b = {"id": "BNF1968", "tokens": tokens_b}
witness_c = {"id": "Bresslau1994", "tokens": tokens_c}
# NOTE(review): `input` shadows the builtin of the same name; the name is kept
# here because other cells may refer to it.
input = {"witnesses": [witness_a, witness_b, witness_c]}
table = collate(input, output='tei', segmentation=True)
# Save the output in a TEI/XML file. The encoding is given explicitly so that
# non-ASCII characters are not written in a platform-dependent encoding.
with open("output.xml", "w", encoding="utf-8") as text_file:
    text_file.write(table)