In [1]:
import json
import os
import re
In [2]:
def open_file():
    """Return the full contents of the TLG ``AUTHTAB.DIR`` file as bytes.

    The file is looked up at ``~/cltk_data/originals/tlg/AUTHTAB.DIR`` (with
    ``~`` expanded to the current user's home directory) and is read in
    binary mode, so no decoding is applied.

    Returns:
        bytes: the raw file contents.
    """
    authtab_path = os.path.expanduser('~/cltk_data/originals/tlg/AUTHTAB.DIR')
    with open(authtab_path, 'rb') as authtab:
        contents = authtab.read()
    return contents
In [3]:
# Raw bytes of the author table, exactly as read from disk.
file_bytes = open_file()

# From Diogenes; useful?
# my $regexp = qr!$prefix(\w\w\w\d)\s+([\x01-\x7f]*[a-zA-Z][^\x83\xff]*)!;

# Keep the chunk that follows the first b'\x83g' marker (index 1 of the split);
# whatever precedes that marker is discarded.
c1 = re.compile(b'\x83g')
body = c1.split(file_bytes)[1]

# Entries inside the body are delimited by b'\xff'; filter(None, ...) drops
# the empty pieces that the split leaves behind.
c2 = re.compile(b'\xff')
id_authors = list(filter(None, c2.split(body)))
In [4]:
def make_id_author_pairs(raw_entries=None):
    """Yield ``(id, author)`` pairs of ``str`` parsed from raw author entries.

    Each entry is split at its first ASCII whitespace byte into an id and an
    author field; entries containing no whitespace are skipped. The author
    field is then cleaned up:

    * ``&1`` and ``&`` markers are removed;
    * ``[2`` / ``]2`` become plain ``[`` / ``]``;
    * the byte ``0x80`` becomes ``', '`` and the sequence ``0x83 'e'`` is removed;
    * three author names containing beta code are replaced by hand-written
      Greek equivalents;
    * after decoding, ``e+`` / ``i+`` become ``ë`` / ``ï``.

    Args:
        raw_entries: Iterable of ``bytes`` entries to parse. When omitted, the
            module-level ``id_authors`` list is used, which matches the
            behaviour of the original zero-argument call.

    Yields:
        tuple: ``(id, author)``, both ``str``.

    Raises:
        UnicodeDecodeError: if an id or a cleaned author field is not valid
            UTF-8.
    """
    if raw_entries is None:
        raw_entries = id_authors

    # Compile every pattern once, outside the loop. Raw literals keep regex
    # escapes such as \s out of Python's own escape processing (the non-raw
    # spellings trigger invalid-escape warnings on current Python versions).
    first_whitespace = re.compile(rb'\s')
    ampersand_codes = re.compile(rb'&1|&')
    open_bracket_code = re.compile(rb'\[2')
    close_bracket_code = re.compile(rb'\]2')
    # These two are real byte values, so they stay as ordinary bytes literals.
    comma_byte = re.compile(b'\x80')
    stray_code = re.compile(b'\x83e')

    # Transliterate beta code in author fields: it's way easier to do these
    # three manually. Note that the mapped values are already str, not bytes.
    greek_names = {
        b'Dialexeis ($1*DISSOI\\ LO/GOI)': 'Dialexeis (Δισσοὶ λόγοι)',
        b'Dionysius $1*METAQE/MENOS Phil.': 'Dionysius Μεταθέμενος Phil.',
        b'Lexicon $1AI(MWDEI=N': 'Lexicon αἱμωδεῖν',
    }

    for raw_entry in raw_entries:
        fields = first_whitespace.split(raw_entry, maxsplit=1)
        # Compare by value: `is 2` only worked through CPython's small-int cache.
        if len(fields) != 2:
            continue
        _id, author = fields

        # Clean up the author name.
        author = ampersand_codes.sub(b'', author)
        author = open_bracket_code.sub(b'[', author)
        author = close_bracket_code.sub(b']', author)

        # Clean up odd bytecodes; sub() is a no-op when nothing matches, so no
        # separate findall() check is needed.
        author = comma_byte.sub(b', ', author)
        author = stray_code.sub(b'', author)

        # Any other name (including unknown ones still holding `$1`) is kept
        # unchanged as bytes.
        author = greek_names.get(author, author)

        # Convert to str for the final result (id first, then author).
        _id = _id.decode('utf_8')
        if isinstance(author, bytes):
            author = author.decode('utf_8')
            # `+` after a vowel marks a diaeresis.
            author = author.replace('e+', 'ë').replace('i+', 'ï')

        yield (_id, author)
In [5]:
# Collect every (id, author) pair into one mapping; if an id repeats, the
# last pair yielded for it wins.
id_author_dict = dict(make_id_author_pairs())

# Write the mapping as pretty-printed JSON with keys sorted.
write_dir = os.path.expanduser('~/cltk/cltk/corpus/greek/tlg')
write_path = os.path.join(write_dir, 'id_author.json')
with open(write_path, 'w') as json_file:
    json.dump(
        id_author_dict,
        json_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )