In [1]:
import json
import os
import re

In [2]:
def open_file(path=None):
    """Read the TLG author table file and return its raw bytes.

    Args:
        path: Optional location of the file to read. A leading ``~`` is
            expanded to the user's home directory. When omitted, the
            default ``~/cltk_data/originals/tlg/AUTHTAB.DIR`` is used, so
            existing zero-argument calls behave as before.

    Returns:
        bytes: The complete, undecoded file content (the file is opened in
        binary mode).

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    if path is None:
        path = '~/cltk_data/originals/tlg/AUTHTAB.DIR'
    p = os.path.expanduser(path)
    with open(p, 'rb') as fo:
        return fo.read()

In [3]:
# Raw, undecoded content of the author table.
file_bytes = open_file()

# From Diogenes; useful?
# my $regexp = qr!$prefix(\w\w\w\d)\s+([\x01-\x7f]*[a-zA-Z][^\x83\xff]*)!;

# `body` is the chunk that follows the first b'\x83g' marker (and stops at
# the next such marker, if there is one). Everything before the first
# marker is discarded. Raises IndexError if the marker never occurs.
c1 = re.compile(b'\x83g')
body = c1.split(file_bytes, maxsplit=2)[1]

# Individual entries inside `body` are delimited by b'\xff'; empty pieces
# produced by the split are dropped.
c2 = re.compile(b'\xff')
id_authors = list(filter(None, c2.split(body)))

In [4]:
def make_id_author_pairs(raw_entries=None):
    """Yield ``(id, author)`` string pairs parsed from raw author-table entries.

    Each entry is split on its first ASCII whitespace byte into an id and an
    author field; entries that contain no whitespace are skipped. The author
    field is then cleaned, in this order:

    1. ``&1`` and ``&`` codes are removed.
    2. ``[2`` and ``]2`` are replaced by plain ``[`` and ``]``.
    3. Byte ``0x80`` is replaced by ``', '``.
    4. The byte pair ``0x83`` + ``e`` is removed.
    5. Three known author names containing ``$1`` beta code are replaced by
       hand-written Greek transliterations.
    6. The result is decoded as UTF-8 and ``e+`` / ``i+`` become ``ë`` / ``ï``.

    Whitespace inside the author name is *not* normalized.

    Args:
        raw_entries: Optional iterable of ``bytes`` entries to parse. When
            omitted, the module-level ``id_authors`` list is used, so
            existing zero-argument calls behave as before.

    Yields:
        tuple[str, str]: The decoded id and the cleaned author name.

    Raises:
        UnicodeDecodeError: If an id or a cleaned author name is not valid
            UTF-8.
    """
    if raw_entries is None:
        # Looked up lazily (on first iteration) so the global only has to
        # exist by the time the generator is actually consumed.
        raw_entries = id_authors

    # Compile every pattern once, outside the loop. Raw bytes literals are
    # used wherever the pattern contains a regex backslash escape, so Python
    # does not see an invalid string escape sequence.
    first_whitespace = re.compile(rb'\s')
    ampersand_codes = re.compile(rb'&1|&')
    open_bracket_code = re.compile(rb'\[2')
    close_bracket_code = re.compile(rb'\]2')
    byte_0x80 = re.compile(b'\x80')
    byte_0x83_e = re.compile(b'\x83e')

    # It's way easier to transliterate these three beta-code names manually.
    # Keys are matched against the fully cleaned author bytes; note the
    # double space in the first key (whitespace is not normalized).
    beta_code_authors = {
        b'Dialexeis  ($1*DISSOI\\ LO/GOI)': 'Dialexeis (Δισσοὶ λόγοι)',
        b'Dionysius $1*METAQE/MENOS Phil.': 'Dionysius Μεταθέμενος Phil.',
        b'Lexicon $1AI(MWDEI=N': 'Lexicon αἱμωδεῖν',
    }

    for id_author_raw in raw_entries:
        id_author_split = first_whitespace.split(id_author_raw, maxsplit=1)
        if len(id_author_split) != 2:
            # No whitespace at all: there is no author field to extract.
            continue
        _id, author = id_author_split

        # Clean up markup codes in the author name.
        author = ampersand_codes.sub(b'', author)
        author = open_bracket_code.sub(b'[', author)
        author = close_bracket_code.sub(b']', author)

        # Clean up odd bytecodes; this has to happen before decoding because
        # these bytes are not valid UTF-8 on their own.
        author = byte_0x80.sub(b', ', author)
        author = byte_0x83_e.sub(b'', author)

        # A hit here turns `author` from bytes into str; any other name
        # (including unknown ones still containing `$1`) passes through.
        author = beta_code_authors.get(author, author)

        # Convert to str for the final steps.
        _id = _id.decode('utf_8')
        if isinstance(author, bytes):
            author = author.decode('utf_8')

        # `+` after e/i marks a diaeresis in the source data.
        author = author.replace('e+', 'ë')
        author = author.replace('i+', 'ï')

        yield (_id, author)

In [5]:
# Collect every (id, author) pair into one mapping; if an id occurs more
# than once, the last author seen for it wins.
id_author_dict = dict(make_id_author_pairs())

# Destination of the generated JSON file.
write_dir = os.path.expanduser('~/cltk/cltk/corpus/greek/tlg')
write_path = os.path.join(write_dir, 'id_author.json')

# Write the mapping with sorted keys and 4-space indentation.
with open(write_path, 'w') as file_open:
    json.dump(
        id_author_dict,
        file_open,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )