In [1]:
from collections import defaultdict
import json
import os
import re

In [2]:
def open_file():
    """Return the complete contents of ``LSTSCDCN.DIR`` as bytes.

    The file is looked up at ``~/cltk_data/originals/tlg/LSTSCDCN.DIR``
    (``~`` expanded for the current user) and opened in binary mode, so no
    text decoding happens here.
    """
    dir_file_path = os.path.expanduser('~/cltk_data/originals/tlg/LSTSCDCN.DIR')
    with open(dir_file_path, 'rb') as handle:
        contents = handle.read()
    return contents

In [3]:
# Raw bytes of LSTSCDCN.DIR kept at module level. Note that parse_lists()
# calls open_file() itself and does not read this global.
file_bytes = open_file()

In [4]:
def parse_lists():
    """Group the records of ``LSTSCDCN.DIR`` under section headings.

    The file bytes are split on ``0xFF``; empty fragments and the final two
    non-empty fragments are ignored. For every remaining record the first
    eight bytes are decoded as the file name (``.BIN`` is appended) and the
    rest as its description, both as UTF-8.

    Records are assigned to a heading by their position in the file. The
    bounds of each range are exclusive, so positions 0, 7, 14, 21, 28 and 34
    are decoded but not stored under any heading
    (presumably these are section-header records -- TODO confirm).

    Returns:
        A ``defaultdict(list)`` mapping heading -> list of
        ``(file_name, description)`` tuples in file order.
    """
    raw = open_file()

    # Records are delimited by 0xFF bytes; drop empty fragments.
    records = [chunk for chunk in raw.split(b'\xff') if chunk]

    # (exclusive lower bound, exclusive upper bound, heading).
    # NOTE(review): two headings are each used for two consecutive ranges;
    # this is kept exactly as found -- confirm whether that is intended.
    sections = (
        (0, 7, 'Lists pertaining to all works in Canon (chronological)'),
        (7, 14, 'Lists pertaining to all works in Canon (chronological)'),
        (14, 21, 'Lists pertaining to all works in Canon (by TLG number)'),
        (21, 28, 'Lists pertaining to all works in Canon (by TLG number)'),
        (28, 34, 'Miscellaneous indices (works on CD)'),
        (34, float('inf'), 'Miscellaneous indices (works in canon)'),
    )

    grouped = defaultdict(list)
    for position, record in enumerate(records[:-2]):
        # Decode every record, including those that end up unassigned.
        description = record[8:].decode('utf_8')
        name = record[:8].decode('utf_8')

        for lower, upper, heading in sections:
            if lower < position < upper:
                grouped[heading].append((name + '.BIN', description))
                break

    return grouped


# Parse the directory file into {heading: [(file_name, description), ...]}.
index_lists = parse_lists()

# Turn each heading's list of pairs into a {file_name: description} mapping;
# a later pair with the same file name overwrites an earlier one.
final_dict = {
    heading: {entry[0]: entry[1] for entry in entries}
    for heading, entries in index_lists.items()
}

# Write the result as pretty-printed JSON with sorted keys.
write_dir = os.path.expanduser('~/cltk/cltk/corpus/greek/tlg')
write_path = os.path.join(write_dir, 'index_lists.json')
with open(write_path, 'w') as file_open:
    json.dump(
        final_dict,
        file_open,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )