# In [1]:
from collections import defaultdict
import json
import os
import re
# In [2]:
def open_file():
    """Return the raw bytes of the ``LSTSCDCN.DIR`` file.

    The file is looked up at ``~/cltk_data/originals/tlg/LSTSCDCN.DIR``
    (with ``~`` expanded to the current user's home directory) and is read
    in binary mode, so no decoding is applied here.

    Returns:
        bytes: The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    index_path = os.path.expanduser('~/cltk_data/originals/tlg/LSTSCDCN.DIR')
    with open(index_path, 'rb') as handle:
        contents = handle.read()
    return contents
# In [3]:
# Load the raw contents of the index file into a module-level variable.
# NOTE(review): parse_lists() calls open_file() itself and assigns its own
# local ``file_bytes``; nothing else in the visible code reads this one, so
# it looks like it is only kept for interactive inspection -- confirm.
file_bytes = open_file()
# In [4]:
def parse_lists():
    """Group the records of the index file under four fixed headings.

    The bytes returned by ``open_file()`` are split on the ``0xFF`` byte,
    empty pieces are dropped, and the final two pieces are discarded. For
    every remaining record the first 8 bytes are treated as a file name stem
    and the rest as its description; both are decoded as UTF-8.

    Records are assigned to a heading purely by their position in the
    sequence. Positions 0, 7, 14, 21, 28 and 34 are decoded but not stored
    under any heading (presumably they are section headers -- verify against
    the file format).

    Returns:
        collections.defaultdict: Maps each heading string to a list of
        ``(file_name, description)`` tuples, where ``file_name`` is the
        decoded stem with ``'.BIN'`` appended. Only headings that received
        at least one record are present.

    Raises:
        UnicodeDecodeError: If any record is not valid UTF-8.
    """
    chronological = 'Lists pertaining to all works in Canon (chronological)'
    by_tlg_number = 'Lists pertaining to all works in Canon (by TLG number)'
    works_on_cd = 'Miscellaneous indices (works on CD)'
    works_in_canon = 'Miscellaneous indices (works in canon)'

    # Positions that fall on the boundaries between ranges and are never stored.
    skipped_positions = (0, 7, 14, 21, 28, 34)

    raw = open_file()
    records = [chunk for chunk in raw.split(b'\xff') if chunk]

    grouped = defaultdict(list)
    for position, record in enumerate(records[:-2]):
        # Decode every record, including skipped ones, so that malformed
        # bytes anywhere in the body still raise.
        description = record[8:].decode('utf_8')
        stem = record[:8].decode('utf_8')

        if position in skipped_positions:
            continue

        if position < 14:
            heading = chronological
        elif position < 28:
            heading = by_tlg_number
        elif position < 34:
            heading = works_on_cd
        else:
            heading = works_in_canon

        grouped[heading].append((stem + '.BIN', description))

    return grouped
index_lists = parse_lists()

# Turn each heading's list of (file_name, description) pairs into a plain
# dict keyed by file name; a later duplicate file name overwrites an earlier one.
final_dict = {
    heading: dict(pairs)
    for heading, pairs in index_lists.items()
}
# Destination of the generated JSON index.
write_dir = os.path.expanduser('~/cltk/cltk/corpus/greek/tlg')
write_path = os.path.join(write_dir, 'index_lists.json')

# Write the mapping with sorted keys and 4-space indentation.
with open(write_path, 'w') as json_out:
    json.dump(
        final_dict,
        json_out,
        indent=4,
        sort_keys=True,
        separators=(',', ': '),
    )