We show the embedding of nodes annotated as sentences, clauses, phrases, subphrases and words. We write them out (eventually) in a format that can be read by TQUERY, so that Rens Bod and Andreas van Cranenburgh can do interesting things with it.
We walk through all anchor positions of the primary data and follow the node events at each position. LAF-Fabric tries hard to generate node events in an order that respects the factual embedding of nodes.
However, some nodes have gaps, some nodes are linked to regions of zero length, and some nodes have identical regions. Moreover, the way our current LAF resource of the Hebrew Bible is coded gives all higher-level nodes, such as phrases, clauses and sentences, a gap at every occurrence of white space.
LAF-Fabric succeeds in overcoming all those problems.
In another notebook we take a different approach.
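To see what "node events" means concretely, here is a minimal, self-contained sketch (with made-up events and labels, not LAF-Fabric output) of how a stream of events becomes a bracketed string. The kind codes follow the convention used in the cells below: 0 = start, 1 = resume after a gap, 2 = suspend at a gap, 3 = end.

# toy sketch, not part of the notebook run: made-up node events
toy_events = [
    ('S', 0), ('P', 0), ('w1', 0), ('w1', 3), ('P', 2),  # P suspends: a gap
    ('w2', 0), ('w2', 3),                                # w2 falls in the gap
    ('P', 1), ('w3', 0), ('w3', 3), ('P', 3), ('S', 3),  # P resumes, all close
]
tree = ''
for (label, kind) in toy_events:
    if kind == 0:    # node starts
        tree += '({} '.format(label)
    elif kind == 1:  # node resumes after a gap
        tree += '«{}« '.format(label)
    elif kind == 2:  # node suspends at a gap
        tree += '»{}» '.format(label)
    elif kind == 3:  # node ends
        tree += ')'
print(tree)  # (S (P (w1 )»P» (w2 )«P« (w3 )))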
In [1]:
%load_ext autoreload
import sys
import collections
from laf.fabric import LafFabric
processor = LafFabric(verbose='DETAIL')
In [3]:
%autoreload 2
API = processor.load('etcbc4', '--', 'trees',
    {
        "xmlids": {"node": False, "edge": False},
        "features": ("otype monads g_cons_utf8 sp book chapter verse", ""),
        'primary': True,
    })
In [7]:
F = API['F']                # feature access: F.<feature>.v(node)
NE = API['NE']              # iterator over node events at each anchor position
msg = API['msg']            # timed progress/log messages
outfile = API['outfile']    # open an output file in the task directory
my_file = API['my_file']    # full path of a file in the task directory

# object types in order of increasing embedding level, with the
# abbreviations used for them in the tree output; everything after
# '_split_' is sectional bookkeeping and does not end up in the trees
relevant_nodes = [
    ("word", ''),
    ("subphrase", 'p'),
    ("phrase", 'P'),
    ("clause", 'C'),
    ("sentence", 'S'),
    ("_split_", None),
    ("verse", None),
    ("chapter", None),
    ("book", None),
]

# abbreviations for the part-of-speech values of the sp feature
pos_table = {
    'adjv': 'aj',
    'advb': 'av',
    'art': 'dt',
    'conj': 'cj',
    'intj': 'ij',
    'inrg': 'ir',
    'nega': 'ng',
    'subs': 'n',
    'nmpr': 'n-pr',
    'prep': 'pp',
    'prps': 'pr-ps',
    'prde': 'pr-dem',
    'prin': 'pr-int',
    'verb': 'vb',
}

select_node = collections.defaultdict(lambda: None)
abbrev_node = collections.defaultdict(lambda: None)
for (i, (otype, abb)) in enumerate(relevant_nodes):
    select_node[otype] = i
    abbrev_node[otype] = abb if abb is not None else otype
split_n = select_node['_split_']
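The two defaultdicts rank the object types and abbreviate them; looking up a type that is not listed yields None. A quick hypothetical check (not a cell in the original notebook) of what they contain, following directly from the definitions above:

# hypothetical sanity check of the tables built above
assert select_node['word'] == 0 and select_node['sentence'] == 4
assert select_node['word'] < split_n < select_node['verse']
assert abbrev_node['phrase'] == 'P'      # explicit abbreviation
assert abbrev_node['verse'] == 'verse'   # falls back to the otype itself
assert select_node['paragraph'] is None  # unknown types map to None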
In [8]:
trees = outfile("trees.txt")
anomalies = outfile("anomalies.txt")

# keep the events of the last three sentences, so that an anomaly
# can be dumped together with the sentences before and after it
recent_sentences = collections.deque([], 3)

def process_saved():
    # dump the sentence before the anomaly, the anomalous material
    # itself, and the sentence after it to the anomalies file
    for (i, (events, book, chapter, verse, verse_label, level)) in enumerate(recent_sentences):
        if i == 0:
            anomalies.write("BEFORE the anomaly {}\n".format(verse_label))
        elif i == 1:
            anomalies.write("the anomaly ITSELF {}\n".format(verse_label))
        elif i == 2:
            anomalies.write("AFTER the anomaly {}\n".format(verse_label))
        for (anchor, node, kind) in events:
            otype = F.otype.v(node)
            if kind == 3:    # node ends
                if select_node[otype] > split_n:
                    continue
                level -= 1
                anomalies.write('{:>7}-{}{})\n'.format(anchor, "." * level, abbrev_node[otype]))
            elif kind == 2:  # node suspends at a gap
                if select_node[otype] > split_n:
                    continue
                level -= 1
                anomalies.write('{:>7}-{}{}»\n'.format(anchor, "." * level, abbrev_node[otype]))
            elif kind == 1:  # node resumes after a gap
                if select_node[otype] > split_n:
                    continue
                anomalies.write('{:>7}-{}«{}\n'.format(anchor, "." * level, abbrev_node[otype]))
                level += 1
            elif kind == 0:  # node starts
                if otype == 'book':
                    book = F.book.v(node)
                elif otype == 'chapter':
                    chapter = F.chapter.v(node)
                elif otype == 'verse':
                    verse = F.verse.v(node)
                    verse_label = '{} {}:{}'.format(book, chapter, verse)
                    anomalies.write("\n{}\n".format(verse_label))
                    msg(verse_label)
                elif otype == 'word':
                    pos = pos_table[F.sp.v(node)]
                    text = F.g_cons_utf8.v(node)
                    monads = F.monads.v(node)
                    anomalies.write('{:>7}-{}({} "{}" ={}=\n'.format(anchor, "." * level, pos, text, monads))
                    level += 1
                else:
                    anomalies.write('{:>7}-{}({}\n'.format(anchor, "." * level, abbrev_node[otype]))
                    level += 1
    anomalies.write("\nEND of the anomaly in {}:\n".format(verse_label))

book = None
chapter = None
verse = None
verse_label = None
tree = ''
n_warnings = 0
level = 0
warning = False
saved_events = ([], book, chapter, verse, verse_label, level)  # we save the events of the current sentence, in case there is an anomaly

for (anchor, events) in NE(key=lambda n: select_node[F.otype.v(n)], simplify=lambda n: select_node[F.otype.v(n)] < split_n):
    for (node, kind) in events:
        saved_events[0].append((anchor, node, kind))
        otype = F.otype.v(node)
        if kind == 3:    # node ends: close a bracket
            level -= 1
            if select_node[otype] > split_n:
                continue
            tree += ')'
            if otype == 'sentence':
                trees.write(tree + "\n")
                tree = ""
                recent_sentences.append(saved_events)
                if warning:
                    process_saved()
                    warning = False
                saved_events = ([], book, chapter, verse, verse_label, level)
        elif kind == 2:  # node suspends: mark the start of a gap
            level -= 1
            if select_node[otype] > split_n:
                continue
            tree += '»{}»'.format(abbrev_node[otype])
            if otype == 'sentence':
                trees.write(tree + "\n")
                tree = ""
                recent_sentences.append(saved_events)
                if warning:
                    process_saved()
                    warning = False
                saved_events = ([], book, chapter, verse, verse_label, level)
        elif kind == 1:  # node resumes after a gap
            if select_node[otype] > split_n:
                continue
            if otype == 'sentence':
                if tree != '':
                    # material between two sentences: an anomaly
                    msg("WARNING: material between two sentences in {}: [{}]".format(verse_label, tree))
                    n_warnings += 1
                    trees.write("{:<15} *** {} ***\n".format(verse_label, tree))
                    tree = ''
                    recent_sentences.append(saved_events)
                    if warning:
                        process_saved()
                        warning = False
                    saved_events = ([], book, chapter, verse, verse_label, level)
                    warning = True
                tree += '{:<15} «S« '.format(verse_label)
            else:
                tree += '«{}« '.format(abbrev_node[otype])
            level += 1
        elif kind == 0:  # node starts: open a bracket
            if otype == 'book':
                book = F.book.v(node)
                msg(book)
            elif otype == 'chapter':
                chapter = F.chapter.v(node)
            elif otype == 'verse':
                verse = F.verse.v(node)
                verse_label = '{} {}:{}'.format(book, chapter, verse)
            elif otype == 'sentence':
                if tree != '':
                    # material between two sentences: an anomaly
                    msg("WARNING: material between two sentences in {}: [{}]".format(verse_label, tree))
                    n_warnings += 1
                    trees.write("{:<15} *** {} ***\n".format(verse_label, tree))
                    tree = ''
                    recent_sentences.append(saved_events)
                    if warning:
                        process_saved()
                        warning = False
                    saved_events = ([], book, chapter, verse, verse_label, level)
                    warning = True
                tree += '{:<15} (S '.format(verse_label)
            elif otype == 'word':
                pos = pos_table[F.sp.v(node)]
                text = F.g_cons_utf8.v(node)
                tree += '({} "{}"'.format(pos, text)
            else:
                tree += '({} '.format(abbrev_node[otype])
            level += 1

msg("There were {} warnings".format(n_warnings))
In [9]:
API['close']()
In [10]:
!head -n 25 {my_file('trees.txt')}