In [1]:
import os
import discoursegraphs as dg
# Location of one MMAX2 coreference file from a local copy of the
# Potsdam Commentary Corpus; '~' is expanded to the current user's home directory.
MMAX_TESTFILE = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/coreference/maz-1423.mmax')
# Parse the MMAX2 file with discoursegraphs; `mdg` is the document that the
# cells below pass to the bracket-generating functions.
mdg = dg.read_mmax2(MMAX_TESTFILE)
In [2]:
import itertools
from collections import defaultdict
from discoursegraphs.readwrite.mmax2 import spanstring2tokens
In [3]:
def gen_bracket_mappings(docgraph):
    """Build the lookup tables needed to print a document with brackets.

    Parameters
    ----------
    docgraph : document graph
        Graph whose pointing chains are read via ``dg.get_pointing_chains``
        and whose markable nodes carry a ``<namespace>:span`` attribute.

    Returns
    -------
    (opening, closing, markable2chain, token2markable) : tuple
        opening : defaultdict(list)
            first token ID of a span -> markables whose span starts there
        closing : defaultdict(list)
            last token ID of a span -> markables whose span ends there
        markable2chain : dict
            markable ID -> ID of the first markable of its chain
        token2markable : defaultdict(list)
            token ID -> all markables whose span contains that token

    Markables are appended to the lists in natural sort order of their IDs.
    """
    chains = dg.get_pointing_chains(docgraph, layer=None)

    all_markables = list(itertools.chain(*chains))
    all_markables.sort(key=dg.util.natural_sort_key)

    # every member of a chain is labelled with the chain's first markable
    markable2chain = {}
    for members in chains:
        head = members[0]
        markable2chain.update(dict.fromkeys(members, head))

    opening = defaultdict(list)
    closing = defaultdict(list)
    token2markable = defaultdict(list)
    for markable_id in all_markables:
        span_string = docgraph.node[markable_id][docgraph.ns+':span']
        tokens = spanstring2tokens(docgraph, span_string)
        first_token, last_token = tokens[0], tokens[-1]
        opening[first_token].append(markable_id)
        closing[last_token].append(markable_id)
        for token_id in tokens:
            token2markable[token_id].append(markable_id)

    return opening, closing, markable2chain, token2markable
In [10]:
def gen_closing_string(closing_dict, markable2chain, token_id, stack):
    """Return the closing brackets for all markables that end at a token.

    Each closing bracket has the form ``]_<chain id>``, where the chain ID
    is looked up in ``markable2chain``.

    The markables to close are exactly those listed in
    ``closing_dict[token_id]``. They are removed from ``stack`` (which is
    modified in place) and emitted innermost-first, i.e. in the order in
    which they are found when walking the stack from its top. Merely
    popping the top of the stack is not sufficient: markables that open on
    the same token are pushed in ID order rather than nesting order, and
    markables may overlap, so the top of the stack is not necessarily a
    markable that ends here.

    Parameters
    ----------
    closing_dict : dict
        token ID -> list of markable IDs whose span ends at that token
    markable2chain : dict
        markable ID -> chain ID used as the bracket label
    token_id : hashable
        the token for which the closing brackets are generated
    stack : list
        IDs of the currently open markables (most recently opened last);
        the closed markables are removed from it

    Returns
    -------
    unicode string of concatenated closing brackets (empty if no markable
    ends at ``token_id``)
    """
    # copy, so that neither the mapping nor its lists are mutated;
    # .get() avoids inserting a key when a defaultdict is passed
    pending = list(closing_dict.get(token_id, ()))
    closed = []

    # walk from the top of the stack (innermost markable) downwards;
    # deleting at `index` is safe because we only move to lower indices
    for index in range(len(stack) - 1, -1, -1):
        if not pending:
            break
        markable_id = stack[index]
        if markable_id in pending:
            pending.remove(markable_id)
            del stack[index]
            closed.append(markable_id)

    # markables that end here but were never pushed by the caller are
    # still reported, after the ones that were found on the stack
    closed.extend(pending)

    return u''.join(u']_{}'.format(markable2chain[closing_id])
                    for closing_id in closed)
def gen_bracketed_output(docgraph):
    """Render the tokens of a document with brackets around its markables.

    A ``[`` is put in front of a token for every markable that starts
    there, and the string produced by ``gen_closing_string`` is put behind
    a token at which one or more markables end. Every token (with its
    brackets) is followed by a single space.

    Parameters
    ----------
    docgraph : document graph
        must provide ``tokens`` (iterable of token IDs) and
        ``get_token(token_id)``; it is also passed on to
        ``gen_bracket_mappings``

    Returns
    -------
    unicode string containing the bracketed text
    """
    opening, closing, markable2chain, _token2markable = gen_bracket_mappings(docgraph)

    open_markables = []  # markables that were opened but not yet closed
    chunks = []
    for token_id in docgraph.tokens:
        word = docgraph.get_token(token_id)
        starts_here = token_id in opening
        ends_here = token_id in closing

        if starts_here:
            begun = opening[token_id]
            open_markables.extend(begun)
            prefix = u'[' * len(begun)
        if ends_here:
            # must happen after the markables starting here were pushed,
            # since a token can be both first and last element of a markable
            suffix = gen_closing_string(closing, markable2chain, token_id, open_markables)

        if starts_here and ends_here:
            chunk = u'{0}{1}{2} '.format(prefix, word, suffix)
        elif starts_here:
            chunk = u'{0}{1} '.format(prefix, word)
        elif ends_here:
            chunk = u'{0}{1} '.format(word, suffix)
        else:
            chunk = u'{} '.format(word)
        chunks.append(chunk)

    return u''.join(chunks)
In [11]:
# Show the bracketed rendering of the test document. The parenthesised
# single-argument form prints the same text on Python 2 and is also valid
# on Python 3, where the bare `print` statement is a SyntaxError.
print(gen_bracketed_output(mdg))
In [8]:
# Compute the four lookup tables for the test document so they can be
# inspected individually in the following cells.
opening, closing, markable2chain, token2markable = gen_bracket_mappings(mdg)
In [9]:
# Display the mapping from each markable ID to the ID of the first markable of its chain.
markable2chain
Out[9]:
In [ ]: