In [1]:
import os
import discoursegraphs as dg
In [2]:
# Load one Potsdam Commentary Corpus document as a merged document graph
# (all annotation layers combined).
ddg = dg.corpora.pcc.get_document('maz-6728')
In [3]:
# Show where discoursegraphs looks for its corpus data on this machine.
dg.DATA_ROOT_DIR
Out[3]:
In [4]:
# dg.corpora.pcc.get_files_by_layer('syntax')
# Load a TIGER syntax file directly from disk.
# NOTE(review): this reads a *different* document (maz-11766) than the
# one loaded above (maz-6728) — confirm that is intentional.
tdg = dg.read_tiger(os.path.join(
dg.DATA_ROOT_DIR,
'potsdam-commentary-corpus-2.0.0/syntax/maz-11766.xml'))
In [5]:
# dg.info(tdg)
In [6]:
# %load_ext gvmagic
In [7]:
# %dotstr dg.print_dot(tdg)
In [8]:
# All node IDs belonging to the document's 'tiger:syntax' layer.
# NOTE(review): syntax_nodes is not used by any later cell — candidate
# for removal.
syntax_nodes = set(dg.select_nodes_by_layer(ddg, 'tiger:syntax'))
In [9]:
# All nodes carrying a 'tiger:cat' attribute — presumably the
# non-terminal syntax constituents; verify against the TIGER layer spec.
cat_nodes = set(dg.select_nodes_by_attribute(ddg, 'tiger:cat'))
In [10]:
# token_map = dg.util.TokenMapper(ddg)
In [11]:
# token_map.id2index['rst:rst:9_0']
In [25]:
token_map = dg.util.TokenMapper(ddg)
cat_spans = {}
for cat_node in cat_nodes:
# print ddg.node[cat_node]['tiger:cat'], dg.get_span_offsets(ddg, cat_node)
first_token_index = token_map.id2index[dg.get_span(ddg, cat_node)[0]]
last_token_index = token_map.id2index[dg.get_span(ddg, cat_node)[-1]]
# print ddg.node[cat_node]['tiger:cat'], token_map.id2index[first_token_id], token_map.id2index[last_token_id]
cat_spans[(first_token_index, last_token_index)] = {
'node': cat_node, 'cat': ddg.node[cat_node]['tiger:cat']}
In [26]:
def get_syntax_spans(docgraph):
    """Map token-index spans to the syntax constituents covering them.

    Parameters
    ----------
    docgraph : dg.DiscourseDocumentGraph
        document graph containing a TIGER syntax annotation layer

    Returns
    -------
    syntax_spans : dict
        maps a (first_token_index, last_token_index) tuple to a dict
        with the constituent's node ID ('node') and its category
        ('cat', the 'tiger:cat' attribute). If several constituents
        cover exactly the same span, only the last one visited is kept.
    """
    token_map = dg.util.TokenMapper(docgraph)
    cat_nodes = set(dg.select_nodes_by_attribute(docgraph, 'tiger:cat'))
    syntax_spans = {}
    for cat_node in cat_nodes:
        # Fetch the span once (was fetched twice per node before).
        span_tokens = dg.get_span(docgraph, cat_node)
        if not span_tokens:
            # A constituent without a token yield would raise an
            # IndexError below; skip it instead.
            continue
        first_token_index = token_map.id2index[span_tokens[0]]
        last_token_index = token_map.id2index[span_tokens[-1]]
        syntax_spans[(first_token_index, last_token_index)] = {
            'node': cat_node, 'cat': docgraph.node[cat_node]['tiger:cat']}
    return syntax_spans
In [19]:
# cat_spans
In [34]:
def get_rst_span_map(docgraph):
    """Map token spans to the RST relations covering them.

    Parameters
    ----------
    docgraph : dg.DiscourseDocumentGraph
        document graph containing an RST annotation layer

    Returns
    -------
    dict
        maps each (tok_onset, tok_offset) tuple to a dict with the keys
        'rel_id', 'span_type' and 'rel_type'.
    """
    span_tuples = dg.readwrite.rst.rs3.get_rst_spans(docgraph)
    return {
        (tok_onset, tok_offset): {'rel_id': rel_id,
                                  'span_type': span_type,
                                  'rel_type': rel_type}
        for rel_id, span_type, rel_type, tok_onset, tok_offset
        in span_tuples}
In [35]:
for doc_id in dg.corpora.pcc.document_ids:
ddg = dg.corpora.pcc.get_document(doc_id)
syntax_spans = get_syntax_spans(ddg)
for syntax_span in syntax_spans:
if syntax_spans[syntax_span]['cat'] not in ('S', 'CS'):
if syntax_span in rst_spans:
print doc_id, syntax_span, syntax_spans[syntax_span]['cat']
In [42]:
from collections import Counter
cat_counter = Counter()
subord_counter = Counter()
for doc_id in dg.corpora.pcc.document_ids:
ddg = dg.corpora.pcc.get_document(doc_id)
syntax_spans = get_syntax_spans(ddg)
subord_nodes = get_subordinate_clauses(ddg)
for syntax_span in syntax_spans:
if syntax_span in rst_spans:
# print doc_id, syntax_span, syntax_spans[syntax_span]['cat']
cat = syntax_spans[syntax_span]['cat']
cat_counter[cat] += 1
if syntax_spans[syntax_span]['node'] in subord_nodes:
subord_counter[cat] += 1
In [44]:
# Distribution of constituent categories among syntax spans that
# exactly coincide with an RST span (aggregated over the corpus).
cat_counter.most_common()
Out[44]:
In [45]:
# Same distribution, restricted to subordinate-clause constituents.
subord_counter
Out[45]:
In [36]:
def get_subordinate_clauses(tiger_docgraph):
    """Extract subordinate clause constituents from a TIGER syntax graph.

    Parameters
    ----------
    tiger_docgraph : dg.DiscourseDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        node IDs of the non-token constituents attached to an 'S'
        (sentence) node via an edge labelled 'MO', 'RC' or 'SB'
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label',
        value=['MO', 'RC', 'SB'])
    subord_clause_nodes = []
    for source_id, target_id in candidate_edges:
        parent_cat = tiger_docgraph.node[source_id].get('tiger:cat')
        # Keep only constituents (not tokens) hanging off a sentence node.
        if parent_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
            subord_clause_nodes.append(target_id)
    return subord_clause_nodes
In [39]:
for node_id in get_subordinate_clauses(ddg):
print node_id, ddg.node[node_id]
In [ ]:
In [18]:
# rst_spans
In [24]:
for cat_span in cat_spans:
if cat_span in rst_spans:
print cat_span, cat_spans[cat_span][1], rst_spans[cat_span][1:]
In [ ]: