// Find constituents p of category "S" that are reached through an edge
// labelled MO, SB or RC, together with every node c below p that carries a
// `tiger:token` property and is the target of a LINKS_TO "spans" edge from a
// node that has an `rst:text` property. Rows are ordered by c's `tiger:id`.
//
// Property keys containing a colon must be backtick-quoted; unquoted, the
// part after the colon is not read as part of the property name.
MATCH ()-[r1]->(p)-[*]->(c)<-[:LINKS_TO {layers: ["rst", "rst:token"], edge_type: "spans"}]-(rst)
WHERE p.`tiger:cat` = "S"
  AND r1.label IN ["MO", "SB", "RC"]
  AND EXISTS(c.`tiger:token`)
  AND EXISTS(rst.`rst:text`)
RETURN p, c, rst
ORDER BY c.`tiger:id`;
In [1]:
import discoursegraphs as dg
from discoursegraphs.corpora import pcc
In [2]:
# Look up the corpus files belonging to document 'maz-17673'
# (the cell's output is not preserved in this export).
pcc.get_files_by_document_id('maz-17673')
Out[2]:
In [3]:
# Read the TIGER syntax file of that document into a document graph.
# NOTE(review): the absolute path points into one user's virtualenv;
# presumably it is one of the paths returned by the lookup above -- confirm.
tdg = dg.read_tiger('/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/syntax/maz-17673.xml')
In [4]:
# Read the RS3 (RST) file of the same document into a second graph.
rdg = dg.read_rs3('/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/discoursegraphs-0.1.2-py2.7.egg/data/potsdam-commentary-corpus-2.0.0/rst/maz-17673.rs3')
In [5]:
# dg.info(tdg)
In [6]:
# Inspect the attributes stored on node 's999_506' of the TIGER graph.
tdg.node['s999_506']
Out[6]:
In [7]:
def get_subordinate_clauses(tiger_docgraph):
    """
    Return the node IDs treated as subordinate clause constituents.

    An edge qualifies when its ``tiger:label`` is ``MO``, ``RC`` or ``SB``,
    its source node has ``tiger:cat == 'S'`` and its target node is not a
    token. The target IDs of all qualifying edges are returned, in the
    order in which ``dg.select_edges_by_attribute`` yields the edges.

    NOTE(review): the category test is applied to the *source* of the
    edge, so the returned targets are not themselves required to be ``S``
    nodes -- confirm that this is the intended definition.

    Parameters
    ----------
    tiger_docgraph : document graph
        document graph of a TIGER syntax tree

    Returns
    -------
    list
        target node IDs of the qualifying edges
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label',
        value=['MO', 'RC', 'SB'])

    def leaves_sentence_node(source_id):
        # the edge must start at a node of category 'S'
        return tiger_docgraph.node[source_id].get('tiger:cat') == 'S'

    return [
        target_id
        for source_id, target_id in candidate_edges
        if leaves_sentence_node(source_id)
        and not dg.istoken(tiger_docgraph, target_id)
    ]
In [8]:
# Merge the RST graph into the TIGER graph; from here on ``tdg`` is queried
# for both the syntax nodes and the EDUs.
tdg.merge_graphs(rdg)
In [9]:
from discoursegraphs.readwrite.rst import rs3
In [10]:
# Span offsets of every node returned by get_subordinate_clauses().
# Each entry is unpacked below as an (onset, offset) pair.
subord_offsets = [dg.get_span_offsets(tdg, sc)
for sc in get_subordinate_clauses(tdg)]
In [11]:
# Span offsets of every EDU reported by rs3.get_edus().
edu_offsets = [dg.get_span_offsets(tdg, edu) for edu in rs3.get_edus(tdg)]
In [12]:
# Spans that occur in both lists, i.e. an EDU and a clause node with
# exactly the same offsets.
set(edu_offsets).intersection(set(subord_offsets))
Out[12]:
In [13]:
# The next two cells show the same stretch of text cut with two different
# end offsets (366 vs. 368); the trailing comments say which is which.
dg.get_text(tdg)[318:366] # subordinate clause
Out[13]:
In [14]:
dg.get_text(tdg)[318:368] # EDU
Out[14]:
In [15]:
# Report every EDU span that fully contains a clause span, provided the
# clause covers at least 90% of the EDU.
for e_on, e_off in edu_offsets:
    for s_on, s_off in subord_offsets:
        if e_on <= s_on and s_off <= e_off:
            # fraction of the EDU that is covered by the clause
            share = (s_off - s_on) / float(e_off - e_on)
            if share >= 0.9:
                # Single-argument print(...) runs on Python 2 and 3; the
                # five %s fields reproduce the space-separated output of
                # the former print statement.
                print("%s %s %s %s %s" % (
                    (e_on, e_off), "includes", (s_on, s_off),
                    "share: ", share))
In [16]:
# Report every clause span that fully contains an EDU span, provided the
# EDU covers at least 90% of the clause.
for s_on, s_off in subord_offsets:
    for e_on, e_off in edu_offsets:
        if s_on <= e_on and e_off <= s_off:
            # fraction of the clause that is covered by the EDU
            share = float(e_off - e_on) / (s_off - s_on)
            if share >= 0.9:
                # Single-argument print(...) runs on Python 2 and 3; the
                # five %s fields reproduce the space-separated output of
                # the former print statement.
                print("%s %s %s %s %s" % (
                    (s_on, s_off), "includes", (e_on, e_off),
                    "share: ", share))
In [17]:
# Show two slices of the document text for manual comparison.
# NOTE(review): the offsets are hard-coded; presumably they were copied
# from the output of the two loops above (not preserved here) -- confirm.
dg.get_text(tdg)[203:366]
Out[17]:
In [18]:
dg.get_text(tdg)[196:368]
Out[18]:
In [140]:
from operator import itemgetter
def max_overlap(overlapping_nodes):
    """
    Return the candidate node with the highest overlap.

    Parameters
    ----------
    overlapping_nodes : iterable of dict
        Candidate matches. Every dict must contain the key ``overlap``
        (overlap with another span, in percent). The candidates built by
        ``find_overlapping_nodes`` additionally carry ``node_id`` (the
        node ID) and ``interval`` ((onset, offset) of the node's span).

    Returns
    -------
    max_overlap : dict
        The candidate dict (not a copy) with the largest ``overlap``
        value. If several candidates share the largest value, the one
        that comes first in ``overlapping_nodes`` is returned.

    Raises
    ------
    IndexError
        If ``overlapping_nodes`` is empty.
    """
    candidates = list(overlapping_nodes)
    if not candidates:
        # The previous ``sorted(...)[0]`` implementation raised IndexError
        # on empty input; keep that exception type for existing callers.
        raise IndexError("max_overlap() needs at least one candidate node")
    # max() returns the first maximal element, which is the same element
    # that a stable descending sort would put at index 0.
    return max(candidates, key=itemgetter('overlap'))
In [141]:
from collections import defaultdict
from intervaltree import IntervalTree, Interval
import discoursegraphs as dg
from discoursegraphs.readwrite.rst import rs3
def find_overlapping_nodes(
        docgraph, local_nodes, other_nodes,
        overlap_threshold=95, debug=False, length_tolerance=2):
    """
    Map each local node to the node from ``other_nodes`` whose span
    overlaps best with it.

    The span of every node is looked up with ``dg.get_span_offsets``.
    For each pair of overlapping spans the overlap is computed as::

        overlap = len(intersection) / len(longer of the two spans) * 100

    A pair is accepted if ``overlap >= overlap_threshold`` or if the
    intersection is at most ``length_tolerance`` units shorter than the
    longer span. Among all accepted candidates of a local node, the one
    chosen by ``max_overlap`` is kept.

    Parameters
    ----------
    docgraph : document graph
        graph that contains both ``local_nodes`` and ``other_nodes``
    local_nodes : iterable of node IDs
        nodes for which a counterpart is searched
    other_nodes : iterable of node IDs
        nodes that may serve as counterparts
    overlap_threshold : int or float
        minimum overlap in percent (default: 95)
    debug : bool
        if True, print the local span, the matching interval and the
        overlap of every accepted pair
    length_tolerance : int
        a pair is also accepted when the intersection is no more than
        this many units shorter than the longer span, regardless of
        ``overlap_threshold`` (default: 2, the previously hard-coded
        value)

    Returns
    -------
    dict
        maps a local node ID to a dict with the keys ``node_id`` (ID of
        the best matching other node), ``overlap`` (in percent) and
        ``interval`` ((onset, offset) of the other node's span). Local
        nodes without an accepted candidate are not included.
    """
    def group_by_span(nodes):
        # there might be more than one node covering the same span
        span2nodes = defaultdict(list)
        for node in nodes:
            span2nodes[dg.get_span_offsets(docgraph, node)].append(node)
        return span2nodes

    local_span2nodes = group_by_span(local_nodes)
    other_span2nodes = group_by_span(other_nodes)
    other_tree = IntervalTree.from_tuples(other_span2nodes.keys())

    overlap_map = defaultdict(list)
    for local_span, nodes_on_local_span in local_span2nodes.items():
        local_on, local_off = local_span
        len_local = local_off - local_on
        # All spans from ``other_nodes`` that overlap with this local span.
        # Slicing is the documented overlap query of IntervalTree; indexing
        # with a single non-slice object is a point query instead.
        for overlap_interval in other_tree[local_on:local_off]:
            other_on, other_off = overlap_interval.begin, overlap_interval.end
            len_longest_input = max(len_local, other_off - other_on)
            len_overlap = min(local_off, other_off) - max(local_on, other_on)
            # overlap in % between the input intervals
            overlap = len_overlap / float(len_longest_input) * 100

            within_tolerance = \
                len_overlap + length_tolerance >= len_longest_input
            if overlap < overlap_threshold and not within_tolerance:
                continue

            # Record, for each local node on this span, every other node
            # on the matching span (incl. the overlap in % that is later
            # used to pick the best match).
            other_span = (other_on, other_off)
            for local_node in nodes_on_local_span:
                for other_node in other_span2nodes[other_span]:
                    overlap_map[local_node].append(
                        {'node_id': other_node, 'overlap': overlap,
                         'interval': other_span})
            # NOTE(review): the original indentation was lost; the debug
            # output is assumed to belong to accepted pairs only.
            if debug:
                # single-argument print(...) works on Python 2 and 3
                print("%s %s %s" % (local_span, overlap_interval, overlap))

    return {local_node: max_overlap(candidates)
            for local_node, candidates in overlap_map.items()}
In [142]:
# Map each node returned by get_subordinate_clauses() to its best matching
# EDU, using find_overlapping_nodes().
subord_nodes = get_subordinate_clauses(tdg)
edu_nodes = rs3.get_edus(tdg)
# With overlap_threshold=100 only identical spans pass the percentage test;
# the function's additional two-unit length allowance still applies.
overlapping_nodes_map = find_overlapping_nodes(
tdg, subord_nodes, edu_nodes, overlap_threshold=100)
In [178]:
# For every clause node that was matched to an EDU, print the node's
# syntactic category followed by the text it spans.
for subord_clause in overlapping_nodes_map:
    # print subord_clause
    # Single-argument print(...) runs on Python 2 and 3 and produces the
    # same "<category> <text>" line as the former print statement.
    print("%s %s" % (tdg.node[subord_clause]['tiger:cat'],
                     dg.get_text(tdg, subord_clause)))
    # other_node = overlapping_nodes_map[subord_clause]
    # other_node_id = other_node['node_id']
    # print other_node, dg.get_text(tdg, other_node_id)
    # print tdg.node[other_node_id]['rst:segment_type'], '\n'
In [183]:
# For every EDU that was matched to a clause node, print its node ID, its
# RST segment type and the name of the relation it takes part in.
for edu in overlapping_nodes_map.values():
    edu_node_id = edu['node_id']
    segment_type = tdg.node[edu_node_id]['rst:segment_type']
    if segment_type == 'nucleus':
        # the relation name is read from the EDU node itself
        rel_name = tdg.node[edu_node_id]['rst:rel_name']
    else:
        # otherwise the relation name is read from the single node that
        # has an edge pointing to the EDU
        # list(): keeps ``in_edges[0]`` valid even if in_edges() returns
        # a non-indexable view instead of a list
        in_edges = list(tdg.in_edges(edu_node_id))
        assert len(in_edges) == 1, \
            "There must be exactly one dominating node."
        dom_node_id = in_edges[0][0]
        rel_name = tdg.node[dom_node_id]['rst:rel_name']
    # single-argument print(...) runs on Python 2 and 3
    print("%s %s %s" % (edu_node_id, segment_type, rel_name))
In [ ]: