In [1]:
import discoursegraphs as dg
from discoursegraphs.corpora import pcc
from discoursegraphs.readwrite.rst import rs3
In [2]:
def get_subordinate_clauses(tiger_docgraph):
"""
given a document graph of a TIGER syntax tree, return all
node IDs of nodes representing subordinate clause constituents.
Parameters
----------
tiger_docgraph : dg.DiscourseDocumentGraph
document graph from which subordinate clauses will be extracted
Returns
-------
subord_clause_nodes : list(str)
list of node IDs of nodes directly dominating subordinate clauses
"""
subord_clause_rels = \
dg.select_edges_by_attribute(
tiger_docgraph, attribute='tiger:label',
value=['MO', 'RC', 'SB'])
subord_clause_nodes = []
for src_id, target_id in subord_clause_rels:
src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
subord_clause_nodes.append(target_id)
return subord_clause_nodes
In [3]:
from collections import defaultdict
from intervaltree import IntervalTree, Interval
import discoursegraphs as dg
from discoursegraphs.readwrite.rst import rs3
In [4]:
def _spans2nodes(docgraph, nodes):
"""
Takes a document graph and a collection of nodes
and returns a mapping from a span to all the nodes
that dominate that span. (There might be multiple
nodes which cover the same span.)
Parameters
----------
docgraph : dg.DiscourseDocumentGraph
document graph from which the span-node mapping will be extracted
nodes : collections.Iterable(str)
a list/collection of node IDs whose spans will be extracted
Returns
-------
span2nodes : defaultdict(tuple(int, int): list(str))
maps from the offsets of a span to a list of node IDs
(of nodes directly dominating that span)
"""
span2nodes = defaultdict(list)
for node_id in nodes:
span = dg.get_span_offsets(docgraph, node_id)
span2nodes[span].append(node_id)
return span2nodes
def get_max_overlaps(overlap_map):
"""
finds the nodes with the longest overlapping span.
TODO: needs heavy refactoring, input/output is
hard to grasp.
Parameters
----------
overlap_map : defaultdict(str: list(dict))
maps from a local node (node ID) to a list of other nodes,
where each other node is represented as a dict
with the keys ``node_id`` (str), ``overlap`` (float)
and ``interval`` (tuple(int, int))
Returns
-------
max_overlapping_nodes : dict(str: dict)
maps from a local node (node ID str) to
the other node (dict with keys ``node_id`` (str),
``overlap`` (float) and ``interval``
(tuple(int, int))) that has the greatest overlap
with it
"""
# given a list of other nodes, return the one
# with the highest overlap score
max_overlap = lambda l: sorted(
l, key=lambda n: n['overlap'],
reverse=True)[0]
return {local_node: max_overlap(overlap_map[local_node])
for local_node in overlap_map}
In [5]:
def get_overlap_scores(local_span, overlap_interval):
"""
calculates the overlap between the given input spans.
Parameters
----------
local_span : tuple(int, int)
the onset and offset of the ``local`` span
overlap_interval : Interval
the onset and offset of the ``other`` span
Returns
-------
overlap_ratio : float
the percentage of overlap between the input spans
len_overlap : the number of consecutive characters that
the input spans share
len_longest_input : length (in characters) of the longest
of the two input spans
"""
local_on, local_off = local_span
len_local = local_off - local_on
other_on, other_off = overlap_interval.begin, overlap_interval.end
len_other = other_off - other_on
len_longest_input = max(len_local, len_other)
overlap_on = max(local_on, other_on)
overlap_off = min(local_off, other_off)
# length of overlap in chars (int)
len_overlap = overlap_off - overlap_on
# overlap (float) in % between the input intervals
overlap_ratio = len_overlap / float(len_longest_input) * 100
return overlap_ratio, len_overlap, len_longest_input
In [6]:
def find_overlapping_nodes(
docgraph, local_nodes, other_nodes,
overlap_threshold=95, debug=False, strict=False):
"""
given a document graph and two sets of nodes (called
``local_nodes`` and ``other_nodes`` merely to distinguish them),
find pairs of nodes (one ``local`` and one ``other``) which
cover (approximately) the same span.
Parameters
----------
docgraph : DiscourseDocumentGraph
the document graph in which we'll look for
overlapping nodes
local_nodes : collections.Iterable(str)
a collection of node IDs
other_nodes : collections.Iterable(str)
a collection of node IDs.
There's no technical difference between ``local_nodes``
and ``other_nodes``, we justed to distinguish the
two collections.
overlap_threshold : int
two spans are considered overlapping, if
their onset/offset intervals overlap at least N %
strict : bool
If True, the overlap has to pass the threshold. Otherwise,
up to two characters shorter than the longest input string
Returns
-------
max_overlapping_nodes : dict(str: dict)
maps from a local node (node ID str) to
the other node (dict with keys ``node_id`` (str),
``overlap`` (float) and ``interval``
(tuple(int, int))) that has the greatest overlap
with it
"""
def fulfills_overlap_criteria(strict, overlap, overlap_threshold,
len_overlap, len_longest_input):
"""
returns True, if the overlap criteria are met.
If strict is True, the overlap has to pass a threshold. Otherwise,
the length of the overlap is allowed to be up to two characters
shorter than the longest input string. (The strict=False option
is therefore useful for comparing short spans with potentially
diverging tokenization rules, e.g. 'Hello' vs. 'Hello !'.)
"""
if strict:
if overlap >= overlap_threshold:
return True
else:
if overlap >= overlap_threshold or (len_overlap+2 >= len_longest_input):
return True
return False
# there might be more than one node covering the same span,
# e.g. when an NP only consists of a single noun
local_span2nodes = _spans2nodes(docgraph, local_nodes)
other_span2nodes = _spans2nodes(docgraph, other_nodes)
other_tree = IntervalTree.from_tuples(other_span2nodes.keys())
# overlap_map : defaultdict(str: list(dict))
# maps from a local node (node ID) to a list of other nodes,
# where each other node is represented as a dict
# with the keys ``node_id`` (str), ``overlap`` (float)
# and ``interval`` (tuple(int, int))
overlap_map = defaultdict(list)
# local_span : tuple(int, int)
for local_span in local_span2nodes:
# overlap_intervals : set(Interval)
# all the spans from ``other_spans`` that overlap with this ``local_span``
overlap_intervals = other_tree[Interval(*local_span)]
for overlap_interval in overlap_intervals:
overlap, len_overlap, len_longest_input = \
get_overlap_scores(local_span, overlap_interval)
if fulfills_overlap_criteria(strict, overlap, overlap_threshold,
len_overlap, len_longest_input):
other_on, other_off = overlap_interval.begin, overlap_interval.end
# generate a mapping from a local node (node ID str)
# to all the ``other_nodes`` it overlaps with (incl.
# their overlap in % for finding the best match)
for local_node in local_span2nodes[local_span]:
overlap_span = (other_on, other_off)
for other_node in other_span2nodes[overlap_span]:
overlap_map[local_node].append(
{'node_id': other_node, 'overlap': overlap,
'interval': (other_on, other_off)})
return get_max_overlaps(overlap_map)
In [7]:
def get_rst_subord_matches(document_ids=None):
"""find all subordinate clauses that match with an EDU.
Parameters
----------
document_ids : list(str) or None
A list of document IDs. Iff None, extract matches
from the complete PCC.
Returns
-------
matches : list(tuple(str, str, str))
A list of spans that cover both an RST EDU as well as a
subordinate clause. Each span is represented as a
(subordinate clause type, RST segment type, RST relation name)
tuple, e.g. ('NP', 'nucleus', 'elaboration').
"""
matches = []
if document_ids is None:
document_ids = pcc.document_ids
for doc_id in document_ids:
docgraph = pcc[doc_id]
# compare subordinate clauses to EDUs
subord_nodes = get_subordinate_clauses(docgraph)
edu_nodes = rs3.get_edus(docgraph)
overlapping_nodes_map = find_overlapping_nodes(
docgraph, subord_nodes, edu_nodes, overlap_threshold=100, debug=True, strict=False)
for subord_id, edu in overlapping_nodes_map.items():
subord_clause_cat = docgraph.node[subord_id]['tiger:cat']
edu_node_id = edu['node_id']
segment_type = docgraph.node[edu_node_id]['rst:segment_type']
rel_name = docgraph.node[edu_node_id].get('rst:rel_name')
if not rel_name: # try to get the rel_name, no matter what (cf. issue #139)
in_edges = docgraph.in_edges(edu_node_id)
assert len(in_edges) == 1, \
"There must be exactly one dominating node."
dom_node_id = in_edges[0][0]
rel_name = docgraph.node[dom_node_id]['rst:rel_name']
# prints the subordinating clause and the EDU that it matches
# print subord_clause_cat, edu_node_id, segment_type, rel_name
matches.append((subord_clause_cat, segment_type, rel_name))
return matches
In [8]:
from pandas import DataFrame
import pandas
In [9]:
subset_doc_ids = ['maz-3277', 'maz-3377', 'maz-4428', 'maz-10110', 'maz-19436']
In [10]:
matches = get_rst_subord_matches()
subset_matches = get_rst_subord_matches(subset_doc_ids)
In [11]:
df = DataFrame(matches, columns=('clause', 'segment', 'relation'))
In [12]:
# in the PCC
subord_clauses = []
for doc_id in pcc.document_ids:
docgraph = pcc[doc_id]
subord_clauses.extend(get_subordinate_clauses(docgraph))
print len(subord_clauses)
In [13]:
# in the subcorpus
subord_clauses = []
for doc_id in subset_doc_ids:
docgraph = pcc[doc_id]
subord_clauses.extend(get_subordinate_clauses(docgraph))
print len(subord_clauses)
In [14]:
# in the PCC
edus = []
nucleii = []
satellites = []
for doc_id in pcc.document_ids:
docgraph = pcc[doc_id]
edu_nodes = rs3.get_edus(docgraph)
edus.extend(edu_nodes)
for edu_node_id in edu_nodes:
segment_type = docgraph.node[edu_node_id]['rst:segment_type']
if segment_type == 'nucleus':
nucleii.append(edu_node_id)
elif segment_type == 'satellite':
satellites.append(edu_node_id)
print 'EDUs:', len(edus)
print 'nucleii:', len(nucleii)
print 'satellites:', len(satellites)
In [15]:
import codecs
with codecs.open('/tmp/edus_in_pcc.txt', 'w', encoding='utf8') as edu_file:
for doc_id in pcc.document_ids:
docgraph = pcc[doc_id]
edu_nodes = rs3.get_edus(docgraph)
edus.extend(edu_nodes)
for edu_node_id in edu_nodes:
segment_type = docgraph.node[edu_node_id]['rst:segment_type']
edu_file.write(u"({}, {}, {})\n".format(doc_id, edu_node_id, docgraph.node[edu_node_id]['rst:text']))
In [16]:
# in the subcorpus
edus = []
nucleii = []
satellites = []
for doc_id in subset_doc_ids:
docgraph = pcc[doc_id]
edu_nodes = rs3.get_edus(docgraph)
edus.extend(edu_nodes)
for edu_node_id in edu_nodes:
segment_type = docgraph.node[edu_node_id]['rst:segment_type']
if segment_type == 'nucleus':
nucleii.append(edu_node_id)
elif segment_type == 'satellite':
satellites.append(edu_node_id)
print 'EDUs:', len(edus)
print 'nucleii:', len(nucleii)
print 'satellites:', len(satellites)
In [17]:
print len(matches) # in the PCC
print len(subset_matches) # in the subset
In [18]:
len(df[df.segment == 'satellite'])
Out[18]:
In [19]:
len(df[df.segment == 'nucleus'])
Out[19]:
In [20]:
df[(df.segment != 'nucleus') & (df.segment != 'satellite')]
Out[20]:
In [21]:
from collections import defaultdict
def get_token_based_rst_subord_matches(document_ids=None):
"""find all subordinate clauses that match with an EDU.
Parameters
----------
document_ids : list(str) or None
A list of document IDs. Iff None, extract matches
from the complete PCC.
Returns
-------
matches : list(tuple(str, str, str))
A list of spans that cover both an RST EDU as well as a
subordinate clause. Each span is represented as a
(subordinate clause type, RST segment type, RST relation name)
tuple, e.g. ('NP', 'nucleus', 'elaboration').
"""
matches = []
subord_node_tokens = defaultdict(dict)
edu_node_tokens = defaultdict(dict)
if document_ids is None:
document_ids = pcc.document_ids
for doc_id in document_ids:
docgraph = pcc[doc_id]
# compare subordinate clauses to EDUs
subord_nodes = get_subordinate_clauses(docgraph)
edu_nodes = rs3.get_edus(docgraph)
# print 'doc_id:', doc_id, 'num of edus:', len(edu_nodes)
for subord_node in subord_nodes:
subord_node_tokens[doc_id][subord_node] = set(dg.get_span(docgraph, subord_node))
for edu_node in edu_nodes:
edu_node_tokens[doc_id][edu_node] = set(dg.get_span(docgraph, edu_node))
return subord_node_tokens, edu_node_tokens
In [22]:
subord_node_map, edu_node_map = get_token_based_rst_subord_matches()
In [23]:
print len(subord_node_map)
print len(edu_node_map)
In [24]:
foo = defaultdict(dict)
In [25]:
foo[23][42] = 'bar'
In [26]:
foo[23]
Out[26]:
In [29]:
matches = defaultdict(lambda : defaultdict(set))
for doc_id in subord_node_map:
for subord_node_id in subord_node_map[doc_id]:
subord_token_set = subord_node_map[doc_id][subord_node_id]
num_subord_tokens = len(subord_token_set)
for edu_node_id, edu_token_set in edu_node_map[doc_id].iteritems():
num_common_tokens = len(subord_token_set.intersection(edu_token_set))
if num_common_tokens > 0 and \
num_common_tokens in range(num_subord_tokens-1, num_subord_tokens+2):
# print doc_id, subord_node_id, edu_node_id
matches[doc_id][subord_node_id].add(edu_node_id)
In [ ]:
# TODO: write generic histogram function
print(len(matches))
matches.items()[:4]
('maz-13125',
defaultdict(set,
{'s384_502': {'rst:11', 'rst:4'},
's384_510': {'rst:11', 'rst:4'},
's386_504': {'rst:12', 'rst:4'},
's388_500': {'rst:10', 'rst:4'},
's389_502': {'rst:10', 'rst:4', 'rst:5', 'rst:6'},
's391_502': {'rst:7'},
's391_512': {'rst:7'},
's394_500': {'rst:8'},
's394_502': {'rst:16', 'rst:8'},
's394_505': {'rst:8'},
's394_508': {'rst:16', 'rst:8'},
's395_500': {'rst:9'}})),
In [ ]:
maz_13125_s389_str = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s389" art_id="13125" orig_id="ID_maz-13125">
<graph root="s389_503">
<terminals>
<t id="s389_1" word="Was" lemma="--" pos="PWS" morph="--"/>
<t id="s389_2" word="man" lemma="--" pos="PIS" morph="--"/>
<t id="s389_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"/>
<t id="s389_4" word="durch" lemma="--" pos="APPR" morph="--"/>
<t id="s389_5" word="Augenschein" lemma="--" pos="NN" morph="--"/>
<t id="s389_6" word="nachprüfen" lemma="--" pos="VVINF" morph="--"/>
<t id="s389_7" word="kann" lemma="--" pos="VMFIN" morph="--"/>
<t id="s389_8" word="," lemma="--" pos="$," morph="--"/>
<t id="s389_9" word="ist" lemma="--" pos="VAFIN" morph="--"/>
<t id="s389_10" word="manipulierbar" lemma="--" pos="ADJD" morph="--"/>
<t id="s389_11" word="." lemma="--" pos="$." morph="--"/>
</terminals>
<nonterminals>
<nt id="s389_500" cat="PP">
<edge label="AC" idref="s389_4"/>
<edge label="NK" idref="s389_5"/>
</nt>
<nt id="s389_501" cat="VP">
<edge label="OA" idref="s389_1"/>
<edge label="HD" idref="s389_6"/>
<edge label="MO" idref="s389_500"/>
</nt>
<nt id="s389_502" cat="S">
<edge label="SB" idref="s389_2"/>
<edge label="NG" idref="s389_3"/>
<edge label="HD" idref="s389_7"/>
<edge label="OC" idref="s389_501"/>
</nt>
<nt id="s389_503" cat="S">
<edge label="HD" idref="s389_9"/>
<edge label="PD" idref="s389_10"/>
<edge label="SB" idref="s389_502"/>
</nt>
</nonterminals>
</graph>
</s>
"""
In [ ]:
from lxml import etree
import discoursegraphs as dg
maz_13125_s389 = etree.fromstring(maz_13125_s389_str)
tsg = dg.readwrite.tiger.TigerSentenceGraph(maz_13125_s389)
In [ ]:
%load_ext gvmagic
In [ ]:
%dotstr dg.print_dot(tsg)
In [ ]:
print dg.print_dot(tsg)
In [ ]:
foo = u"""
digraph "" {
"discoursegraph:root_node";
"VROOT-s389";
"s389_500" [label="PP"];
"s389_501" [label="VP"];
"s389_502" [label="S"];
"s389_503" [label="S"];
"s389_1" [label="Was"];
"s389_2" [label="man"];
"s389_3" [label="nicht"];
"s389_4" [label="durch"];
"s389_5" [label="Augenschein"];
"s389_6" [label="nachprüfen"];
"s389_7" [label="kann"];
"s389_8" [label=","];
"s389_9" [label="ist"];
"s389_10" [label="manipulierbar"];
"s389_11" [label="."];
edge [style="invis"];
{rank=same; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];
"discoursegraph:root_node" -> "VROOT-s389" [key=0]; // changed direction
"s389_501" -> "s389_6" [key=0, label="HD"];
"s389_501" -> "s389_1" [key=0, label="OA"];
"s389_501" -> "s389_500" [key=0, label="MO"];
"s389_500" -> "s389_5" [key=0, label="NK"];
"s389_500" -> "s389_4" [key=0, label="AC"];
"s389_503" -> "s389_9" [key=0, label="HD"];
"s389_503" -> "s389_10" [key=0, label="PD"];
"s389_503" -> "s389_502" [key=0, label="SB"];
"s389_502" -> "s389_2" [key=0, label="SB"];
"s389_502" -> "s389_3" [key=0, label="NG"];
"s389_502" -> "s389_501" [key=0, label="OC"];
"s389_502" -> "s389_7" [key=0, label="HD"];
"VROOT-s389" -> "s389_503" [key=0];
"VROOT-s389" -> "s389_8" [key=0];
"VROOT-s389" -> "s389_11" [key=0];
}
"""
%dotstr foo
In [ ]:
matches[('maz-8838', 's2013_501')]
In [ ]:
ab = b.intersection(a)
In [ ]:
range(len(a)-1, len(a)+2)
In [ ]:
def get_parents_of_subordinate_clauses(tiger_docgraph):
parents = []
subord_clause_rels = \
dg.select_edges_by_attribute(
tiger_docgraph, attribute='tiger:label',
value=['MO', 'RC', 'SB'])
# subord_clause_nodes = []
for src_id, target_id in subord_clause_rels:
src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
# subord_clause_nodes.append(target_id)
parents.append(src_id)
return parents
In [ ]:
# for doc_id in pcc.document_ids:
# docgraph = pcc[doc_id]
# print 'doc_id:', doc_id
# parents = get_parents_of_subordinate_clauses(docgraph)
# print parents, '\n'
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
In [ ]:
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)
In [ ]:
from sklearn import preprocessing
df_numeric = df.copy()
encoders = {} # keep them for decoding the data later
for colname in df.columns:
encoder = preprocessing.LabelEncoder()
encoders[colname] = encoder
df_numeric[colname] = encoder.fit_transform(df[colname])
#to convert back
# train.Sex = le_sex.inverse_transform(train.Sex)
In [ ]:
# df_numeric['clause'] = encoders['clause'].inverse_transform(df_numeric['clause'])
In [ ]:
import pandas as pd
In [ ]:
pd.get_dummies(df.clause).head()
In [ ]:
pd.get_dummies(df.segment).head()
In [ ]:
pd.concat([pd.get_dummies(df.clause).head(), pd.get_dummies(df.segment).head()], axis=1)
In [ ]:
%load_ext gvmagic
In [ ]:
def predict_column(df, target_column, feature_columns=None, max_depth=5, min_samples_leaf=2):
"""
"""
# convert all feature cols w/ get_dummies & concat them
if feature_columns is None:
train_df = df.drop(labels=[target_column], axis=1)
else:
train_df = DataFrame(df, columns=feature_columns)
train_binarized = pd.concat(
(pd.get_dummies(df[col]) for col in train_df.columns), axis=1)
target_series = df[target_column]
dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
dt.fit(train_binarized, target_series)
return dt, train_binarized, target_series
In [ ]:
from StringIO import StringIO
# # df.drop(labels=['clause'], axis=1)
# for colname in df.columns:
# dt, train_binarized, target_series = predict_column(df, colname)
# out = StringIO()
# export_graphviz(dt, out_file=out)
# %dotstr out.getvalue()
In [ ]:
dt, train_binarized, target_series = predict_column(
df, target_column='segment', feature_columns=['clause'])
In [ ]:
out = StringIO()
export_graphviz(dt, out_file=out,
feature_names=train_binarized.columns,
class_names=dt.classes_,
filled=True, rounded=True,
special_characters=True)
In [ ]:
clause_df = df[['clause']]
In [ ]:
clause_df.head().to_dict()
In [ ]:
clause_df.T.to_dict().values()[:10]
clause_dict = clause_df.T.to_dict().values()
In [ ]:
from sklearn.feature_extraction import DictVectorizer as DV
vectorizer = DV( sparse = False )
clause_vec = vectorizer.fit_transform(clause_dict)
clause_vec
In [ ]:
df.to_csv('clause-segment-relation.csv')
In [ ]:
%dotstr out.getvalue()
# train_binarized.columns
# target_series.value_counts()