In [1]:

    
import discoursegraphs as dg
from discoursegraphs.corpora import pcc
from discoursegraphs.readwrite.rst import rs3

Task 1: Which syntactic subordinate clauses match with an RST EDU?

Task 2: Are these EDUs satellites or nucleii?

Task 3: does this correlate with certain RST relation types?



In [2]:

    
def get_subordinate_clauses(tiger_docgraph):
    """
    given a document graph of a TIGER syntax tree, return all
    node IDs of nodes representing subordinate clause constituents.

    Parameters
    ----------
    tiger_docgraph : dg.DiscourseDocumentGraph
        document graph from which subordinate clauses will be extracted
    
    Returns
    -------
    subord_clause_nodes : list(str)
        list of node IDs of nodes directly dominating subordinate clauses
    """
    subord_clause_rels = \
        dg.select_edges_by_attribute(
            tiger_docgraph, attribute='tiger:label',
            value=['MO', 'RC', 'SB'])
    
    subord_clause_nodes = []
    for src_id, target_id in subord_clause_rels:
        src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
        if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
            subord_clause_nodes.append(target_id)
    return subord_clause_nodes



In [3]:

    
from collections import defaultdict

from intervaltree import IntervalTree, Interval

import discoursegraphs as dg
from discoursegraphs.readwrite.rst import rs3



In [4]:

    
def _spans2nodes(docgraph, nodes):
    """
    Takes a document graph and a collection of nodes
    and returns a mapping from a span to all the nodes
    that dominate that span. (There might be multiple
    nodes which cover the same span.)

    Parameters
    ----------
    docgraph : dg.DiscourseDocumentGraph
        document graph from which the span-node mapping will be extracted
    nodes : collections.Iterable(str)
        a list/collection of node IDs whose spans will be extracted

    Returns
    -------
    span2nodes : defaultdict(tuple(int, int): list(str))
        maps from the offsets of a span to a list of node IDs
        (of nodes directly dominating that span)
    """
    span2nodes = defaultdict(list)
    for node_id in nodes:
        span = dg.get_span_offsets(docgraph, node_id)
        span2nodes[span].append(node_id)
    return span2nodes

def get_max_overlaps(overlap_map):
    """
    finds the nodes with the longest overlapping span.
    TODO: needs heavy refactoring, input/output is
    hard to grasp.
    
    Parameters
    ----------
    overlap_map : defaultdict(str: list(dict))
        maps from a local node (node ID) to a list of other nodes,
        where each other node is represented as a dict
        with the keys ``node_id`` (str), ``overlap`` (float)
        and ``interval`` (tuple(int, int))
    
    Returns
    -------
    max_overlapping_nodes : dict(str: dict)
        maps from a local node (node ID str) to
        the other node (dict with keys ``node_id`` (str),
        ``overlap`` (float) and ``interval``
        (tuple(int, int))) that has the greatest overlap
        with it
    """
    # given a list of other nodes, return the one
    # with the highest overlap score
    max_overlap = lambda l: sorted(
        l, key=lambda n: n['overlap'],
        reverse=True)[0]
    return {local_node: max_overlap(overlap_map[local_node])
            for local_node in overlap_map}



In [5]:

    
def get_overlap_scores(local_span, overlap_interval):
    """
    calculates the overlap between the given input spans.
    
    Parameters
    ----------
    local_span : tuple(int, int)
        the onset and offset of the ``local`` span
    overlap_interval : Interval
        the onset and offset of the ``other`` span

    Returns
    -------
    overlap_ratio : float
        the percentage of overlap between the input spans
    len_overlap : the number of consecutive characters that
        the input spans share
    len_longest_input : length (in characters) of the longest
        of the two input spans
    """
    local_on, local_off = local_span
    len_local = local_off - local_on

    other_on, other_off = overlap_interval.begin, overlap_interval.end
    len_other = other_off - other_on
    len_longest_input = max(len_local, len_other)

    overlap_on = max(local_on, other_on)
    overlap_off = min(local_off, other_off)
    # length of overlap in chars (int)
    len_overlap = overlap_off - overlap_on

    # overlap (float) in % between the input intervals
    overlap_ratio = len_overlap / float(len_longest_input) * 100

    return overlap_ratio, len_overlap, len_longest_input



In [6]:

    
def find_overlapping_nodes(
    docgraph, local_nodes, other_nodes,
    overlap_threshold=95, debug=False, strict=False):
    """
    given a document graph and two sets of nodes (called
    ``local_nodes`` and ``other_nodes`` merely to distinguish them),
    find pairs of nodes (one ``local`` and one ``other``) which
    cover (approximately) the same span.
    
    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph in which we'll look for
        overlapping nodes
    local_nodes : collections.Iterable(str)
        a collection of node IDs
    other_nodes : collections.Iterable(str)
        a collection of node IDs.
        There's no technical difference between ``local_nodes``
        and ``other_nodes``, we justed to distinguish the
        two collections.
    overlap_threshold : int
        two spans are considered overlapping, if
        their onset/offset intervals overlap at least N %
    strict : bool
        If True, the overlap has to pass the threshold. Otherwise,
        up to two characters shorter than the longest input string
    
    Returns
    -------
    max_overlapping_nodes : dict(str: dict)
        maps from a local node (node ID str) to
        the other node (dict with keys ``node_id`` (str),
        ``overlap`` (float) and ``interval``
        (tuple(int, int))) that has the greatest overlap
        with it
    """    
    def fulfills_overlap_criteria(strict, overlap, overlap_threshold,
                                  len_overlap, len_longest_input):
        """
        returns True, if the overlap criteria are met.

        If strict is True, the overlap has to pass a threshold. Otherwise,
        the length of the overlap is allowed to be up to two characters
        shorter than the longest input string. (The strict=False option
        is therefore useful for comparing short spans with potentially
        diverging tokenization rules, e.g. 'Hello' vs. 'Hello !'.)
        """
        if strict:
            if overlap >= overlap_threshold:
                return True
        else:
            if overlap >= overlap_threshold or (len_overlap+2 >= len_longest_input):
                return True
        return False
        
    # there might be more than one node covering the same span,
    # e.g. when an NP only consists of a single noun
    local_span2nodes = _spans2nodes(docgraph, local_nodes)        
    other_span2nodes = _spans2nodes(docgraph, other_nodes)

    other_tree = IntervalTree.from_tuples(other_span2nodes.keys())
    
    # overlap_map : defaultdict(str: list(dict))
    #    maps from a local node (node ID) to a list of other nodes,
    #    where each other node is represented as a dict
    #    with the keys ``node_id`` (str), ``overlap`` (float)
    #    and ``interval`` (tuple(int, int))
    overlap_map = defaultdict(list)

    # local_span : tuple(int, int)
    for local_span in local_span2nodes:
        # overlap_intervals : set(Interval)
        # all the spans from ``other_spans`` that overlap with this ``local_span``
        overlap_intervals = other_tree[Interval(*local_span)]
        for overlap_interval in overlap_intervals:
            overlap, len_overlap, len_longest_input = \
                get_overlap_scores(local_span, overlap_interval)
            
            if fulfills_overlap_criteria(strict, overlap, overlap_threshold,
                                         len_overlap, len_longest_input):
                other_on, other_off = overlap_interval.begin, overlap_interval.end
                # generate a mapping from a local node (node ID str)
                # to all the ``other_nodes`` it overlaps with (incl.
                # their overlap in % for finding the best match)
                for local_node in local_span2nodes[local_span]:
                    overlap_span = (other_on, other_off)
                    for other_node in other_span2nodes[overlap_span]:
                        overlap_map[local_node].append(
                            {'node_id': other_node, 'overlap': overlap,
                             'interval': (other_on, other_off)})
    
    return get_max_overlaps(overlap_map)



In [7]:

    
def get_rst_subord_matches(document_ids=None):
    """find all subordinate clauses that match with an EDU.
    
    Parameters
    ----------
    document_ids : list(str) or None
        A list of document IDs. Iff None, extract matches
        from the complete PCC.
    
    Returns
    -------
    matches : list(tuple(str, str, str))
        A list of spans that cover both an RST EDU as well as a
        subordinate clause. Each span is represented as a
        (subordinate clause type, RST segment type, RST relation name)
        tuple, e.g. ('NP', 'nucleus', 'elaboration').
    """
    matches = []
    if document_ids is None:
        document_ids = pcc.document_ids
    
    for doc_id in document_ids:
        docgraph = pcc[doc_id]

        # compare subordinate clauses to EDUs
        subord_nodes = get_subordinate_clauses(docgraph)
        edu_nodes =  rs3.get_edus(docgraph)

        overlapping_nodes_map = find_overlapping_nodes(
            docgraph, subord_nodes, edu_nodes, overlap_threshold=100, debug=True, strict=False)

        for subord_id, edu in overlapping_nodes_map.items():
            subord_clause_cat = docgraph.node[subord_id]['tiger:cat']

            edu_node_id = edu['node_id']
            segment_type = docgraph.node[edu_node_id]['rst:segment_type']

            rel_name = docgraph.node[edu_node_id].get('rst:rel_name')
            if not rel_name: # try to get the rel_name, no matter what (cf. issue #139)
                in_edges = docgraph.in_edges(edu_node_id)
                assert len(in_edges) == 1, \
                    "There must be exactly one dominating node."
                dom_node_id = in_edges[0][0]
                rel_name = docgraph.node[dom_node_id]['rst:rel_name']

            # prints the subordinating clause and the EDU that it matches
    #         print subord_clause_cat, edu_node_id, segment_type, rel_name
            matches.append((subord_clause_cat, segment_type, rel_name))
    return matches



In [8]:

    
from pandas import DataFrame
import pandas



In [9]:

    
subset_doc_ids = ['maz-3277', 'maz-3377', 'maz-4428', 'maz-10110', 'maz-19436']



In [10]:

    
matches = get_rst_subord_matches()
subset_matches = get_rst_subord_matches(subset_doc_ids)



In [11]:

    
df = DataFrame(matches, columns=('clause', 'segment', 'relation'))

How many subordinate clauses are there?



In [12]:

    
# in the PCC
subord_clauses = []
for doc_id in pcc.document_ids:
    docgraph = pcc[doc_id]
    subord_clauses.extend(get_subordinate_clauses(docgraph))
print len(subord_clauses)



In [13]:

    
# in the subcorpus
subord_clauses = []
for doc_id in subset_doc_ids:
    docgraph = pcc[doc_id]
    subord_clauses.extend(get_subordinate_clauses(docgraph))
print len(subord_clauses)

How many EDUs are there?



In [14]:

    
# in the PCC
edus = []
nucleii = []
satellites = []
for doc_id in pcc.document_ids:
    docgraph = pcc[doc_id]
    edu_nodes = rs3.get_edus(docgraph)
    edus.extend(edu_nodes)
    for edu_node_id in edu_nodes:
        segment_type = docgraph.node[edu_node_id]['rst:segment_type']
        if segment_type == 'nucleus':
            nucleii.append(edu_node_id)
        elif segment_type == 'satellite':
            satellites.append(edu_node_id)

print 'EDUs:', len(edus)
print 'nucleii:', len(nucleii)
print 'satellites:', len(satellites)









    



EDUs: 3018
nucleii: 1395
satellites: 1117



In [15]:

    
import codecs

with codecs.open('/tmp/edus_in_pcc.txt', 'w', encoding='utf8') as edu_file:
    for doc_id in pcc.document_ids:
        docgraph = pcc[doc_id]
        edu_nodes = rs3.get_edus(docgraph)
        edus.extend(edu_nodes)
        for edu_node_id in edu_nodes:
            segment_type = docgraph.node[edu_node_id]['rst:segment_type']
            edu_file.write(u"({}, {}, {})\n".format(doc_id, edu_node_id, docgraph.node[edu_node_id]['rst:text']))



In [16]:

    
# in the subcorpus
edus = []
nucleii = []
satellites = []
for doc_id in subset_doc_ids:
    docgraph = pcc[doc_id]
    edu_nodes = rs3.get_edus(docgraph)
    edus.extend(edu_nodes)
    for edu_node_id in edu_nodes:
        segment_type = docgraph.node[edu_node_id]['rst:segment_type']
        if segment_type == 'nucleus':
            nucleii.append(edu_node_id)
        elif segment_type == 'satellite':
            satellites.append(edu_node_id)

print 'EDUs:', len(edus)
print 'nucleii:', len(nucleii)
print 'satellites:', len(satellites)









    



EDUs: 89
nucleii: 36
satellites: 39

How many matching spans are there?



In [17]:

    
print len(matches) # in the PCC
print len(subset_matches) # in the subset



In [18]:

    
len(df[df.segment == 'satellite'])









    Out[18]:





168



In [19]:

    
len(df[df.segment == 'nucleus'])









    Out[19]:





24



In [20]:

    
df[(df.segment != 'nucleus') & (df.segment != 'satellite')]









    Out[20]:






  
    
      
      clause
      segment
      relation
    
  
  
    
      102
      S
      span
      condition
    
    
      120
      S
      span
      concession
    
    
      142
      S
      span
      cause
    
    
      148
      S
      span
      antithesis
    
    
      182
      S
      span
      antithesis
    
    
      190
      S
      span
      condition

Problem: different definitions of overlap

I implemented overlap based on string overlap
- a clause and an EDU match, if they cover the same range of characters
  (+/- 2 characters)
W. implemented overlap in neo4j based on token overlap
- a clause and an EDU match, if they cover the same tokens (+/- 2 tokens)

Solution: implement token overlap for discoursegraphs



In [21]:

    
from collections import defaultdict

def get_token_based_rst_subord_matches(document_ids=None):
    """find all subordinate clauses that match with an EDU.
    
    Parameters
    ----------
    document_ids : list(str) or None
        A list of document IDs. Iff None, extract matches
        from the complete PCC.
    
    Returns
    -------
    matches : list(tuple(str, str, str))
        A list of spans that cover both an RST EDU as well as a
        subordinate clause. Each span is represented as a
        (subordinate clause type, RST segment type, RST relation name)
        tuple, e.g. ('NP', 'nucleus', 'elaboration').
    """
    matches = []
    subord_node_tokens = defaultdict(dict)
    edu_node_tokens = defaultdict(dict)
    
    if document_ids is None:
        document_ids = pcc.document_ids
    
    for doc_id in document_ids:
        docgraph = pcc[doc_id]

        # compare subordinate clauses to EDUs
        subord_nodes = get_subordinate_clauses(docgraph)
        edu_nodes =  rs3.get_edus(docgraph)
#         print 'doc_id:', doc_id, 'num of edus:', len(edu_nodes)
        
        for subord_node in subord_nodes:
            subord_node_tokens[doc_id][subord_node] = set(dg.get_span(docgraph, subord_node))
        
        for edu_node in edu_nodes:
            edu_node_tokens[doc_id][edu_node] = set(dg.get_span(docgraph, edu_node))
    return subord_node_tokens, edu_node_tokens



In [22]:

    
subord_node_map, edu_node_map = get_token_based_rst_subord_matches()



In [23]:

    
print len(subord_node_map)
print len(edu_node_map)



In [24]:

    
foo = defaultdict(dict)



In [25]:

    
foo[23][42] = 'bar'



In [26]:

    
foo[23]









    Out[26]:





{42: 'bar'}



In [29]:

    
matches = defaultdict(lambda : defaultdict(set))

for doc_id in subord_node_map:
    for subord_node_id in subord_node_map[doc_id]:
        subord_token_set = subord_node_map[doc_id][subord_node_id]
        num_subord_tokens = len(subord_token_set)
        
        for edu_node_id, edu_token_set in edu_node_map[doc_id].iteritems():
            num_common_tokens = len(subord_token_set.intersection(edu_token_set))
            if num_common_tokens > 0 and \
                num_common_tokens in range(num_subord_tokens-1, num_subord_tokens+2):
#                     print doc_id, subord_node_id, edu_node_id
                    matches[doc_id][subord_node_id].add(edu_node_id)



In [ ]:

    
# TODO: write generic histogram function

print(len(matches))
matches.items()[:4]

TODO: what's wrong with this matching

 ('maz-13125',
  defaultdict(set,
              {'s384_502': {'rst:11', 'rst:4'},
               's384_510': {'rst:11', 'rst:4'},
               's386_504': {'rst:12', 'rst:4'},
               's388_500': {'rst:10', 'rst:4'},
               's389_502': {'rst:10', 'rst:4', 'rst:5', 'rst:6'},
               's391_502': {'rst:7'},
               's391_512': {'rst:7'},
               's394_500': {'rst:8'},
               's394_502': {'rst:16', 'rst:8'},
               's394_505': {'rst:8'},
               's394_508': {'rst:16', 'rst:8'},
               's395_500': {'rst:9'}})),



In [ ]:

    
maz_13125_s389_str = """
<s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="s389" art_id="13125" orig_id="ID_maz-13125">
<graph root="s389_503">
<terminals>
<t id="s389_1" word="Was" lemma="--" pos="PWS" morph="--"/>
<t id="s389_2" word="man" lemma="--" pos="PIS" morph="--"/>
<t id="s389_3" word="nicht" lemma="--" pos="PTKNEG" morph="--"/>
<t id="s389_4" word="durch" lemma="--" pos="APPR" morph="--"/>
<t id="s389_5" word="Augenschein" lemma="--" pos="NN" morph="--"/>
<t id="s389_6" word="nachprüfen" lemma="--" pos="VVINF" morph="--"/>
<t id="s389_7" word="kann" lemma="--" pos="VMFIN" morph="--"/>
<t id="s389_8" word="," lemma="--" pos="$," morph="--"/>
<t id="s389_9" word="ist" lemma="--" pos="VAFIN" morph="--"/>
<t id="s389_10" word="manipulierbar" lemma="--" pos="ADJD" morph="--"/>
<t id="s389_11" word="." lemma="--" pos="$." morph="--"/>
</terminals>
<nonterminals>
<nt id="s389_500" cat="PP">
<edge label="AC" idref="s389_4"/>
<edge label="NK" idref="s389_5"/>
 </nt>
<nt id="s389_501" cat="VP">
<edge label="OA" idref="s389_1"/>
<edge label="HD" idref="s389_6"/>
<edge label="MO" idref="s389_500"/>
 </nt>
<nt id="s389_502" cat="S">
<edge label="SB" idref="s389_2"/>
<edge label="NG" idref="s389_3"/>
<edge label="HD" idref="s389_7"/>
<edge label="OC" idref="s389_501"/>
 </nt>
<nt id="s389_503" cat="S">
<edge label="HD" idref="s389_9"/>
<edge label="PD" idref="s389_10"/>
<edge label="SB" idref="s389_502"/>
 </nt>
</nonterminals>
</graph>
</s>
"""



In [ ]:

    
from lxml import etree
import discoursegraphs as dg

maz_13125_s389 = etree.fromstring(maz_13125_s389_str)
tsg = dg.readwrite.tiger.TigerSentenceGraph(maz_13125_s389)



In [ ]:

    
%load_ext gvmagic



In [ ]:

    
%dotstr dg.print_dot(tsg)



In [ ]:

    
print dg.print_dot(tsg)



In [ ]:

    
foo = u"""
digraph "" {

"discoursegraph:root_node";
"VROOT-s389";
"s389_500" [label="PP"];
"s389_501" [label="VP"];
"s389_502" [label="S"];
"s389_503" [label="S"];

"s389_1" [label="Was"];
"s389_2" [label="man"];
"s389_3" [label="nicht"];
"s389_4" [label="durch"];
"s389_5" [label="Augenschein"];
"s389_6" [label="nachprüfen"];
"s389_7" [label="kann"];
"s389_8" [label=","];
"s389_9" [label="ist"];
"s389_10" [label="manipulierbar"];
"s389_11" [label="."];

edge [style="invis"];
{rank=same; "s389_1" -> "s389_2" -> "s389_3" -> "s389_4" -> "s389_5" -> "s389_6" -> "s389_7" -> "s389_8" -> "s389_9" -> "s389_10" -> "s389_11";}
edge [style=""];

"discoursegraph:root_node" -> "VROOT-s389" [key=0]; // changed direction

"s389_501" -> "s389_6"  [key=0, label="HD"];
"s389_501" -> "s389_1"  [key=0, label="OA"];
"s389_501" -> "s389_500"  [key=0, label="MO"];
"s389_500" -> "s389_5"  [key=0, label="NK"];
"s389_500" -> "s389_4"  [key=0, label="AC"];
"s389_503" -> "s389_9"  [key=0, label="HD"];
"s389_503" -> "s389_10"  [key=0, label="PD"];
"s389_503" -> "s389_502"  [key=0, label="SB"];
"s389_502" -> "s389_2"  [key=0, label="SB"];
"s389_502" -> "s389_3"  [key=0, label="NG"];
"s389_502" -> "s389_501"  [key=0, label="OC"];
"s389_502" -> "s389_7"  [key=0, label="HD"];
"VROOT-s389" -> "s389_503"  [key=0];

"VROOT-s389" -> "s389_8"  [key=0];
"VROOT-s389" -> "s389_11"  [key=0];
}
"""

%dotstr foo



In [ ]:

    
matches[('maz-8838', 's2013_501')]



In [ ]:

    
ab = b.intersection(a)



In [ ]:

    
range(len(a)-1, len(a)+2)

Problem: not even subord. clause counts match



In [ ]:

    
def get_parents_of_subordinate_clauses(tiger_docgraph):
    parents = []
    subord_clause_rels = \
        dg.select_edges_by_attribute(
            tiger_docgraph, attribute='tiger:label',
            value=['MO', 'RC', 'SB'])
    
#     subord_clause_nodes = []
    for src_id, target_id in subord_clause_rels:
        src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
        if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
#             subord_clause_nodes.append(target_id)
            parents.append(src_id)
    return parents



In [ ]:

    
# for doc_id in pcc.document_ids:
#     docgraph = pcc[doc_id]
#     print 'doc_id:', doc_id
#     parents = get_parents_of_subordinate_clauses(docgraph)
#     print parents, '\n'



In [ ]:

    
from sklearn.tree import DecisionTreeClassifier, export_graphviz



In [ ]:

    
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)



In [ ]:

    
from sklearn import preprocessing

df_numeric = df.copy()

encoders = {} # keep them for decoding the data later
for colname in df.columns:
    encoder = preprocessing.LabelEncoder()
    encoders[colname] = encoder
    df_numeric[colname] = encoder.fit_transform(df[colname])

#to convert back

# train.Sex = le_sex.inverse_transform(train.Sex)



In [ ]:

    
# df_numeric['clause'] = encoders['clause'].inverse_transform(df_numeric['clause'])



In [ ]:

    
import pandas as pd



In [ ]:

    
pd.get_dummies(df.clause).head()



In [ ]:

    
pd.get_dummies(df.segment).head()



In [ ]:

    
pd.concat([pd.get_dummies(df.clause).head(), pd.get_dummies(df.segment).head()], axis=1)



In [ ]:

    
%load_ext gvmagic



In [ ]:

    
def predict_column(df, target_column, feature_columns=None, max_depth=5, min_samples_leaf=2):
    """
    """
    # convert all feature cols w/ get_dummies & concat them
    if feature_columns is None:
        train_df = df.drop(labels=[target_column], axis=1)
    else:
        train_df = DataFrame(df, columns=feature_columns)
        
    train_binarized = pd.concat(
        (pd.get_dummies(df[col]) for col in train_df.columns), axis=1)
    
    target_series = df[target_column]
    
    dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    dt.fit(train_binarized, target_series)
    return dt, train_binarized, target_series



In [ ]:

    
from StringIO import StringIO

# # df.drop(labels=['clause'], axis=1)
# for colname in df.columns:
#     dt, train_binarized, target_series = predict_column(df, colname)

#     out = StringIO()
#     export_graphviz(dt, out_file=out)
#     %dotstr out.getvalue()



In [ ]:

    
dt, train_binarized, target_series = predict_column(
    df, target_column='segment', feature_columns=['clause'])



In [ ]:

    
out = StringIO()
export_graphviz(dt, out_file=out,
                feature_names=train_binarized.columns,  
                class_names=dt.classes_, 
                filled=True, rounded=True,  
                special_characters=True)

DictVectorizer usage example

if we can't use pandas.get_dummies() ...



In [ ]:

    
clause_df = df[['clause']]



In [ ]:

    
clause_df.head().to_dict()



In [ ]:

    
clause_df.T.to_dict().values()[:10]
clause_dict = clause_df.T.to_dict().values()



In [ ]:

    
from sklearn.feature_extraction import DictVectorizer as DV

vectorizer = DV( sparse = False )
clause_vec = vectorizer.fit_transform(clause_dict)
clause_vec



In [ ]:

    
df.to_csv('clause-segment-relation.csv')



In [ ]:

    
%dotstr out.getvalue()
# train_binarized.columns
# target_series.value_counts()

TODO: try Frequent Itemset visualization

Grouped matrix-based visualization
available in arulesViz R-package
cf. Hahsler and Chelluboina (2011). Visualizing Association Rules in Hierarchical Groups.

	clause	segment	relation
102	S	span	condition
120	S	span	concession
142	S	span	cause
148	S	span	antithesis
182	S	span	antithesis
190	S	span	condition