In [3]:

    
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to the helper modules imported
# by this notebook are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [12]:

    
import lxml.etree as et
from treedlib_util import load_sentences, tag_candidate
from tree_structs_ipynb import sentence_to_xmltree, XMLTree
# Parse every sentence in the test file into an XML dependency tree.
# A list comprehension (rather than indexing the result of map()) keeps the
# indexing below valid whether or not map() returns a list.
dts = [sentence_to_xmltree(s) for s in load_sentences('test/test1.parsed.tsv')]

# Work with the second sentence of the file.
dt = dts[1]

# Tag the candidate mentions in the tree: one multi-word candidate (P1) and
# two single-word candidates (G1, G2).
tag_candidate(dt.root, ['Autosomal', 'dominant', 'polycystic', 'kidney', 'disease'], 'P1')
tag_candidate(dt.root, ['PKD1'], 'G1')
tag_candidate(dt.root, ['PKD2'], 'G2')

# NOTE(review): the return value of to_str() is discarded here; kept in case
# the call has side effects -- confirm and remove if it does not.
dt.to_str()
dt.render_tree()
root = dt.root









    






<!--Provide the canvas id via python string formatting here--!>

1. Reconfigure the tree for flattening

We think that if we re-root the tree so that one of the candidates (below, the node tagged `P1`) becomes the root, flattening and querying the tree will be much easier...



In [13]:

    
def root_tree_at(new_root):
    """
    Re-root the tree in place so that `new_root` ends up at the top.

    Each ancestor of `new_root` is removed from above and appended as a
    child of the node that used to be its child. Node attributes are not
    modified. Returns `new_root`.
    """
    # Record the chain new_root -> parent -> ... -> current root before
    # mutating anything.
    lineage = [new_root]
    while True:
        above = lineage[-1].xpath("..")
        if len(above) == 0:
            break
        lineage.append(above[0])

    # Flip each parent/child edge, starting from the top of the chain.
    for idx in range(len(lineage) - 1, 0, -1):
        upper = lineage[idx]
        lower = lineage[idx - 1]
        upper.remove(lower)
        lower.append(upper)
    return new_root



In [14]:

    
# Take the first node returned for candidate P1 and make it the tree's root,
# then wrap the re-rooted tree for rendering.
p1_nodes = root.xpath("//*[@cid='P1'][1]")
root = root_tree_at(p1_nodes[0])
t = XMLTree(root)
t.render_tree()









    






<!--Provide the canvas id via python string formatting here--!>

2. Flatten tree



In [15]:

    
def flat_tree(root):
    """
    Render the subtree under `root` as a single-line string.

    A node is written as its `word` attribute, prefixed with
    `--<dep_label>--> ` when it has a `dep_label` attribute. If the node has
    children, their renderings follow in parentheses, comma-separated.

    Returns None for a childless node that has neither `dep_label` nor
    `word`; such children are skipped in their parent's list. A node with
    children but no `dep_label`/`word` is rendered as just the parenthesised
    child list (previously this case raised TypeError).
    """
    label = root.get('dep_label')
    word = root.get('word')
    if label is not None:
        s = '--%s--> %s' % (label, word)
    else:
        s = word

    if len(root) > 0:
        rendered = [r for r in (flat_tree(child) for child in root) if r is not None]
        children = '( %s )' % ', '.join(rendered)
        # Guard against `None + str` when this node has no text of its own.
        s = children if s is None else s + ' ' + children
    return s



In [16]:

    
# Flatten the re-rooted tree into a one-line string and show it.
# print(...) with a single argument behaves the same as the print statement
# on Python 2 and is also valid on Python 3.
ft = flat_tree(t.root)
print(ft)









    



--nsubj--> disease ( --amod--> Autosomal, --amod--> dominant, --amod--> polycystic, --nn--> kidney, disorder ( --cop--> is, --det--> the, --amod--> common ( --advmod--> most ), --amod--> human, --amod--> monogenic, --conj_and--> caused ( --auxpass--> is, --agent--> mutations ( --prep_in--> PKD1 ( --dep--> the, --conj_or--> PKD2, --dep--> genes ) ) ) ) )

3. Use regexp path queries over tree!



In [81]:

    
# `re` is not imported by any earlier cell, so import it here to keep this
# cell runnable on a fresh kernel.
import re

# Search the flattened tree for a literal "G", followed by up to five
# "--label-->" arrows, followed by a literal "P".
# NOTE(review): flat_tree renders each node's `word`, not its `cid`, so the
# G/P literals presumably stand in for candidate tags that are not yet part
# of the flattened string -- confirm the intended pattern.
re.search(r'[G](\s*--\w+-->\s*){0,5}[P]', ft)



In [ ]: