In [1]:
import discoursegraphs as dg
from nltk.tree import ParentedTree

In [2]:
# Parser output for the short example (from input_short.txt_1.parentedtree),
# stored as the Python repr of a ParentedTree. Inner labels have the form
# 'nuclearity:relation'; each leaf ('0', '1') is wrapped in a 'text' node.
INPUT_SHORT_PTREE_STR = """ParentedTree('ROOT', [ParentedTree('satellite:contrast', [ParentedTree('text', ['0'])]), ParentedTree('nucleus:span', [ParentedTree('text', ['1'])])])"""
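# Parser output for the long example (from input_long.txt_1.parentedtree), stored as the Python repr of a ParentedTree; its leaves are the strings '0' to '39'.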
INPUT_LONG_PTREE_STR = """ParentedTree('ROOT', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['0'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['1'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('text', ['2'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['3'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['4'])]), ParentedTree('satellite:background', [ParentedTree('text', ['5'])])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['6'])])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['7'])])])])]), ParentedTree('satellite:background', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['8'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['9'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['10'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['11'])]), ParentedTree('satellite:enablement', [ParentedTree('text', ['12'])])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['13'])]), ParentedTree('satellite:cause', [ParentedTree('text', ['14'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('text', ['15'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['16'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['17'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['18'])])]), ParentedTree('nucleus:joint', 
[ParentedTree('text', ['19'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['20'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['21'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('text', ['22'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['23'])]), ParentedTree('satellite:temporal', [ParentedTree('text', ['24'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('text', ['25'])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['26'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['27'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['28'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['29'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['30'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['31'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('text', ['32'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['33'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['34'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['35'])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['36'])])])]), ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['37'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['38'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('text', 
['39'])])])])])])])"""

In [3]:
# EDU segmentation of the short example (from input_short.txt.edus),
# one EDU per line; line i supplies the text for the i-th leaf of the tree.
INPUT_SHORT_EDUS = """Although they did n't like it ,
they accepted the offer .
"""

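# EDU segmentation of the long example (from input_long.txt.edus), one EDU per line (40 lines, matching the 40 leaves of the long tree).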
INPUT_LONG_EDUS = """Henryk Szeryng
( 22 September 1918 - 8 March 1988 )
was a violin virtuoso of Polish and Jewish heritage .
He was born in Zelazowa Wola , Poland .
Henryk started piano and harmony training with his mother
when he was 5 ,
and at age 7 turned to the violin ,
receiving instruction from Maurice Frenkel .
After studies with Carl Flesch in Berlin
( 1929-32 ) ,
he went to Paris
to continue his training with Jacques Thibaud at the Conservatory ,
graduating with a premier prix in 1937 .
He made his solo debut in 1933
playing the Brahms Violin Concerto .
From 1933 to 1939
he studied composition in Paris with Nadia Boulanger ,
and during World War II he worked as an interpreter for the Polish government in exile
( Szeryng was fluent in seven languages )
and gave concerts for Allied troops all over the world .
During one of these concerts in Mexico City he received an offer
to take over the string department of the university there .
In 1946 , he became a naturalized citizen of Mexico .
Szeryng subsequently focused on teaching
before resuming his concert career in 1954 .
His debut in New York City brought him great acclaim ,
and he toured widely for the rest of his life .
He died in Kassel .
Szeryng made a number of recordings ,
including two of the complete sonatas and partitas for violin by Johann Sebastian Bach , and several of sonatas of Beethoven and Brahms with the pianist Arthur Rubinstein .
He also composed ;
his works include a number of violin concertos and pieces of chamber music .
He owned the Del Gesu " Le Duc " , the Stradivarius " King David " as well as the Messiah Strad copy by Jean-Baptiste Vuillaume
which he gave to Prince Rainier III of Monaco .
The " Le Duc " was the instrument
on which he performed
and recorded mostly ,
while the latter
( " King David " Strad )
was donated to the State of Israel .
"""

In [4]:
# Sanity check: the stored repr evaluates back into a tree (requires
# ParentedTree to be imported). NOTE: eval() executes arbitrary code, so it
# must only ever be fed trusted parser output such as the constants above.
eval(INPUT_SHORT_PTREE_STR)


Out[4]:

In [5]:
# Same sanity check for the long example. NOTE: eval() executes arbitrary
# code; only use it on trusted input.
eval(INPUT_LONG_PTREE_STR)


Out[5]:

In [ ]:

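## Replace leaf nodes

The leaves of the parsed tree are EDU indices wrapped in `text` nodes. Each `text` node is replaced by the corresponding line of the EDU file.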


In [6]:
# Rebuild the long tree from its repr and load the EDU texts (one per line).
# NOTE: eval() executes arbitrary code; only use it on trusted input.
ptree_long = eval(INPUT_LONG_PTREE_STR)
edus_long = INPUT_LONG_EDUS.splitlines()

# treepositions('leaves') is materialised before the loop, so replacing nodes
# while iterating is safe. Dropping the last step of a leaf position yields
# the position of the node that wraps the leaf, which is the one overwritten.
for edu_index, position in enumerate(ptree_long.treepositions('leaves')):
    ptree_long[position[:-1]] = edus_long[edu_index]

ptree_long


Out[6]:

In [19]:
def make_tree(ptree_str, edu_str):
    """Build a discourse tree whose leaves are EDU texts instead of indices.

    Parameters
    ----------
    ptree_str : str
        Python expression (a ``ParentedTree(...)`` repr) that evaluates to the
        parsed tree. Every leaf is expected to be the only child of a wrapper
        node, e.g. ``ParentedTree('text', ['0'])``; that wrapper is what gets
        replaced.
    edu_str : str
        EDU segmentation, one EDU per line, in the same order as the leaves
        of the tree.

    Returns
    -------
    The evaluated tree, in which the parent node of the i-th leaf has been
    replaced by the i-th line of ``edu_str``.

    Raises
    ------
    ValueError
        If the number of EDU lines differs from the number of leaves, since
        the texts could then not be aligned with the tree reliably.
    """
    edus = edu_str.splitlines()

    # SECURITY: eval() executes arbitrary code. ``ptree_str`` must come from a
    # trusted source (the parser's own output), never from user input.
    ptree = eval(ptree_str)

    # Materialise the positions first: the tree is modified inside the loop.
    leaf_positions = list(ptree.treepositions('leaves'))
    if len(leaf_positions) != len(edus):
        raise ValueError(
            "tree has {} leaves but {} EDUs were given".format(
                len(leaf_positions), len(edus)))

    for leaf_pos, edu in zip(leaf_positions, edus):
        # leaf_pos[:-1] is the position of the node wrapping the leaf.
        ptree[leaf_pos[:-1]] = edu
    return ptree

In [22]:
# Build the short example tree with EDU texts in place of the leaf indices,
# then display it.
ptree_short = make_tree(INPUT_SHORT_PTREE_STR, INPUT_SHORT_EDUS)
ptree_short


Out[22]:

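## Replace intermediate nodes

Goal: relabel inner nodes with the discourse relation that is encoded in their children's `nuclearity:relation` labels.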


In [9]:
# Inner (non-leaf) positions are all tree positions minus the leaf positions.
leaf_positions = set(ptree_long.treepositions('leaves'))
all_positions = set(ptree_long.treepositions())
non_leaf_positions = all_positions - leaf_positions

In [10]:
# Inspect the root label through the public accessor rather than the private
# `_label` attribute.
ptree_long.label()


Out[10]:
'ROOT'

In [11]:
from collections import defaultdict

def transform(ptree, pos=None):
    """Relabel the inner nodes of a discourse tree with relation names, in place.

    Child nodes are expected to carry labels of the form
    ``'nuclearity:relation'`` (e.g. ``'satellite:contrast'``). Every node at
    or below ``pos`` that has two or more children gets a new label:

    * the relation of its first ``satellite`` child, if it has one;
    * otherwise the relation of its first ``nucleus`` child (multinuclear
      relations such as ``joint``, which may have more than two nuclei).

    Nodes with fewer than two children are returned unchanged (handling them
    is still a TODO), and so are nodes without any nucleus/satellite child.

    Note that relabelling a node overwrites its own ``'nuclearity:relation'``
    label, so calling this function again on an already transformed tree
    raises ``ValueError`` for the relabelled nodes.

    Parameters
    ----------
    ptree : tree object (e.g. ``nltk.tree.ParentedTree``)
        The tree to modify.
    pos : tuple of int, optional
        Position, relative to ``ptree``, of the node to start from.
        Defaults to ``()``, i.e. ``ptree`` itself.

    Returns
    -------
    The (modified) node at ``pos``.

    Raises
    ------
    ValueError
        If a child label does not contain a ``':'`` separator.
    """
    pos = () if pos is None else tuple(pos)
    node = ptree[pos]

    if len(node) < 2:
        return node

    relations = {'nucleus': [], 'satellite': []}
    for index, child in enumerate(node):
        if not hasattr(child, 'label'):
            # Plain string leaf (EDU text): nothing to read or to recurse into.
            continue

        child_label = child.label()
        nuclearity, separator, relname = child_label.partition(':')
        if not separator:
            raise ValueError(
                "expected a 'nuclearity:relation' label, got {!r}".format(
                    child_label))
        if nuclearity in relations:
            relations[nuclearity].append(relname)

        # Recurse only after the child's label has been read: the recursive
        # call may overwrite that label. Positions are built relative to
        # `ptree`, so this also works when `ptree` is not the root of its tree.
        transform(ptree, pos + (index,))

    # Label the node at `pos` itself, not the tree that was passed in.
    if relations['satellite']:
        node.set_label(relations['satellite'][0])
    elif relations['nucleus']:
        node.set_label(relations['nucleus'][0])
    return node

In [23]:
# Display the short tree as it is before transform() is applied to it.
ptree_short


Out[23]:

In [24]:
# Apply transform() to the short tree and display what it returns.
# transform() assigns to the label of the tree it is given, so ptree_short
# itself is modified by this cell.
transform(ptree_short)


Out[24]:

In [13]:
# Apply transform() to the long tree and display what it returns.
# transform() assigns to the label of the tree it is given, so ptree_long
# itself is modified by this cell.
transform(ptree_long)


Out[13]:

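**TODO:** handle nodes with a single child (`len(ptree[pos]) == 1`) in `transform`; they are currently returned unchanged.

**TODO:** create a medium-sized example input (`input_medium.txt`) to test with, in between the short and the long one.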


In [ ]: