In [1]:
import discoursegraphs as dg
from nltk.tree import ParentedTree
In [2]:
# Contents of input_short.txt_1.parentedtree: the repr of a ParentedTree.
# Inner nodes are labelled 'nuclearity:relation'; each leaf is a numbered
# placeholder ('0', '1') wrapped in a 'text' node instead of the actual text.
INPUT_SHORT_PTREE_STR = """ParentedTree('ROOT', [ParentedTree('satellite:contrast', [ParentedTree('text', ['0'])]), ParentedTree('nucleus:span', [ParentedTree('text', ['1'])])])"""
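# Contents of input_long.txt_1.parentedtree: the repr of a ParentedTree with
# 'nuclearity:relation' node labels and 40 numbered placeholder leaves ('0' to '39').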
INPUT_LONG_PTREE_STR = """ParentedTree('ROOT', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['0'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['1'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('text', ['2'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['3'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['4'])]), ParentedTree('satellite:background', [ParentedTree('text', ['5'])])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['6'])])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['7'])])])])]), ParentedTree('satellite:background', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['8'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['9'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['10'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['11'])]), ParentedTree('satellite:enablement', [ParentedTree('text', ['12'])])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['13'])]), ParentedTree('satellite:cause', [ParentedTree('text', ['14'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('text', ['15'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['16'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['17'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['18'])])]), ParentedTree('nucleus:joint', 
[ParentedTree('text', ['19'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['20'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['21'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('text', ['22'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['23'])]), ParentedTree('satellite:temporal', [ParentedTree('text', ['24'])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('nucleus:joint', [ParentedTree('text', ['25'])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['26'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['27'])]), ParentedTree('nucleus:joint', [ParentedTree('nucleus:span', [ParentedTree('text', ['28'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['29'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('text', ['30'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['31'])])])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:span', [ParentedTree('nucleus:span', [ParentedTree('text', ['32'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['33'])])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['34'])]), ParentedTree('satellite:elaboration', [ParentedTree('nucleus:joint', [ParentedTree('text', ['35'])]), ParentedTree('nucleus:joint', [ParentedTree('text', ['36'])])])]), ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:same-unit', [ParentedTree('nucleus:span', [ParentedTree('text', ['37'])]), ParentedTree('satellite:elaboration', [ParentedTree('text', ['38'])])]), ParentedTree('nucleus:same-unit', [ParentedTree('text', 
['39'])])])])])])])"""
In [3]:
# Contents of input_short.txt.edus: one tokenized EDU (presumably "elementary
# discourse unit") per line; line n belongs to the n-th leaf of the short tree.
INPUT_SHORT_EDUS = """Although they did n't like it ,
they accepted the offer .
"""
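# Contents of input_long.txt.edus: one tokenized EDU per line (40 lines,
# one for each leaf of the long tree, in leaf order).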
INPUT_LONG_EDUS = """Henryk Szeryng
( 22 September 1918 - 8 March 1988 )
was a violin virtuoso of Polish and Jewish heritage .
He was born in Zelazowa Wola , Poland .
Henryk started piano and harmony training with his mother
when he was 5 ,
and at age 7 turned to the violin ,
receiving instruction from Maurice Frenkel .
After studies with Carl Flesch in Berlin
( 1929-32 ) ,
he went to Paris
to continue his training with Jacques Thibaud at the Conservatory ,
graduating with a premier prix in 1937 .
He made his solo debut in 1933
playing the Brahms Violin Concerto .
From 1933 to 1939
he studied composition in Paris with Nadia Boulanger ,
and during World War II he worked as an interpreter for the Polish government in exile
( Szeryng was fluent in seven languages )
and gave concerts for Allied troops all over the world .
During one of these concerts in Mexico City he received an offer
to take over the string department of the university there .
In 1946 , he became a naturalized citizen of Mexico .
Szeryng subsequently focused on teaching
before resuming his concert career in 1954 .
His debut in New York City brought him great acclaim ,
and he toured widely for the rest of his life .
He died in Kassel .
Szeryng made a number of recordings ,
including two of the complete sonatas and partitas for violin by Johann Sebastian Bach , and several of sonatas of Beethoven and Brahms with the pianist Arthur Rubinstein .
He also composed ;
his works include a number of violin concertos and pieces of chamber music .
He owned the Del Gesu " Le Duc " , the Stradivarius " King David " as well as the Messiah Strad copy by Jean-Baptiste Vuillaume
which he gave to Prince Rainier III of Monaco .
The " Le Duc " was the instrument
on which he performed
and recorded mostly ,
while the latter
( " King David " Strad )
was donated to the State of Israel .
"""
In [4]:
# Rebuild the short tree from its repr just to display it; the result is not kept.
# eval() is tolerable here only because the string is a literal defined in this notebook.
eval(INPUT_SHORT_PTREE_STR)
Out[4]:
In [5]:
# Rebuild the long tree from its repr just to display it; the result is not kept.
# eval() is tolerable here only because the string is a literal defined in this notebook.
eval(INPUT_LONG_PTREE_STR)
Out[5]:
In [ ]:
In [6]:
# Attach the EDU texts to the long tree. Every leaf is a numbered placeholder
# sitting under a 'text' node; that whole 'text' node (the leaf's parent) is
# replaced by the EDU line with the same running index.
ptree_long = eval(INPUT_LONG_PTREE_STR)
edus_long = INPUT_LONG_EDUS.splitlines()
for edu_index, placeholder_pos in enumerate(ptree_long.treepositions('leaves')):
    ptree_long[placeholder_pos[:-1]] = edus_long[edu_index]
ptree_long
Out[6]:
In [19]:
def make_tree(ptree_str, edu_str):
    """Build a tree from its repr string and insert the EDU texts into it.

    Each leaf of the evaluated tree is treated as a placeholder: the leaf's
    parent node is replaced by one line of ``edu_str``, the n-th leaf (in
    ``treepositions('leaves')`` order) receiving the n-th line.

    Parameters
    ----------
    ptree_str : str
        Python expression that evaluates to a tree, e.g. the repr of a
        ``ParentedTree``.
    edu_str : str
        The EDUs, one per line. Lines beyond the number of leaves are ignored.

    Returns
    -------
    The evaluated tree with the placeholder nodes replaced by EDU strings.

    Raises
    ------
    IndexError
        If ``edu_str`` has fewer lines than the tree has leaves.
    """
    edus = edu_str.splitlines()
    # SECURITY: eval() executes ptree_str as arbitrary Python code. Only pass
    # strings from a trusted source (such as the literals in this notebook).
    ptree = eval(ptree_str)

    leaf_positions = ptree.treepositions('leaves')
    # Fail before touching the tree, with a message that names the mismatch
    # (same exception type the plain list lookup would have raised).
    if len(edus) < len(leaf_positions):
        raise IndexError(
            "tree has {0} leaves but only {1} EDU lines were given".format(
                len(leaf_positions), len(edus)))

    for leaf_pos, edu in zip(leaf_positions, edus):
        # leaf_pos[:-1] addresses the leaf's parent: the parent node itself,
        # not just the placeholder leaf, is swapped for the EDU text.
        ptree[leaf_pos[:-1]] = edu
    return ptree
In [22]:
# Build the short example tree with its EDU texts attached, then display it.
ptree_short = make_tree(INPUT_SHORT_PTREE_STR, INPUT_SHORT_EDUS)
ptree_short
Out[22]:
In [9]:
# Split the tree positions of the long tree into leaves and everything else.
leaf_positions = set(ptree_long.treepositions('leaves'))
all_positions = set(ptree_long.treepositions())
non_leaf_positions = all_positions - leaf_positions
In [10]:
# Show the root label via the public accessor instead of the private attribute.
ptree_long.label()
Out[10]:
In [11]:
from collections import defaultdict
def transform(ptree, pos=None):
    """Relabel the subtree at ``pos`` with the relation between its children.

    Child labels are expected to have the form ``'nuclearity:relation'``,
    e.g. ``'nucleus:joint'`` or ``'satellite:elaboration'``. Every node with
    two or more children is given the relation name of its first satellite
    child or, if it has no satellite (all children are nuclei), of its first
    nucleus child. Children are processed recursively. Leaves and nodes with
    fewer than two children keep their label, as do nodes none of whose
    children carry a 'nucleus:'/'satellite:' label.

    The tree is modified in place.

    Parameters
    ----------
    ptree : tree offering ``label()``, ``set_label()`` and tuple indexing
        (e.g. an nltk ``Tree`` or ``ParentedTree``).
    pos : tuple of int, optional
        Tree position of the node to process; defaults to the root ``()``.

    Returns
    -------
    The node at ``pos``.
    """
    if pos is None:
        pos = ()
    pos = tuple(pos)
    node = ptree[pos]

    # Leaves (plain strings) and unary nodes express no relation of their own.
    if not hasattr(node, 'label') or len(node) < 2:
        return node

    relations = defaultdict(list)
    for index, child in enumerate(node):
        if not hasattr(child, 'label'):
            continue  # a plain leaf, nothing to read or to descend into
        child_label = child.label()
        # Read the child's label BEFORE descending: the recursive call may
        # replace it with a bare relation name. Labels without ':' (e.g. from
        # an earlier transform() run) simply contribute nothing.
        if ':' in child_label:
            nuclearity, relname = child_label.split(':', 1)
            relations[nuclearity].append(relname)
        # Positions are built from the index so plain Tree objects work too.
        transform(ptree, pos + (index,))

    # .get() keeps the defaultdict from growing empty entries.
    relnames = relations.get('satellite') or relations.get('nucleus')
    if relnames:
        # Label the node being processed (not the root of the whole tree).
        node.set_label(relnames[0])
    return node
In [23]:
# Display the short tree as it is before transform() is applied in the next cell.
ptree_short
Out[23]:
In [24]:
# transform() changes labels in place, so ptree_short itself is modified by this call.
transform(ptree_short)
Out[24]:
In [13]:
# transform() changes labels in place, so ptree_long itself is modified by this call.
transform(ptree_long)
Out[13]:
In [ ]: