In [14]:
from nltk.tree import ParentedTree
import discoursegraphs as dg
In [4]:
# Fixture for the short input: the path of its .parsetree file and a
# ParentedTree repr string (presumably that file's contents -- TODO confirm).
INPUT_SHORT_FILE = '/tmp/input_short.txt.parsetree'
INPUT_SHORT_STR = """ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])])"""
In [6]:
# Path of the .parsetree file for the long input.
INPUT_LONG_FILE = "/tmp/input_long.txt.parsetree"
INPUT_LONG_STR = """ParentedTree('NN-textualorganization', [ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])]), ParentedTree('EDU', ['3'])]), ParentedTree('NS-circumstance', [ParentedTree('EDU', ['4']), ParentedTree('NS-elaboration', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['5']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['6']), ParentedTree('EDU', ['7'])])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['8']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['9']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['10']), ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['11']), ParentedTree('NS-purpose', [ParentedTree('EDU', ['12']), ParentedTree('EDU', ['13'])])]), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['14']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['15']), ParentedTree('NN-list', [ParentedTree('EDU', ['16']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['17']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['18']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['19']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['20']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['21']), ParentedTree('NS-elaboration', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['22']), ParentedTree('EDU', ['23'])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['24']), ParentedTree('NS-temporal', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['25']), ParentedTree('EDU', ['26'])]), ParentedTree('EDU', ['27'])])])])])])])])])])])])])])])])])])])"""
In [8]:
# Build the tree object from its repr string. eval() resolves the name
# ParentedTree from the notebook namespace, so the nltk import must have run.
eval(INPUT_SHORT_STR)
Out[8]:
In [7]:
# Build the tree object from its repr string. eval() resolves the name
# ParentedTree from the notebook namespace, so the nltk import must have run.
eval(INPUT_LONG_STR)
Out[7]:
Henryk Szeryng (22 September 1918 - 8 March 1988) was a violin virtuoso of Polish and Jewish heritage.
He was born in Zelazowa Wola, Poland. Henryk started piano and harmony training with his mother when he was 5, and at age 7 turned to the violin, receiving instruction from Maurice Frenkel.
After studies with Carl Flesch in Berlin (1929-32), he went to Paris to continue his training with Jacques Thibaud at the Conservatory, graduating with a premier prix in 1937.
In [12]:
# Contents of input_medium.txt.parsetree: a ParentedTree repr in which every
# leaf is an EDU ID given as a string ('1' to '7').
INPUT_MEDIUM_PARSETREE = """ParentedTree('NS-elaboration', [ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])]), ParentedTree('EDU', ['3'])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['4']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['5']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['6']), ParentedTree('EDU', ['7'])])])])])"""
In [13]:
# Build the tree object from its repr string. eval() resolves the name
# ParentedTree from the notebook namespace, so the nltk import must have run.
eval(INPUT_MEDIUM_PARSETREE)
Out[13]:
In [9]:
# Contents of input_medium.txt.brackets, one tuple per line.
# NOTE(review): each tuple looks like ((first EDU, last EDU), nuclearity,
# relation) -- confirm against the tool that writes this file.
INPUT_MEDIUM_BRACKETS = """((1, 1), 'Nucleus', 'span')
((2, 2), 'Satellite', 'elaboration')
((1, 2), 'Nucleus', 'same_unit')
((3, 3), 'Nucleus', 'same_unit')
((1, 3), 'Nucleus', 'span')
((4, 4), 'Nucleus', 'span')
((5, 5), 'Nucleus', 'span')
((6, 6), 'Nucleus', 'span')
((7, 7), 'Satellite', 'elaboration')
((6, 7), 'Satellite', 'elaboration')
((5, 7), 'Satellite', 'elaboration')
((4, 7), 'Satellite', 'elaboration')
"""
In [10]:
# input_medium.txt.merge
INPUT_MEDIUM_MERGE = """0 1 Henryk Henryk NNP compound 4 PERSON (ROOT (FRAG (NP (NP (NNP Henryk) 1
0 2 Szeryng Szeryng NNP compound 4 PERSON (NNP Szeryng) 1
0 3 (22 (22 CD nummod 4 NUMBER (CD (22) 1
0 4 September September NNP root 0 DATE (NNP September)) 1
0 5 1918 1918 CD nummod 7 DATE (SBAR (S (NP (NP (CD 1918) 1
0 6 - - : punct 7 O (: -) 2
0 7 8 8 CD nsubj 12 DATE (CD 8)) 2
0 8 March March NNP nmod:tmod 7 DATE (NP-TMP (NNP March) 2
0 9 1988) 1988) CD nummod 8 DATE (CD 1988)))) 2
0 10 was be VBD cop 12 O (VP (VBD was) 3
0 11 a a DT det 12 O (NP (NP (DT a) 3
0 12 violin violin NN acl:relcl 4 O (NN violin) 3
0 13 virtuoso virtuoso JJ amod 12 O (JJ virtuoso)) 3
0 14 of of IN case 18 O (PP (IN of) 3
0 15 Polish polish JJ amod 18 MISC (NP (JJ Polish) 3
0 16 and and CC cc 15 O (CC and) 3
0 17 Jewish jewish JJ conj 15 MISC (JJ Jewish) 3
0 18 heritage. heritage. NN nmod 12 O (NN heritage.)))))))))) 3
1 1 He he PRP nsubjpass 3 O (ROOT (S (NP (PRP He)) 4
1 2 was be VBD auxpass 3 O (VP (VBD was) 4
1 3 born bear VBN root 0 O (VP (VBN born) 4
1 4 in in IN case 8 O (PP (IN in) 4
1 5 Zelazowa Zelazowa NNP compound 8 LOCATION (NP (NP (NNP Zelazowa) 4
1 6 Wola, Wola, NNP compound 8 O (NNP Wola,) 4
1 7 Poland. Poland. NNP compound 8 O (NNP Poland.) 4
1 8 Henryk Henryk NNP nmod 3 PERSON (NNP Henryk)) 4
1 9 started start VBD acl:relcl 8 O (SBAR (S (VP (VP (VBD started) 4
1 10 piano piano NN compound 13 O (NP (NN piano) 4
1 11 and and CC cc 10 O (CC and) 4
1 12 harmony harmony NN conj 10 O (NN harmony) 4
1 13 training training NN dobj 9 O (NN training)) 4
1 14 with with IN case 16 O (PP (IN with) 4
1 15 his he PRP$ nmod:poss 16 O (NP (PRP$ his) 4
1 16 mother mother NN nmod 9 O (NN mother))) 4
1 17 when when WRB advmod 20 O (SBAR (WHADVP (WRB when)) 5
1 18 he he PRP nsubj 20 O (S (NP (PRP he)) 5
1 19 was be VBD cop 20 O (VP (VBD was) 5
1 20 5, 5, CD advcl 9 NUMBER (NP (CD 5,)))))) 5
1 21 and and CC cc 9 O (CC and) 5
1 22 at at IN case 23 O (VP (PP (IN at) 5
1 23 age age NN nmod 25 O (NP (NN age) 5
1 24 7 7 CD nummod 23 NUMBER (CD 7))) 5
1 25 turned turn VBD conj 9 O (VBD turned) 5
1 26 to to TO case 28 O (PP (TO to) 5
1 27 the the DT det 28 O (NP (DT the) 5
1 28 violin, violin, NN nmod 25 O (NN violin,))) 5
1 29 receiving receive VBG xcomp 25 O (S (VP (VBG receiving) 5
1 30 instruction instruction NN dobj 29 O (NP (NN instruction)) 5
1 31 from from IN case 33 O (PP (IN from) 5
1 32 Maurice Maurice NNP compound 33 O (NP (NP (NNP Maurice) 5
1 33 Frenkel. Frenkel. NNP nmod 29 O (NNP Frenkel.)) 5
1 34 After after IN case 35 O (SBAR (S (PP (IN After) 5
1 35 studies study NNS nmod 43 O (NP (NP (NNS studies)) 5
1 36 with with IN case 38 O (PP (IN with) 5
1 37 Carl Carl NNP compound 38 PERSON (NP (NP (NNP Carl) 5
1 38 Flesch Flesch NNP nmod 35 PERSON (NNP Flesch)) 5
1 39 in in IN case 40 O (PP (IN in) 5
1 40 Berlin Berlin NNP nmod 38 LOCATION (NP (NNP Berlin) 5
1 41 (1929-32), (1929-32), CD nummod 40 NUMBER (CD (1929-32),))))))) 5
1 42 he he PRP nsubj 43 O (NP (PRP he)) 5
1 43 went go VBD acl:relcl 33 O (VP (VBD went) 5
1 44 to to TO case 45 O (PP (TO to) 5
1 45 Paris Paris NNP nmod 43 LOCATION (NP (NNP Paris))) 5
1 46 to to TO mark 47 O (S (VP (TO to) 5
1 47 continue continue VB xcomp 43 O (VP (VB continue) 5
1 48 his he PRP$ nmod:poss 49 O (NP (PRP$ his) 5
1 49 training training NN dobj 47 O (NN training)) 5
1 50 with with IN case 52 O (PP (IN with) 6
1 51 Jacques Jacques NNP compound 52 PERSON (NP (NNP Jacques) 6
1 52 Thibaud Thibaud NNP nmod 47 PERSON (NNP Thibaud))) 6
1 53 at at IN case 55 O (PP (IN at) 6
1 54 the the DT det 55 O (NP (NP (DT the) 6
1 55 Conservatory, conservatory, NN nmod 47 O (NN Conservatory,)) 6
1 56 graduating graduate VBG acl 55 O (VP (VBG graduating) 7
1 57 with with IN case 60 O (PP (IN with) 7
1 58 a a DT det 60 O (NP (NP (DT a) 7
1 59 premier premier NN compound 60 O (NN premier) 7
1 60 prix prix NN nmod 56 O (NN prix)) 7
1 61 in in IN case 62 O (PP (IN in) 7
1 62 1937. 1937. CD nmod 60 DATE (NP (CD 1937.)))))))))))))))))))))))))))) 7
"""
In [48]:
# Load the .merge file as a list of lines and display the first one to
# inspect how a single row is laid out.
with open('/tmp/input_medium.txt.merge', 'r') as medium_file:
    medium = list(medium_file)
medium[0]
Out[48]:
In [50]:
from collections import defaultdict
def extract_edus(dplp_merge_filepath):
    """Group the tokens of a DPLP ``.merge`` file by the EDU they belong to.

    Every non-blank line is split on tab characters; the third field is
    taken as the token and the tenth field as the integer ID of its EDU.

    Parameters
    ----------
    dplp_merge_filepath : str
        path to a DPLP ``.merge`` output file

    Returns
    -------
    edus : defaultdict(list)
        maps each EDU ID (int) to the list of its tokens (str), in the
        order in which they occur in the file
    """
    edus = defaultdict(list)
    with open(dplp_merge_filepath, 'r') as merge_file:
        for row in merge_file:
            if not row.strip():
                continue  # blank lines carry no token
            columns = row.split('\t')
            # int() also tolerates the trailing newline of the last column
            edus[int(columns[9])].append(columns[2])
    return edus
In [52]:
# Map each EDU ID of the medium input to its list of tokens.
edu_dict = extract_edus('/tmp/input_medium.txt.merge')
In [80]:
# Build the tree for the medium input from its repr string and display it.
# At this point its leaves are still EDU IDs ('1' to '7').
medium_tree = eval(INPUT_MEDIUM_PARSETREE)
medium_tree
Out[80]:
In [76]:
# Tree positions of all leaves, i.e. of the EDU ID strings.
leaf_positions = medium_tree.treepositions('leaves')
In [77]:
# Overwrite the node directly above each leaf with the space-joined tokens
# of the EDU that the leaf names. This modifies medium_tree in place.
for position in leaf_positions:
    edu_text = u" ".join(edu_dict[int(medium_tree[position])])
    # dropping the last index of a leaf position addresses its parent node
    medium_tree[position[:-1]] = edu_text
In [78]:
# Display the current state of the tree.
medium_tree
Out[78]:
In [82]:
def add_edus(dplp_tree, edus=None):
    """Replace the EDU nodes of a DPLP parse tree with the text of their EDUs.

    Every leaf of the tree is expected to be an EDU ID given as a string
    (e.g. ``'1'``). The node directly above each leaf is overwritten with
    the space-joined tokens of that EDU.

    Parameters
    ----------
    dplp_tree : tree with EDU IDs as leaves (e.g. an nltk ``ParentedTree``)
        the tree to fill in; it is MODIFIED IN PLACE
    edus : dict from EDU IDs (int) to words (list(str)), optional
        token lookup, e.g. the result of ``extract_edus()``. If omitted,
        the notebook-level variable ``edu_dict`` is used, which was the
        only behaviour before this parameter existed.

    Returns
    -------
    dplp_tree : the same tree object that was passed in
    """
    if edus is None:
        # backward compatibility: fall back on the notebook's global mapping
        edus = edu_dict

    # Materialise the positions before the loop, since the loop edits the tree.
    leaf_positions = list(dplp_tree.treepositions('leaves'))
    for leaf_pos in leaf_positions:
        edu_id = int(dplp_tree[leaf_pos])
        # dropping the last index of a leaf position addresses its parent node
        parent_pos = leaf_pos[:-1]
        dplp_tree[parent_pos] = u" ".join(edus[edu_id])
    return dplp_tree
In [89]:
# Build a fresh tree from the repr string; its leaves are EDU IDs again.
# NOTE(review): presumably needed because the EDU substitution modifies a
# tree in place -- confirm.
medium_tree = eval(INPUT_MEDIUM_PARSETREE)
# add_edus(medium_tree)
In [88]:
from copy import deepcopy
In [ ]: