In [14]:
from nltk.tree import ParentedTree
import discoursegraphs as dg

In [4]:
# Path of the parse-tree file for the short example.
# NOTE(review): not referenced by any other cell visible in this notebook.
INPUT_SHORT_FILE = '/tmp/input_short.txt.parsetree'

# Python expression (repr-style) that builds an nltk ParentedTree whose leaves
# are EDU IDs; it is turned into a tree object with eval().
# NOTE(review): presumably the content of INPUT_SHORT_FILE -- TODO confirm.
INPUT_SHORT_STR = """ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])])"""

In [6]:
# Path of the parse-tree file for the long example.
# NOTE(review): not referenced by any other cell visible in this notebook.
INPUT_LONG_FILE = "/tmp/input_long.txt.parsetree"

# Python expression (repr-style) that builds an nltk ParentedTree with 27 EDU
# leaves; it is turned into a tree object with eval().
# NOTE(review): presumably the content of INPUT_LONG_FILE -- TODO confirm.
INPUT_LONG_STR = """ParentedTree('NN-textualorganization', [ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])]), ParentedTree('EDU', ['3'])]), ParentedTree('NS-circumstance', [ParentedTree('EDU', ['4']), ParentedTree('NS-elaboration', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['5']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['6']), ParentedTree('EDU', ['7'])])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['8']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['9']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['10']), ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['11']), ParentedTree('NS-purpose', [ParentedTree('EDU', ['12']), ParentedTree('EDU', ['13'])])]), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['14']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['15']), ParentedTree('NN-list', [ParentedTree('EDU', ['16']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['17']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['18']), ParentedTree('NN-textualorganization', [ParentedTree('EDU', ['19']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['20']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['21']), ParentedTree('NS-elaboration', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['22']), ParentedTree('EDU', ['23'])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['24']), ParentedTree('NS-temporal', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['25']), ParentedTree('EDU', ['26'])]), ParentedTree('EDU', ['27'])])])])])])])])])])])])])])])])])])])"""

In [8]:
# Build (and display) the short tree from its repr string. eval() executes
# arbitrary code; that is tolerable here only because the string is a constant
# defined in this notebook -- never use it on externally supplied text.
eval(INPUT_SHORT_STR)


Out[8]:

In [7]:
# Build (and display) the long tree from its repr string. eval() executes
# arbitrary code; that is tolerable here only because the string is a constant
# defined in this notebook -- never use it on externally supplied text.
eval(INPUT_LONG_STR)


Out[7]:

input_medium.txt

Henryk Szeryng (22 September 1918 - 8 March 1988) was a violin virtuoso of Polish and Jewish heritage.

He was born in Zelazowa Wola, Poland. Henryk started piano and harmony training with his mother when he was 5, and at age 7 turned to the violin, receiving instruction from Maurice Frenkel.

After studies with Carl Flesch in Berlin (1929-32), he went to Paris to continue his training with Jacques Thibaud at the Conservatory, graduating with a premier prix in 1937.


In [12]:
# Content of input_medium.txt.parsetree: a Python expression (repr-style) that
# builds an nltk ParentedTree whose leaves are the EDU IDs '1' to '7'.
INPUT_MEDIUM_PARSETREE = """ParentedTree('NS-elaboration', [ParentedTree('NN-same_unit', [ParentedTree('NS-elaboration', [ParentedTree('EDU', ['1']), ParentedTree('EDU', ['2'])]), ParentedTree('EDU', ['3'])]), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['4']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['5']), ParentedTree('NS-elaboration', [ParentedTree('EDU', ['6']), ParentedTree('EDU', ['7'])])])])])"""

In [13]:
# Build (and display) the medium tree from its repr string. eval() executes
# arbitrary code; that is tolerable here only because the string is a constant
# defined in this notebook -- never use it on externally supplied text.
eval(INPUT_MEDIUM_PARSETREE)


Out[13]:

In [9]:
# Content of input_medium.txt.brackets, one tuple literal per line.
# NOTE(review): each line looks like ((first EDU, last EDU), nuclearity,
# relation) -- TODO confirm against the DPLP documentation.
INPUT_MEDIUM_BRACKETS = """((1, 1), 'Nucleus', 'span')
((2, 2), 'Satellite', 'elaboration')
((1, 2), 'Nucleus', 'same_unit')
((3, 3), 'Nucleus', 'same_unit')
((1, 3), 'Nucleus', 'span')
((4, 4), 'Nucleus', 'span')
((5, 5), 'Nucleus', 'span')
((6, 6), 'Nucleus', 'span')
((7, 7), 'Satellite', 'elaboration')
((6, 7), 'Satellite', 'elaboration')
((5, 7), 'Satellite', 'elaboration')
((4, 7), 'Satellite', 'elaboration')
"""

In [10]:
#input_medium.txt.merge
INPUT_MEDIUM_MERGE = """0       1       Henryk  Henryk  NNP     compound        4       PERSON   (ROOT (FRAG (NP (NP (NNP Henryk)       1
0       2       Szeryng Szeryng NNP     compound        4       PERSON   (NNP Szeryng)  1
0       3       (22     (22     CD      nummod  4       NUMBER   (CD (22)       1
0       4       September       September       NNP     root    0       DATE     (NNP September))       1
0       5       1918    1918    CD      nummod  7       DATE     (SBAR (S (NP (NP (CD 1918)     1        
0       6       -       -       :       punct   7       O        (: -)  2        
0       7       8       8       CD      nsubj   12      DATE     (CD 8))        2        
0       8       March   March   NNP     nmod:tmod       7       DATE     (NP-TMP (NNP March)    2
0       9       1988)   1988)   CD      nummod  8       DATE     (CD 1988))))   2                
0       10      was     be      VBD     cop     12      O        (VP (VBD was)  3
0       11      a       a       DT      det     12      O        (NP (NP (DT a) 3                        
0       12      violin  violin  NN      acl:relcl       4       O        (NN violin)    3
0       13      virtuoso        virtuoso        JJ      amod    12      O        (JJ virtuoso)) 3
0       14      of      of      IN      case    18      O        (PP (IN of)    3
0       15      Polish  polish  JJ      amod    18      MISC     (NP (JJ Polish)        3        
0       16      and     and     CC      cc      15      O        (CC and)       3        
0       17      Jewish  jewish  JJ      conj    15      MISC     (JJ Jewish)    3
0       18      heritage.       heritage.       NN      nmod    12      O        (NN heritage.))))))))))        3
                                                                                                         
1       1       He      he      PRP     nsubjpass       3       O        (ROOT (S (NP (PRP He)) 4        
1       2       was     be      VBD     auxpass 3       O        (VP (VBD was)  4
1       3       born    bear    VBN     root    0       O        (VP (VBN born) 4
1       4       in      in      IN      case    8       O        (PP (IN in)    4        
1       5       Zelazowa        Zelazowa        NNP     compound        8       LOCATION         (NP (NP (NNP Zelazowa) 4
1       6       Wola,   Wola,   NNP     compound        8       O        (NNP Wola,)    4
1       7       Poland. Poland. NNP     compound        8       O        (NNP Poland.)  4                        
1       8       Henryk  Henryk  NNP     nmod    3       PERSON   (NNP Henryk))  4
1       9       started start   VBD     acl:relcl       8       O        (SBAR (S (VP (VP (VBD started) 4
1       10      piano   piano   NN      compound        13      O        (NP (NN piano) 4
1       11      and     and     CC      cc      10      O        (CC and)       4
1       12      harmony harmony NN      conj    10      O        (NN harmony)   4
1       13      training        training        NN      dobj    9       O        (NN training)) 4
1       14      with    with    IN      case    16      O        (PP (IN with)  4
1       15      his     he      PRP$    nmod:poss       16      O        (NP (PRP$ his) 4
1       16      mother  mother  NN      nmod    9       O        (NN mother)))  4
1       17      when    when    WRB     advmod  20      O        (SBAR (WHADVP (WRB when))      5
1       18      he      he      PRP     nsubj   20      O        (S (NP (PRP he))       5
1       19      was     be      VBD     cop     20      O        (VP (VBD was)  5
1       20      5,      5,      CD      advcl   9       NUMBER   (NP (CD 5,))))))       5
1       21      and     and     CC      cc      9       O        (CC and)       5
1       22      at      at      IN      case    23      O        (VP (PP (IN at)        5
1       23      age     age     NN      nmod    25      O        (NP (NN age)   5
1       24      7       7       CD      nummod  23      NUMBER   (CD 7)))       5
1       25      turned  turn    VBD     conj    9       O        (VBD turned)   5
1       26      to      to      TO      case    28      O        (PP (TO to)    5
1       27      the     the     DT      det     28      O        (NP (DT the)   5
1       28      violin, violin, NN      nmod    25      O        (NN violin,))) 5
1       29      receiving       receive VBG     xcomp   25      O        (S (VP (VBG receiving) 5
1       30      instruction     instruction     NN      dobj    29      O        (NP (NN instruction))  5
1       31      from    from    IN      case    33      O        (PP (IN from)  5
1       32      Maurice Maurice NNP     compound        33      O        (NP (NP (NNP Maurice)  5
1       33      Frenkel.        Frenkel.        NNP     nmod    29      O        (NNP Frenkel.))        5
1       34      After   after   IN      case    35      O        (SBAR (S (PP (IN After)        5
1       35      studies study   NNS     nmod    43      O        (NP (NP (NNS studies)) 5
1       36      with    with    IN      case    38      O        (PP (IN with)  5
1       37      Carl    Carl    NNP     compound        38      PERSON   (NP (NP (NNP Carl)     5
1       38      Flesch  Flesch  NNP     nmod    35      PERSON   (NNP Flesch))  5
1       39      in      in      IN      case    40      O        (PP (IN in)    5
1       40      Berlin  Berlin  NNP     nmod    38      LOCATION         (NP (NNP Berlin)       5
1       41      (1929-32),      (1929-32),      CD      nummod  40      NUMBER   (CD (1929-32),)))))))  5
1       42      he      he      PRP     nsubj   43      O        (NP (PRP he))  5
1       43      went    go      VBD     acl:relcl       33      O        (VP (VBD went) 5
1       44      to      to      TO      case    45      O        (PP (TO to)    5
1       45      Paris   Paris   NNP     nmod    43      LOCATION         (NP (NNP Paris)))      5
1       46      to      to      TO      mark    47      O        (S (VP (TO to) 5
1       47      continue        continue        VB      xcomp   43      O        (VP (VB continue)      5
1       48      his     he      PRP$    nmod:poss       49      O        (NP (PRP$ his) 5
1       49      training        training        NN      dobj    47      O        (NN training)) 5
1       50      with    with    IN      case    52      O        (PP (IN with)  6
1       51      Jacques Jacques NNP     compound        52      PERSON   (NP (NNP Jacques)      6
1       52      Thibaud Thibaud NNP     nmod    47      PERSON   (NNP Thibaud)))        6
1       53      at      at      IN      case    55      O        (PP (IN at)    6
1       54      the     the     DT      det     55      O        (NP (NP (DT the)       6
1       55      Conservatory,   conservatory,   NN      nmod    47      O        (NN Conservatory,))    6
1       56      graduating      graduate        VBG     acl     55      O        (VP (VBG graduating)   7
1       57      with    with    IN      case    60      O        (PP (IN with)  7
1       58      a       a       DT      det     60      O        (NP (NP (DT a) 7
1       59      premier premier NN      compound        60      O        (NN premier)   7
1       60      prix    prix    NN      nmod    56      O        (NN prix))     7
1       61      in      in      IN      case    62      O        (PP (IN in)    7
1       62      1937.   1937.   CD      nmod    60      DATE     (NP (CD 1937.))))))))))))))))))))))))))))      7

"""

In [48]:
# Load the DPLP .merge output into memory, one list element per line
# (line terminators are kept).
with open('/tmp/input_medium.txt.merge', 'r') as medium_file:
    medium = list(medium_file)

# Show the first row to inspect the tab-separated column layout.
medium[0]


Out[48]:
'0\t1\tHenryk\tHenryk\tNNP\tcompound\t4\tPERSON\t (ROOT (FRAG (NP (NP (NNP Henryk)\t1\n'

In [50]:
from collections import defaultdict


def extract_edus(dplp_merge_filepath):
    """Collect the tokens of each EDU from a DPLP .merge output file.

    Every non-blank line of the file is split on tabs; the third column
    (index 2) is taken as the token and the tenth column (index 9) as the
    ID of the EDU the token belongs to. Blank / whitespace-only lines are
    skipped.

    Parameters
    ----------
    dplp_merge_filepath : str
        path to the .merge file

    Returns
    -------
    edus : defaultdict(list) from EDU IDs (int) to words (list(str)),
        with the words in file order
    """
    edus = defaultdict(list)
    with open(dplp_merge_filepath, 'r') as merge_file:
        for row in merge_file:
            if not row.strip():
                continue  # separator line between sentences
            columns = row.split('\t')
            word = columns[2]
            # int() tolerates the trailing newline of the last column
            edus[int(columns[9])].append(word)
    return edus

In [52]:
# Map EDU IDs (int) to their token lists. This is a notebook-level global:
# the leaf-replacement loop and add_edus() in later cells read it by name.
edu_dict = extract_edus('/tmp/input_medium.txt.merge')

In [80]:
# Create a fresh tree object from the repr string (a trusted notebook constant,
# hence eval) and display it.
medium_tree = eval(INPUT_MEDIUM_PARSETREE)
medium_tree


Out[80]:

In [76]:
# Tree positions of all leaves of medium_tree, i.e. of the EDU ID strings.
leaf_positions = medium_tree.treepositions('leaves')

In [77]:
# Replace every EDU node of medium_tree by the text of the EDU it refers to.
# This mutates medium_tree in place.
# NOTE(review): presumably not re-runnable without re-creating medium_tree
# first, since the leaves are no longer EDU IDs afterwards -- TODO confirm.
for leaf_pos in leaf_positions:
    # the leaf is the EDU ID as a string, e.g. '1'
    edu_id = int(medium_tree[leaf_pos])
    edu_tokens = edu_dict[edu_id]
    # dropping the last index yields the position of the node above the leaf
    parent_pos = leaf_pos[:-1]
    # overwrite that whole node with the space-joined tokens
    medium_tree[parent_pos] = u" ".join(edu_tokens)

In [78]:
medium_tree


Out[78]:

In [82]:
def add_edus(dplp_tree, edus=None):
    """Replace the EDU nodes of a DPLP parse tree with the EDU texts.

    Each leaf of ``dplp_tree`` must be an EDU ID that ``int()`` can parse.
    The node directly above each leaf is overwritten with the tokens of that
    EDU, joined by single spaces.

    Parameters
    ----------
    dplp_tree : tree object (e.g. nltk.tree.ParentedTree)
        must support ``treepositions('leaves')`` as well as reading and
        assigning items by tree position (tuple of indices).
        The tree is modified in place.
    edus : dict from EDU IDs (int) to words (list(str)), optional
        the mapping to take the EDU tokens from, e.g. the result of
        ``extract_edus()``. If omitted, the notebook-level global
        ``edu_dict`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    dplp_tree : the same tree object that was passed in
    """
    if edus is None:
        # Backward-compatible fallback to the global defined in the notebook.
        edus = edu_dict

    # Snapshot the leaf positions before mutating the tree.
    leaf_positions = list(dplp_tree.treepositions('leaves'))

    for leaf_pos in leaf_positions:
        edu_id = int(dplp_tree[leaf_pos])
        # Dropping the last index yields the position of the node above the leaf.
        parent_pos = leaf_pos[:-1]
        dplp_tree[parent_pos] = u" ".join(edus[edu_id])
    return dplp_tree

In [89]:
# Re-create medium_tree from the repr string (a trusted notebook constant,
# hence eval), so that its leaves are EDU IDs again.
medium_tree = eval(INPUT_MEDIUM_PARSETREE)
# The call below is left disabled; add_edus() would modify medium_tree in place.
# add_edus(medium_tree)

In [88]:
from copy import deepcopy

In [ ]: