notebook.community

Edit and run



In [1]:

    
import os
from operator import itemgetter
import educe
from educe import pdtb
import networkx as nx

import discoursegraphs as dg
from discoursegraphs.util import natural_sort_key, find_files



In [2]:

    
# corpus root directories (located below the user's home directory)
PDTB_ROOT_DIR = os.path.expanduser('~/corpora/pdtb_v2/data')
PTB_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')

# one PDTB / PTB file pair (WSJ document 0003) for quick experiments
PDTB_TEST_FILE = os.path.join(PDTB_ROOT_DIR, '00/wsj_0003.pdtb')
PTB_TEST_FILE = os.path.join(PTB_ROOT_DIR, '00/wsj_0003.mrg')



In [3]:

    
# PTB and PDTB files, each list ordered by file size (smallest file first)
sorted_ptb_files = sorted(find_files(PTB_ROOT_DIR, '*.mrg'),
                          key=os.path.getsize)
sorted_pdtb_files = sorted(find_files(PDTB_ROOT_DIR, '*.pdtb'),
                           key=os.path.getsize)



In [4]:

    
def wsjid2filepaths(wsj_id, ptb_root=PTB_ROOT_DIR, pdtb_root=PDTB_ROOT_DIR):
    """
    converts a PTB-WSJ document ID into PTB and PDTB file paths.

    Parameters
    ----------
    wsj_id : str or int
        the document ID, e.g. '0003' or 2257. Integers are zero-padded
        to four digits, so 3 is treated like '0003'. Strings are used
        as given.
    ptb_root : str
        root directory of the PTB files
    pdtb_root : str
        root directory of the PDTB files

    Returns
    -------
    (ptb_path, pdtb_path) : tuple of str
        ``<ptb_root>/<first two ID digits>/wsj_<ID>.mrg`` and
        ``<pdtb_root>/<first two ID digits>/wsj_<ID>.pdtb``
    """
    if isinstance(wsj_id, int):
        # File names in this notebook carry four-digit IDs (e.g. wsj_0003),
        # so a plain str() would produce a wrong name and a wrong
        # subdirectory for IDs below 1000.
        wsj_id = '{:04d}'.format(wsj_id)
    section = wsj_id[:2]
    ptb_path = os.path.join(ptb_root, section, 'wsj_{}.mrg'.format(wsj_id))
    pdtb_path = os.path.join(pdtb_root, section, 'wsj_{}.pdtb'.format(wsj_id))
    return ptb_path, pdtb_path



In [5]:

    
# some helper functions from the educe tutorial

def show_type(rel):
    """
    returns a short name for the type of a relation: the class name of
    ``rel`` without its last eight characters (i.e. without a trailing
    "Relation").
    """
    class_name = type(rel).__name__
    suffix_length = len("Relation")
    return class_name[:-suffix_length]

def highlight(astring, color=1):
    """
    wraps ``astring`` in ANSI escape sequences, so that it is shown as
    coloured text. ``color`` is inserted after the ``3`` of the opening
    escape sequence (default: 1).
    """
    ansi_template = "\x1b[3{color}m{str}\x1b[0m"
    return ansi_template.format(str=astring, color=color)

def display_rel(r):
    """
    pretty prints a relation instance: its first argument, a label
    describing the relation and its second argument.

    The label depends on the relation type (see ``show_type``):
    the highlighted connective head for "Explicit", the type name plus
    the highlighted first connective for "Implicit", the type name plus
    the highlighted first semantic class for "AltLex" and the bare type
    name for everything else.
    """
    kind = show_type(r)

    # fall back to the bare type name, unless a richer label is available
    label = kind
    if kind == "Explicit":
        label = highlight(r.connhead)
    elif kind == "Implicit":
        connective = highlight(str(r.connective1))
        label = "{rtype} {conn1}".format(rtype=kind, conn1=connective)
    elif kind == "AltLex":
        semclass = highlight(r.semclass1)
        label = "{rtype} {sem1}".format(rtype=kind, sem1=semclass)

    source_text = highlight(r.arg1.text, 2)
    target_text = highlight(r.arg2.text, 2)
    template = "{src}\n \t ---[{label}]---->\n \t\t\t{tgt}"
    # single-argument print(...) prints the same text as the print statement
    print(template.format(src=source_text, label=label, tgt=target_text))



In [6]:

    
def read_pdtb(pdtb_filepath):
    """
    parses a PDTB file with educe and returns the parse result
    (other cells of this notebook iterate over it to get the relations).
    """
    parsed_pdtb = educe.pdtb.parse.parse(pdtb_filepath)
    return parsed_pdtb

def get_subtree(docgraph, node_id):
    """
    returns the part of a document graph that is dominated by a node,
    i.e. the subgraph induced by all nodes that a breadth-first search
    starting at ``node_id`` visits.
    """
    dominated_nodes = nx.bfs_tree(docgraph, node_id).nodes()
    return docgraph.subgraph(dominated_nodes)

def gorn2node(docgraph, gorn_address):
    """
    given a document graph and a Gorn address, returns the (root) node ID
    of the subgraph/subtree the address points to.

    Parameters
    ----------
    docgraph : document graph
        must provide ``sentences`` (indexable by sentence index, yielding
        the node ID of a sentence) and ``neighbors(node_id)``
    gorn_address : educe.pdtb.parse.GornAddress or str
        either a GornAddress instance (its ``parts`` are used) or a string
        of dot-separated integers, e.g. '1.0'

    Returns
    -------
    node_id : the ID of the node the address points to. If the address
        consists of a sentence index only, this is the sentence node ID.

    Raises
    ------
    IndexError
        if the sentence index or one of the child indices is out of range
    """
    if isinstance(gorn_address, educe.pdtb.parse.GornAddress):
        gorn_ints = gorn_address.parts
    else:
        gorn_ints = [int(num) for num in gorn_address.split('.')]

    # The first number selects the sentence, all remaining numbers select
    # a child of the node reached so far. The sentence index must not be
    # applied a second time as a child index.
    sentence_index, child_indices = gorn_ints[0], gorn_ints[1:]
    current_node_id = docgraph.sentences[sentence_index]
    for child_index in child_indices:
        # children are put into natural sort order of their node IDs
        children = sorted(docgraph.neighbors(current_node_id),
                          key=natural_sort_key)
        current_node_id = children[child_index]
    return current_node_id

def gorn2subtree(docgraph, gorn_address):
    """
    given a document graph and a Gorn address, returns the
    subgraph/subtree the address points to, i.e. the subtree below the
    node that ``gorn2node`` finds for this address.
    """
    return get_subtree(docgraph, gorn2node(docgraph, gorn_address))



In [7]:

    
# for i, sent_root_node in enumerate(ptb_0003.sentences):
#     print i, sent_root_node
#     %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, sent_root_node))



In [8]:

    
# %dotstr dg.print_dot(gorn2subtree(ptb_0003, '3'))



In [9]:

    
# load the gvmagic IPython extension
# (presumably the provider of the %dotstr magic used in this notebook -- TODO confirm)
%load_ext gvmagic

def filepath2wsjid(filepath):
    """
    extracts the WSJ document ID from a file path: the part of the file
    name between its first and second underscore (or between its first
    underscore and its first dot), e.g. '/some/dir/wsj_2257.pdtb' -> '2257'.
    """
    file_name = os.path.basename(filepath)
    name_without_extensions = file_name.partition('.')[0]
    name_parts = name_without_extensions.split('_')
    return name_parts[1]

def pdtb_info(pdtb_file):
    """
    prints debugging information about a PDTB file and the PTB file that
    belongs to the same WSJ document.

    Prints the path of the PTB file and renders its graph. Then, for each
    relation in the PDTB file: prints its index and type, pretty prints it
    and prints the Gorn addresses of both arguments. Each Gorn address of
    the first argument is additionally resolved against the PTB graph
    (the resulting subtree is discarded). The addresses of the second
    argument are only printed.
    """
    # find the PTB file with the same WSJ document ID as the PDTB file
    wsj_id = filepath2wsjid(pdtb_file)
    ptb_file, _ = wsjid2filepaths(wsj_id)
    print ptb_file
    ptb_graph = dg.read_ptb(ptb_file)
    # NOTE(review): IPython line magic inside a function body. Whether the
    # magic can see the local variable ``ptb_graph`` depends on how the
    # magic evaluates its argument -- TODO confirm.
    %dotstr dg.print_dot(ptb_graph)
    
    educe_pdtb = read_pdtb(pdtb_file)
    for i, rel in enumerate(educe_pdtb):
        print i, type(rel)
        display_rel(rel)
        print 'arg1: ', rel.arg1.gorn
        for gorn_address in rel.arg1.gorn:
#             %dotstr dg.print_dot(gorn2subtree(pdtb_file, gorn_address))
#             gorn2subtree(pdtb_file, gorn_address)
            # only checks that the address can be resolved; result is unused
            gorn2subtree(ptb_graph, gorn_address)
        print 'arg2: ', rel.arg2.gorn
        # empty line between two relations
        print



In [10]:

    
# find the shortest PDTB file that contains a gorn address w/ > 1 parts
def find_shortest_pdtb(sorted_pdtb_files):
    """
    finds the first PDTB file that contains a relation with a Gorn address
    of more than one part and prints information about that relation.

    Parameters
    ----------
    sorted_pdtb_files : iterable of str
        paths of PDTB files. The files are searched in the given order, so
        they must be sorted by size (ascending) for the result to be the
        shortest matching file.

    Returns
    -------
    None. The path of the file, the index and type of the relation and the
    Gorn addresses of both arguments are printed, followed by a pretty
    printed version of the relation. Nothing is printed if no file
    contains a matching relation.
    """
    # the files are not enumerated: a file index was never used and it
    # shared its name with the relation index of the inner loop
    for pdtb_file in sorted_pdtb_files:
        for rel_index, rel in enumerate(read_pdtb(pdtb_file)):
            for arg in (rel.arg1, rel.arg2):
                for address in arg.gorn:
                    if len(address.parts) > 1:
                        # str.format produces the same text as a print
                        # statement with several arguments
                        print(pdtb_file)
                        print('{} {}'.format(rel_index, type(rel)))
                        print('{} {}'.format(rel.arg1.gorn, rel.arg2.gorn))
                        return display_rel(rel)



In [11]:

    
# search the PDTB files in order of ascending file size
find_shortest_pdtb(sorted_pdtb_files)









    



/home/arne/corpora/pdtb_v2/data/22/wsj_2257.pdtb
0 <class 'educe.pdtb.parse.ImplicitRelation'>
[23.0] [23.2]
These rate indications aren't directly comparable
 	 ---[Implicit Connective(because | Contingency.Cause.Reason)]---->
 			lending practices vary widely by location



In [12]:

    
# paths of the PTB and PDTB files of WSJ document 2257
ptb_2257, pdtb_2257 = wsjid2filepaths(2257)



In [15]:

    
# read the PTB file of WSJ document 2257 and render its graph
%dotstr dg.print_dot(dg.read_ptb(ptb_2257))



In [ ]:

    
# gorn2subtree(ptb_0003, '1.0')

# %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, 118))

# ptb_0003.neighbors(118)



In [ ]:

    
# paths of the PTB and PDTB files of WSJ document 0001
ptb_0001path, pdtb_0001path = wsjid2filepaths('0001')

# PTB file as a document graph, PDTB file as parsed by educe
ptb_0001 = dg.read_ptb(ptb_0001path)
pdtb_0001 = read_pdtb(pdtb_0001path)



In [ ]:

    
# pdtb_info(wsjid2filepaths('0004')[1])

TODO: repair node order in write_dot / print_dot for good



In [ ]:

    
# %dotstr dg.print_dot(nx.bfs_tree(ptb_0003, 118))



In [ ]:

    
# %dotstr dg.print_dot(ptb_0003)



In [ ]:

    
from discoursegraphs.util import find_files

def parse_corpus(corpus_dir=PDTB_ROOT_DIR):
    """
    parses every '*.pdtb' file that ``find_files`` yields for
    ``corpus_dir`` with educe. The parse results are discarded and
    nothing is returned.
    """
    for corpus_file in find_files(corpus_dir, '*.pdtb'):
        pdtb.parse.parse(corpus_file)



In [ ]:

    
# %time [pdtb.parse.parse(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')] # 1m45s



In [ ]:

    
# %timeit parse_corpus() #1 loops, best of 3: 1min 39s per loop



In [ ]:

    
# pdtb_reader = pdtb.Reader(PDTB_ROOT_DIR)



In [ ]:

    
# len(pdtb_reader.files()) # 2159



In [ ]:

    
# pdtb_corpys = pdtb_reader.slurp()



In [ ]:

    
from math import sqrt
from joblib import Parallel, delayed



In [ ]:

    
def parse_pdtb_file(pdtb_file):
    """
    parses a single PDTB file with educe. The parse result is discarded
    and nothing is returned.
    """
    pdtb.parse.parse(pdtb_file)
    return None



In [ ]:

    
# Parallel(n_jobs=4)(delayed(parse_pdtb_file)(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')) # 49.6s