In [1]:
import os
from operator import itemgetter
import educe
from educe import pdtb
import networkx as nx
import discoursegraphs as dg
from discoursegraphs.util import natural_sort_key, find_files
In [2]:
# Root directories of the two corpora ('~' is expanded to the user's home).
PDTB_ROOT_DIR = os.path.expanduser('~/corpora/pdtb_v2/data')
PTB_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')

# Document wsj_0003 (section 00) in each corpus.
PDTB_TEST_FILE = os.path.join(PDTB_ROOT_DIR, '00/wsj_0003.pdtb')
PTB_TEST_FILE = os.path.join(PTB_ROOT_DIR, '00/wsj_0003.mrg')
In [3]:
# All PTB (*.mrg) and PDTB (*.pdtb) files below the corpus roots,
# ordered by file size (smallest file first).
sorted_ptb_files = sorted(find_files(PTB_ROOT_DIR, '*.mrg'), key=os.path.getsize)
sorted_pdtb_files = sorted(find_files(PDTB_ROOT_DIR, '*.pdtb'), key=os.path.getsize)
In [4]:
def wsjid2filepaths(wsj_id, ptb_root=PTB_ROOT_DIR, pdtb_root=PDTB_ROOT_DIR):
    """Convert a PTB-WSJ document ID into PTB and PDTB file paths.

    wsj_id -- the document ID, either as a string (e.g. '0003') or as an
        int (e.g. 2257). Integers are zero-padded to four digits, so 3 is
        treated like '0003'.
    ptb_root -- root directory of the PTB corpus
    pdtb_root -- root directory of the PDTB corpus

    Returns a (ptb_filepath, pdtb_filepath) tuple. The first two characters
    of the ID select the sub-directory below each root.
    """
    if isinstance(wsj_id, int):
        # A plain str() would drop leading zeros (3 -> '3'), which yields the
        # wrong sub-directory and file name for documents below 1000.
        wsj_id = '{:04d}'.format(wsj_id)
    section = wsj_id[:2]
    ptb_path = os.path.join(ptb_root, section, 'wsj_{}.mrg'.format(wsj_id))
    pdtb_path = os.path.join(pdtb_root, section, 'wsj_{}.pdtb'.format(wsj_id))
    return ptb_path, pdtb_path
In [5]:
# some helper functions from the educe tutorial
def show_type(rel):
    """Short string for a relation type.

    Returns the class name of `rel` without a trailing "Relation", e.g.
    "Explicit" for an instance of a class named ``ExplicitRelation``.
    A class name that does not end in "Relation" is returned unchanged
    instead of losing its last eight characters.
    """
    suffix = "Relation"
    type_name = type(rel).__name__
    if type_name.endswith(suffix):
        return type_name[:-len(suffix)]
    return type_name
def highlight(astring, color=1):
    """Wrap `astring` in ANSI escape sequences so it is shown as coloured text.

    `color` is inserted after the "3" of the opening escape sequence; the
    text is followed by a reset sequence.
    """
    ansi_template = "\x1b[3{color}m{str}\x1b[0m"
    coloured = ansi_template.format(str=astring, color=color)
    return coloured
def display_rel(r):
"pretty print a relation instance"
rtype = show_type(r)
if rtype == "Explicit":
conn = highlight(r.connhead)
elif rtype == "Implicit":
conn = "{rtype} {conn1}".format(rtype=rtype,
conn1=highlight(str(r.connective1)))
elif rtype == "AltLex":
conn = "{rtype} {sem1}".format(rtype=rtype,
sem1=highlight(r.semclass1))
else:
conn = rtype
fmt = "{src}\n \t ---[{label}]---->\n \t\t\t{tgt}"
print fmt.format(src=highlight(r.arg1.text, 2),
label=conn,
tgt=highlight(r.arg2.text, 2))
In [6]:
def read_pdtb(pdtb_filepath):
    """Parse a PDTB file with educe and return the parser's result."""
    parser_module = educe.pdtb.parse
    return parser_module.parse(pdtb_filepath)
def get_subtree(docgraph, node_id):
    """
    Given a document graph and a node ID, return the subgraph/subtree
    dominated by that node, i.e. the subgraph of `docgraph` induced by all
    nodes found in a breadth-first search starting at `node_id`.
    """
    reachable = nx.bfs_tree(docgraph, node_id)
    return docgraph.subgraph(reachable.nodes())
def gorn2node(docgraph, gorn_address):
    """
    Given a document graph and a Gorn address, return the (root) node ID
    of the subgraph/subtree the address points to.

    gorn_address -- either an educe ``GornAddress`` (its ``parts`` are used)
        or a string of dot-separated integers, e.g. '3.1.0'.

    The first number of the address is an index into ``docgraph.sentences``.
    Each following number selects one neighbour of the current node, by
    position in the naturally sorted list of its neighbours. An address
    that consists of the sentence index only yields the sentence node itself.

    Raises IndexError if a number in the address is out of range.
    """
    if isinstance(gorn_address, educe.pdtb.parse.GornAddress):
        gorn_ints = gorn_address.parts
    else:
        gorn_ints = [int(num) for num in gorn_address.split('.')]

    sentence_index, child_indices = gorn_ints[0], gorn_ints[1:]
    current_node_id = docgraph.sentences[sentence_index]
    # Walk down using only the numbers after the sentence index; the
    # sentence index itself must not be reused as a child index.
    for child_index in child_indices:
        children = sorted(docgraph.neighbors(current_node_id),
                          key=natural_sort_key)
        current_node_id = children[child_index]
    return current_node_id
def gorn2subtree(docgraph, gorn_address):
    """
    Given a document graph and a Gorn address, return the subgraph/subtree
    the address points to (the address is resolved to a node with
    gorn2node(), the subtree is extracted with get_subtree()).
    """
    return get_subtree(docgraph, gorn2node(docgraph, gorn_address))
In [7]:
# for i, sent_root_node in enumerate(ptb_0003.sentences):
# print i, sent_root_node
# %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, sent_root_node))
In [8]:
# %dotstr dg.print_dot(gorn2subtree(ptb_0003, '3'))
In [9]:
# Load the gvmagic IPython extension (presumably the provider of the
# %dotstr magic used in this notebook -- confirm).
%load_ext gvmagic
def filepath2wsjid(filepath):
    """Extract the WSJ document ID from a file path.

    The ID is the second '_'-separated field of the file name, taken from
    the part before the first '.', e.g. '/some/dir/wsj_0003.pdtb' -> '0003'.
    """
    filename = os.path.basename(filepath)
    stem = filename.partition('.')[0]
    return stem.split('_')[1]
def pdtb_info(pdtb_file):
    """
    Print an overview of a PDTB file and the PTB file belonging to it.

    The WSJ document ID is derived from `pdtb_file`, the matching PTB file
    is read into a graph and rendered, and then every relation of the PDTB
    file is printed together with the Gorn addresses of its two arguments.
    Nothing is returned.
    """
    wsj_id = filepath2wsjid(pdtb_file)
    # only the PTB path is needed here; the PDTB path is given by the caller
    ptb_file, _ = wsjid2filepaths(wsj_id)
    print ptb_file
    ptb_graph = dg.read_ptb(ptb_file)
    # NOTE(review): line magics may evaluate their argument in the interactive
    # namespace, in which case the local `ptb_graph` would not be visible
    # here -- confirm that this renders when called inside a function.
    %dotstr dg.print_dot(ptb_graph)
    educe_pdtb = read_pdtb(pdtb_file)
    for i, rel in enumerate(educe_pdtb):
        print i, type(rel)
        display_rel(rel)
        print 'arg1: ', rel.arg1.gorn
        for gorn_address in rel.arg1.gorn:
            # %dotstr dg.print_dot(gorn2subtree(pdtb_file, gorn_address))
            # gorn2subtree(pdtb_file, gorn_address)
            # the subtree is computed but its result is not used
            gorn2subtree(ptb_graph, gorn_address)
        # NOTE(review): the nesting of the two prints below was reconstructed
        # from a flattened source; they are assumed to run once per relation
        # -- confirm.
        print 'arg2: ', rel.arg2.gorn
        print
In [10]:
# find the shortest PDTB file that contains a gorn address w/ > 1 parts
def find_shortest_pdtb(sorted_pdtb_files):
for i, pdtb_file in enumerate(sorted_pdtb_files):
educe_pdtb = read_pdtb(pdtb_file)
for i, rel in enumerate(educe_pdtb):
for arg in (rel.arg1, rel.arg2):
for address in arg.gorn:
if len(address.parts) > 1:
print pdtb_file
print i, type(rel)
print rel.arg1.gorn, rel.arg2.gorn
return display_rel(rel)
In [11]:
# Run the search over the PDTB files, which were sorted by file size above.
find_shortest_pdtb(sorted_pdtb_files)
In [12]:
# PTB and PDTB file paths of WSJ document 2257.
ptb_2257, pdtb_2257 = wsjid2filepaths(2257)
In [15]:
# Read the PTB file of document 2257 and pass its dot output to %dotstr.
%dotstr dg.print_dot(dg.read_ptb(ptb_2257))
In [ ]:
# gorn2subtree(ptb_0003, '1.0')
# %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, 118))
# ptb_0003.neighbors(118)
In [ ]:
# Load WSJ document 0001 from both corpora: the PTB file as a document
# graph, the PDTB file via educe.
ptb_0001path, pdtb_0001path = wsjid2filepaths('0001')
ptb_0001 = dg.read_ptb(ptb_0001path)
pdtb_0001 = read_pdtb(pdtb_0001path)
In [ ]:
# pdtb_info(wsjid2filepaths('0004')[1])
In [ ]:
# %dotstr dg.print_dot(nx.bfs_tree(ptb_0003, 118))
In [ ]:
# %dotstr dg.print_dot(ptb_0003)
In [ ]:
from discoursegraphs.util import find_files
def parse_corpus(corpus_dir=PDTB_ROOT_DIR):
    """Parse every '*.pdtb' file found below `corpus_dir`, one after another.

    The parse results are discarded and nothing is returned.
    """
    for corpus_file in find_files(corpus_dir, '*.pdtb'):
        pdtb.parse.parse(corpus_file)
In [ ]:
# %time [pdtb.parse.parse(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')] # 1m45s
In [ ]:
# %timeit parse_corpus() #1 loops, best of 3: 1min 39s per loop
In [ ]:
# pdtb_reader = pdtb.Reader(PDTB_ROOT_DIR)
In [ ]:
# len(pdtb_reader.files()) # 2159
In [ ]:
# pdtb_corpys = pdtb_reader.slurp()
In [ ]:
from math import sqrt
from joblib import Parallel, delayed
In [ ]:
def parse_pdtb_file(pdtb_file):
    """Parse a single PDTB file; the parse result is discarded (returns None)."""
    parser_module = pdtb.parse
    parser_module.parse(pdtb_file)
In [ ]:
# Parallel(n_jobs=4)(delayed(parse_pdtb_file)(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')) # 49.6s