In [1]:
import os
from operator import itemgetter
import educe
from educe import pdtb
import networkx as nx

import discoursegraphs as dg
from discoursegraphs.util import natural_sort_key, find_files

In [2]:
# root directory of the PDTB v2 files (searched for '*.pdtb' files below)
PDTB_ROOT_DIR = os.path.expanduser('~/corpora/pdtb_v2/data')
# a single PDTB file (document wsj_0003 in subdirectory 00)
PDTB_TEST_FILE = os.path.join(PDTB_ROOT_DIR, '00/wsj_0003.pdtb')

# root directory of the Penn Treebank WSJ files (searched for '*.mrg' files below)
PTB_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')
# the PTB file with the same document ID as PDTB_TEST_FILE
PTB_TEST_FILE = os.path.join(PTB_ROOT_DIR, '00/wsj_0003.mrg')

In [3]:
# PTB and PDTB files, each list ordered by file size (smallest file first)
sorted_ptb_files = sorted(find_files(PTB_ROOT_DIR, '*.mrg'), key=os.path.getsize)
sorted_pdtb_files = sorted(find_files(PDTB_ROOT_DIR, '*.pdtb'), key=os.path.getsize)

In [4]:
def wsjid2filepaths(wsj_id, ptb_root=PTB_ROOT_DIR, pdtb_root=PDTB_ROOT_DIR):
    """converts a PTB-WSJ document ID into PTB and PDTB file paths.

    Parameters
    ----------
    wsj_id : str or int
        the document ID, e.g. ``'0003'`` or ``2257``. An integer is
        zero-padded to four digits, so ``3`` is treated like ``'0003'``.
    ptb_root : str
        root directory of the PTB files
    pdtb_root : str
        root directory of the PDTB files

    Returns
    -------
    (str, str)
        the path of the PTB file (``.mrg``) and the path of the PDTB file
        (``.pdtb``). The first two characters of the ID name the
        subdirectory. The paths are only constructed; the files are not
        checked for existence.
    """
    if isinstance(wsj_id, int):
        # file names use four-digit IDs (e.g. wsj_0003); a plain str() would
        # drop the leading zeros and yield a wrong subdirectory and file name
        wsj_id = '{:04d}'.format(wsj_id)
    subdir = wsj_id[:2]
    ptb_path = os.path.join(ptb_root, subdir, 'wsj_{}.mrg'.format(wsj_id))
    pdtb_path = os.path.join(pdtb_root, subdir, 'wsj_{}.pdtb'.format(wsj_id))
    return ptb_path, pdtb_path

In [5]:
# some helper functions from the educe tutorial

def show_type(rel):
    "short string for a relation type: the class name without its last 8 characters"
    class_name = type(rel).__name__
    suffix_length = len("Relation")
    return class_name[:-suffix_length]

def highlight(astring, color=1):
    "wraps the text in the escape sequences '\\x1b[3<color>m' and '\\x1b[0m'"
    template = "\x1b[3{color}m{str}\x1b[0m"
    return template.format(str=astring, color=color)

def display_rel(r):
    "prints the two arguments of a relation instance, joined by a labelled arrow"
    kind = show_type(r)

    # the arrow label depends on the relation type; every type that is not
    # handled explicitly is labelled with its type name only
    label = kind
    if kind == "Explicit":
        label = highlight(r.connhead)
    elif kind == "Implicit":
        connective = highlight(str(r.connective1))
        label = "{rtype} {conn1}".format(rtype=kind, conn1=connective)
    elif kind == "AltLex":
        semclass = highlight(r.semclass1)
        label = "{rtype} {sem1}".format(rtype=kind, sem1=semclass)

    source_text = highlight(r.arg1.text, 2)
    target_text = highlight(r.arg2.text, 2)
    template = "{src}\n \t ---[{label}]---->\n \t\t\t{tgt}"
    print(template.format(src=source_text, label=label, tgt=target_text))

In [6]:
def read_pdtb(pdtb_filepath):
    """parses a PDTB file with educe and returns the parser's result"""
    parse_file = educe.pdtb.parse.parse
    return parse_file(pdtb_filepath)

def get_subtree(docgraph, node_id):
    """
    returns the subgraph/subtree of the document graph that is dominated
    by the given node, i.e. the subgraph induced by all nodes that a
    breadth-first search starting at that node reaches.
    """
    reachable_nodes = nx.bfs_tree(docgraph, node_id).nodes()
    return docgraph.subgraph(reachable_nodes)

def gorn2node(docgraph, gorn_address):
    """
    given a document graph and a Gorn address, returns the (root) node ID
    of the subgraph/subtree the address points to.

    Parameters
    ----------
    docgraph : document graph
        must provide ``sentences`` (a sequence of sentence root node IDs)
        and ``neighbors(node_id)``
    gorn_address : educe.pdtb.parse.GornAddress or str
        either a GornAddress (its ``parts`` are used) or a dot-separated
        string of integers, e.g. ``'23.0'``. The first number is the index
        of the sentence, each following number is the position of a child
        node among the sorted neighbors of the current node.

    Returns
    -------
    the ID of the node the address points to. An address that consists of
    a sentence index only yields the root node of that sentence.

    Raises
    ------
    IndexError
        if the sentence index or one of the child positions is out of range
    """
    if isinstance(gorn_address, educe.pdtb.parse.GornAddress):
        gorn_ints = gorn_address.parts
    else:
        gorn_ints = [int(num) for num in gorn_address.split('.')]
    sentence_index, gorn_numbers = gorn_ints[0], gorn_ints[1:]

    current_node_id = docgraph.sentences[sentence_index]
    # descend using only the numbers *after* the sentence index; iterating
    # over all numbers would wrongly reuse the sentence index as a child
    # position
    for child_index in gorn_numbers:
        children = sorted(docgraph.neighbors(current_node_id),
                          key=natural_sort_key)
        current_node_id = children[child_index]
    return current_node_id

def gorn2subtree(docgraph, gorn_address):
    """
    returns the subgraph/subtree of the document graph that the given
    Gorn address points to: the address is resolved to a node ID first,
    then the subtree dominated by that node is extracted.
    """
    return get_subtree(docgraph, gorn2node(docgraph, gorn_address))

In [7]:
# for i, sent_root_node in enumerate(ptb_0003.sentences):
#     print i, sent_root_node
#     %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, sent_root_node))

In [8]:
# %dotstr dg.print_dot(gorn2subtree(ptb_0003, '3'))

In [9]:
%load_ext gvmagic

def filepath2wsjid(filepath):
    """
    extracts the document ID from a file path, e.g. '0003' from
    '/some/dir/wsj_0003.pdtb': the part of the file name between the first
    and the second underscore, ignoring everything after the first dot.
    """
    filename = os.path.basename(filepath)
    name_without_extension = filename.split('.')[0]
    return name_without_extension.split('_')[1]

def pdtb_info(pdtb_file):
    wsj_id = filepath2wsjid(pdtb_file)
    ptb_file, _ = wsjid2filepaths(wsj_id)
    print ptb_file
    ptb_graph = dg.read_ptb(ptb_file)
    %dotstr dg.print_dot(ptb_graph)
    
    educe_pdtb = read_pdtb(pdtb_file)
    for i, rel in enumerate(educe_pdtb):
        print i, type(rel)
        display_rel(rel)
        print 'arg1: ', rel.arg1.gorn
        for gorn_address in rel.arg1.gorn:
#             %dotstr dg.print_dot(gorn2subtree(pdtb_file, gorn_address))
#             gorn2subtree(pdtb_file, gorn_address)
            gorn2subtree(ptb_graph, gorn_address)
        print 'arg2: ', rel.arg2.gorn
        print

In [10]:
# find the first PDTB file (i.e. the smallest one, if the files are sorted by size) that contains a Gorn address with more than one part
def find_shortest_pdtb(sorted_pdtb_files):
    """
    walks through the given PDTB files in order and stops at the first
    relation that has an argument with a Gorn address of more than one
    part. Prints the file path, the index and type of that relation and
    the Gorn addresses of both arguments, then displays the relation.
    """
    for pdtb_file in sorted_pdtb_files:
        relations = read_pdtb(pdtb_file)
        for rel_index, rel in enumerate(relations):
            addresses = list(rel.arg1.gorn) + list(rel.arg2.gorn)
            if any(len(address.parts) > 1 for address in addresses):
                print(pdtb_file)
                print('{} {}'.format(rel_index, type(rel)))
                print('{} {}'.format(rel.arg1.gorn, rel.arg2.gorn))
                return display_rel(rel)

In [11]:
# search all PDTB files, starting with the smallest one
find_shortest_pdtb(sorted_pdtb_files)


/home/arne/corpora/pdtb_v2/data/22/wsj_2257.pdtb
0 <class 'educe.pdtb.parse.ImplicitRelation'>
[23.0] [23.2]
These rate indications aren't directly comparable
 	 ---[Implicit Connective(because | Contingency.Cause.Reason)]---->
 			lending practices vary widely by location

In [12]:
# PTB / PDTB file paths of wsj_2257, the document reported in the output above
ptb_2257, pdtb_2257 = wsjid2filepaths(2257)

In [15]:
# draw the complete PTB graph of wsj_2257 (presumably rendered by gvmagic's %dotstr -- confirm)
%dotstr dg.print_dot(dg.read_ptb(ptb_2257))


wsj_2257.mrg 0 ptb:root_node 1 NP-TMP 0->1 16 S 0->16 77 NP 0->77 95 NP 0->95 129 NP 0->129 201 S 0->201 250 NP 0->250 273 NP 0->273 288 NP 0->288 327 NP 0->327 352 NP 0->352 383 NP 0->383 539 NP 0->539 634 NP 0->634 713 NP 0->713 793 S 0->793 812 NP 0->812 871 NP 0->871 958 NP 0->958 988 NP 0->988 1149 NP 0->1149 1235 NP 0->1235 1295 NP 0->1295 1361 S 0->1361 1402 NP 0->1402 1534 NP 0->1534 1592 NP 0->1592 1641 NP 0->1641 1656 NP 0->1656 1775 NP 0->1775 1790 NP 0->1790 1811 NP 0->1811 2 NP 1->2 5 , 1->5 7 NP 1->7 17 NP-SBJ 16->17 39 VP 16->39 75 . 16->75 78 NP 77->78 83 : 77->83 85 NP 77->85 93 . 77->93 96 NP 95->96 103 PP 95->103 111 PP-LOC 95->111 127 . 95->127 130 NP 129->130 135 : 129->135 137 NP 129->137 199 . 129->199 202 NP-SBJ 201->202 205 VP 201->205 248 . 201->248 251 NP 250->251 254 : 250->254 256 NP 250->256 271 . 250->271 274 NP 273->274 279 : 273->279 281 NP 273->281 286 . 273->286 289 NP 288->289 294 PP 288->294 325 . 288->325 328 NP 327->328 333 : 327->333 335 NP 327->335 350 . 327->350 353 NP 352->353 358 PP 352->358 381 . 352->381 384 NP 383->384 410 : 383->410 412 NP 383->412 537 . 383->537 540 NP 539->540 545 : 539->545 547 NP 539->547 632 . 539->632 635 NP 634->635 645 : 634->645 647 NP 634->647 711 . 634->711 714 NP 713->714 717 PP 713->717 791 . 713->791 794 NP-SBJ 793->794 801 VP 793->801 810 . 793->810 813 NP 812->813 829 : 812->829 831 NP 812->831 869 . 812->869 872 NP 871->872 877 : 871->877 879 NP 871->879 956 . 871->956 959 NP 958->959 973 VP 958->973 986 . 958->986 989 NP 988->989 996 : 988->996 998 NP 988->998 1147 . 988->1147 1150 NP 1149->1150 1168 : 1149->1168 1170 NP 1149->1170 1233 . 1149->1233 1236 NP 1235->1236 1241 PP 1235->1241 1271 VP 1235->1271 1293 . 1235->1293 1296 NP 1295->1296 1303 : 1295->1303 1305 NP 1295->1305 1359 . 1295->1359 1362 S 1361->1362 1380 ; 1361->1380 1382 S 1361->1382 1400 . 1361->1400 1403 NP 1402->1403 1408 : 1402->1408 1410 NP 1402->1410 1532 . 
1402->1532 1535 NP 1534->1535 1559 : 1534->1559 1561 NP 1534->1561 1593 NP 1592->1593 1610 ; 1592->1610 1612 NP 1592->1612 1639 . 1592->1639 1642 NP 1641->1642 1645 : 1641->1645 1647 NP 1641->1647 1654 . 1641->1654 1657 NP 1656->1657 1677 : 1656->1677 1679 NP 1656->1679 1773 . 1656->1773 1776 NP 1775->1776 1779 : 1775->1779 1781 NP 1775->1781 1788 . 1775->1788 1791 NP 1790->1791 1802 : 1790->1802 1804 NP 1790->1804 1809 . 1790->1809 1812 NP 1811->1812 1844 ; 1811->1844 1846 NP 1811->1846 1862 . 1811->1862 3 Monday 2->3 8 October 7->8 10 16 7->10 12 , 7->12 14 1989 7->14 18 NP 17->18 36 ADVP-LOC 17->36 40 VP 39->40 57 but 39->57 59 VP 39->59 19 The 18->19 21 key 18->21 23 UCP 18->23 30 annual 18->30 32 interest 18->32 34 rates 18->34 37 below 36->37 24 U.S. 23->24 26 and 23->26 28 foreign 23->28 41 are 40->41 43 NP-PRD 40->43 60 do 59->60 62 n't 59->62 64 ADVP-TMP 59->64 67 VP 59->67 44 NP 43->44 49 PP 43->49 45 a 44->45 47 guide 44->47 50 to 49->50 52 NP 49->52 53 general 52->53 55 levels 52->55 65 always 64->65 68 represent 67->68 70 NP 67->70 71 actual 70->71 73 transactions 70->73 79 PRIME 78->79 81 RATE 78->81 86 QP 85->86 91 % 85->91 87 10 86->87 89 1/2 86->89 97 The 96->97 99 base 96->99 101 rate 96->101 104 on 103->104 106 NP 103->106 112 at 111->112 114 NP 111->114 107 corporate 106->107 109 loans 106->109 115 large 114->115 117 U.S. 
114->117 119 money 114->119 121 center 114->121 123 commercial 114->123 125 banks 114->125 131 FEDERAL 130->131 133 FUNDS 130->133 138 NP 137->138 150 , 137->150 152 NP 137->152 164 , 137->164 166 NP 137->166 183 , 137->183 185 NP 137->185 139 NP 138->139 147 ADJP 138->147 153 NP 152->153 161 ADJP 152->161 167 NP 166->167 175 PP 166->175 186 NP 185->186 194 VP 185->194 140 QP 139->140 145 % 139->145 148 high 147->148 141 8 140->141 143 3/4 140->143 154 QP 153->154 159 % 153->159 162 low 161->162 155 8 154->155 157 1/2 154->157 168 QP 167->168 173 % 167->173 176 near 175->176 178 NP 175->178 169 8 168->169 171 5/8 168->171 179 closing 178->179 181 bid 178->181 187 QP 186->187 192 % 186->192 195 offered 194->195 197 NP 194->197 188 8 187->188 190 3/4 187->190 203 Reserves 202->203 206 traded 205->206 208 PP-LOC 205->208 216 PP 205->216 224 PP-MNR 205->224 209 among 208->209 211 NP 208->211 217 for 216->217 219 NP 216->219 225 in 224->225 227 NP 224->227 212 commercial 211->212 214 banks 211->214 220 overnight 219->220 222 use 219->222 228 NP 227->228 231 PP 227->231 229 amounts 228->229 232 of 231->232 234 NP 231->234 235 QP 234->235 243 QP 234->243 236 $ 235->236 238 1 235->238 240 million 235->240 244 or 243->244 246 more 243->246 252 Source 251->252 257 Fulton 256->257 259 Prebon 256->259 261 -LRB- 256->261 263 U.S.A 256->263 265 . 
256->265 267 -RRB- 256->267 269 Inc 256->269 275 DISCOUNT 274->275 277 RATE 274->277 282 7 281->282 284 % 281->284 290 The 289->290 292 charge 289->292 295 on 294->295 297 NP 294->297 298 NP 297->298 301 PP 297->301 309 PP 297->309 299 loans 298->299 302 to 301->302 304 NP 301->304 310 by 309->310 312 NP 309->312 305 depository 304->305 307 institutions 304->307 313 the 312->313 315 New 312->315 317 York 312->317 319 Federal 312->319 321 Reserve 312->321 323 Bank 312->323 329 CALL 328->329 331 MONEY 328->331 336 QP 335->336 337 9 336->337 339 3/4 336->339 341 % 336->341 343 to 336->343 345 10 336->345 347 % 336->347 354 The 353->354 356 charge 353->356 359 on 358->359 361 NP 358->361 362 NP 361->362 365 PP 361->365 371 PP 361->371 363 loans 362->363 366 to 365->366 368 NP 365->368 372 on 371->372 374 NP 371->374 369 brokers 368->369 375 stock 374->375 377 exchange 374->377 379 collateral 374->379 385 NP 384->385 390 VP 384->390 413 NP 412->413 429 ; 412->429 431 NP 412->431 447 ; 412->447 449 NP 412->449 465 ; 412->465 467 NP 412->467 483 ; 412->483 485 NP 412->485 501 ; 412->501 503 NP 412->503 519 ; 412->519 521 NP 412->521 386 COMMERCIAL 385->386 388 PAPER 385->388 391 placed 390->391 393 NP 390->393 395 ADVP-MNR 390->395 398 PP 390->398 396 directly 395->396 399 by 398->399 401 NP-LGS 398->401 402 General 401->402 404 Motors 401->404 406 Acceptance 401->406 408 Corp. 
401->408 414 NP 413->414 419 NP 413->419 432 NP 431->432 437 NP 431->437 450 NP 449->450 455 NP 449->455 468 NP 467->468 473 NP 467->473 486 NP 485->486 491 NP 485->491 504 NP 503->504 509 NP 503->509 522 NP 521->522 527 NP 521->527 415 8.30 414->415 417 % 414->417 420 QP 419->420 427 days 419->427 421 5 420->421 423 to 420->423 425 44 420->425 433 8.20 432->433 435 % 432->435 438 QP 437->438 445 days 437->445 439 45 438->439 441 to 438->441 443 59 438->443 451 8 450->451 453 % 450->453 456 QP 455->456 463 days 455->463 457 60 456->457 459 to 456->459 461 89 456->461 469 7.875 468->469 471 % 468->471 474 QP 473->474 481 days 473->481 475 90 474->475 477 to 474->477 479 119 474->479 487 7.75 486->487 489 % 486->489 492 QP 491->492 499 days 491->499 493 120 492->493 495 to 492->495 497 149 492->497 505 7.625 504->505 507 % 504->507 510 QP 509->510 517 days 509->517 511 150 510->511 513 to 510->513 515 179 510->515 523 7.375 522->523 525 % 522->525 528 QP 527->528 535 days 527->535 529 180 528->529 531 to 528->531 533 270 528->533 541 COMMERCIAL 540->541 543 PAPER 540->543 548 NP 547->548 592 : 547->592 594 NP 547->594 549 NP 548->549 557 VP 548->557 595 NP 594->595 606 ; 594->606 608 NP 594->608 619 ; 594->619 621 NP 594->621 550 ADJP 549->550 555 notes 549->555 558 sold 557->558 560 NP 557->560 562 PP-MNR 557->562 568 PP 557->568 576 PP 557->576 551 High-grade 550->551 553 unsecured 550->553 563 through 562->563 565 NP 562->565 569 by 568->569 571 NP-LGS 568->571 577 in 576->577 579 NP 576->579 566 dealers 565->566 572 major 571->572 574 corporations 571->574 580 NP 579->580 583 PP 579->583 581 multiples 580->581 584 of 583->584 586 NP 583->586 587 $ 586->587 589 1,000 586->589 596 NP 595->596 601 NP 595->601 609 NP 608->609 614 NP 608->614 622 NP 621->622 627 NP 621->627 597 8.40 596->597 599 % 596->599 602 30 601->602 604 days 601->604 610 8.33 609->610 612 % 609->612 615 60 614->615 617 days 614->617 623 8.26 622->623 625 % 622->625 628 90 627->628 630 days 
627->630 636 NP 635->636 639 PP 635->639 648 NP 647->648 659 ; 647->659 661 NP 647->661 672 ; 647->672 674 NP 647->674 685 ; 647->685 687 NP 647->687 698 ; 647->698 700 NP 647->700 637 CERTIFICATES 636->637 640 OF 639->640 642 NP 639->642 643 DEPOSIT 642->643 649 NP 648->649 654 NP 648->654 662 NP 661->662 667 NP 661->667 675 NP 674->675 680 NP 674->680 688 NP 687->688 693 NP 687->693 701 NP 700->701 706 NP 700->706 650 8.05 649->650 652 % 649->652 655 one 654->655 657 month 654->657 663 8.02 662->663 665 % 662->665 668 two 667->668 670 months 667->670 676 8 675->676 678 % 675->678 681 three 680->681 683 months 680->683 689 7.98 688->689 691 % 688->691 694 six 693->694 696 months 693->696 702 7.95 701->702 704 % 701->704 707 one 706->707 709 year 706->709 715 Average 714->715 718 of 717->718 720 NP 717->720 721 NP 720->721 726 VP 720->726 722 top 721->722 724 rates 721->724 727 paid 726->727 729 NP 726->729 731 PP 726->731 743 PP 726->743 763 , 726->763 765 PP 726->765 732 by 731->732 734 NP-LGS 731->734 744 on 743->744 746 NP 743->746 766 usually 765->766 768 on 765->768 770 NP 765->770 735 major 734->735 737 New 734->737 739 York 734->739 741 banks 734->741 747 NP 746->747 755 PP 746->755 748 ADJP 747->748 753 issues 747->753 756 of 755->756 758 NP 755->758 749 primary 748->749 751 new 748->751 759 negotiable 758->759 761 C.D.s 758->761 771 NP 770->771 774 PP 770->774 772 amounts 771->772 775 of 774->775 777 NP 774->777 778 QP 777->778 786 QP 777->786 779 $ 778->779 781 1 778->781 783 million 778->783 787 and 786->787 789 more 786->789 795 The 794->795 797 minimum 794->797 799 unit 794->799 802 is 801->802 804 NP-PRD 801->804 805 $ 804->805 807 100,000 804->807 814 NP 813->814 819 PP-LOC 813->819 832 NP 831->832 843 ; 831->843 845 NP 831->845 856 ; 831->856 858 NP 831->858 815 Typical 814->815 817 rates 814->817 820 in 819->820 822 NP 819->822 823 the 822->823 825 secondary 822->825 827 market 822->827 833 NP 832->833 838 NP 832->838 846 NP 845->846 851 NP 
845->851 859 NP 858->859 864 NP 858->864 834 8.40 833->834 836 % 833->836 839 one 838->839 841 month 838->841 847 8.40 846->847 849 % 846->849 852 three 851->852 854 months 851->854 860 8.40 859->860 862 % 859->862 865 six 864->865 867 months 864->867 873 BANKERS 872->873 875 ACCEPTANCES 872->875 880 NP 879->880 891 ; 879->891 893 NP 879->893 904 ; 879->904 906 NP 879->906 917 ; 879->917 919 NP 879->919 930 ; 879->930 932 NP 879->932 943 ; 879->943 945 NP 879->945 881 NP 880->881 886 NP 880->886 894 NP 893->894 899 NP 893->899 907 NP 906->907 912 NP 906->912 920 NP 919->920 925 NP 919->925 933 NP 932->933 938 NP 932->938 946 NP 945->946 951 NP 945->951 882 8.40 881->882 884 % 881->884 887 30 886->887 889 days 886->889 895 8.35 894->895 897 % 894->897 900 60 899->900 902 days 899->902 908 8.27 907->908 910 % 907->910 913 90 912->913 915 days 912->915 921 8.20 920->921 923 % 920->923 926 120 925->926 928 days 925->928 934 8.15 933->934 936 % 933->936 939 150 938->939 941 days 938->941 947 8.02 946->947 949 % 946->949 952 180 951->952 954 days 951->954 960 ADJP 959->960 967 business 959->967 969 credit 959->969 971 instruments 959->971 974 ADVP 973->974 977 financing 973->977 979 NP 973->979 961 Negotiable 960->961 963 , 960->963 965 bank-backed 960->965 975 typically 974->975 980 an 979->980 982 import 979->982 984 order 979->984 990 LONDON 989->990 992 LATE 989->992 994 EURODOLLARS 989->994 999 NP 998->999 1022 ; 998->1022 1024 NP 998->1024 1047 ; 998->1047 1049 NP 998->1049 1072 ; 998->1072 1074 NP 998->1074 1097 ; 998->1097 1099 NP 998->1099 1122 ; 998->1122 1124 NP 998->1124 1000 NP 999->1000 1017 NP 999->1017 1025 NP 1024->1025 1042 NP 1024->1042 1050 NP 1049->1050 1067 NP 1049->1067 1075 NP 1074->1075 1092 NP 1074->1092 1100 NP 1099->1100 1117 NP 1099->1117 1125 NP 1124->1125 1142 NP 1124->1142 1001 QP 1000->1001 1018 one 1017->1018 1020 month 1017->1020 1002 8 1001->1002 1004 5/8 1001->1004 1006 % 1001->1006 1008 to 1001->1008 1010 8 1001->1010 1012 1/2 
1001->1012 1014 % 1001->1014 1026 QP 1025->1026 1043 two 1042->1043 1045 months 1042->1045 1027 8 1026->1027 1029 5/8 1026->1029 1031 % 1026->1031 1033 to 1026->1033 1035 8 1026->1035 1037 1/2 1026->1037 1039 % 1026->1039 1051 QP 1050->1051 1068 three 1067->1068 1070 months 1067->1070 1052 8 1051->1052 1054 9/16 1051->1054 1056 % 1051->1056 1058 to 1051->1058 1060 8 1051->1060 1062 7/16 1051->1062 1064 % 1051->1064 1076 QP 1075->1076 1093 four 1092->1093 1095 months 1092->1095 1077 8 1076->1077 1079 1/2 1076->1079 1081 % 1076->1081 1083 to 1076->1083 1085 8 1076->1085 1087 3/8 1076->1087 1089 % 1076->1089 1101 QP 1100->1101 1118 five 1117->1118 1120 months 1117->1120 1102 8 1101->1102 1104 1/2 1101->1104 1106 % 1101->1106 1108 to 1101->1108 1110 8 1101->1110 1112 3/8 1101->1112 1114 % 1101->1114 1126 QP 1125->1126 1143 six 1142->1143 1145 months 1142->1145 1127 8 1126->1127 1129 1/2 1126->1129 1131 % 1126->1131 1133 to 1126->1133 1135 8 1126->1135 1137 3/8 1126->1137 1139 % 1126->1139 1151 NP 1150->1151 1160 PRN 1150->1160 1171 NP 1170->1171 1185 ; 1170->1185 1187 NP 1170->1187 1201 ; 1170->1201 1203 NP 1170->1203 1217 ; 1170->1217 1219 NP 1170->1219 1152 LONDON 1151->1152 1154 INTERBANK 1151->1154 1156 OFFERED 1151->1156 1158 RATES 1151->1158 1161 -LRB- 1160->1161 1163 NP 1160->1163 1166 -RRB- 1160->1166 1164 LIBOR 1163->1164 1172 NP 1171->1172 1180 NP 1171->1180 1188 NP 1187->1188 1196 NP 1187->1196 1204 NP 1203->1204 1212 NP 1203->1212 1220 NP 1219->1220 1228 NP 1219->1228 1173 QP 1172->1173 1178 % 1172->1178 1181 one 1180->1181 1183 month 1180->1183 1174 8 1173->1174 1176 1/2 1173->1176 1189 QP 1188->1189 1194 % 1188->1194 1197 three 1196->1197 1199 months 1196->1199 1190 8 1189->1190 1192 1/2 1189->1192 1205 QP 1204->1205 1210 % 1204->1210 1213 six 1212->1213 1215 months 1212->1215 1206 8 1205->1206 1208 7/16 1205->1208 1221 QP 1220->1221 1226 % 1220->1226 1229 one 1228->1229 1231 year 1228->1231 1222 8 1221->1222 1224 3/8 1221->1224 1237 The 1236->1237 1239 
average 1236->1239 1242 of 1241->1242 1244 NP 1241->1244 1272 based 1271->1272 1274 NP 1271->1274 1276 PP-CLR 1271->1276 1245 NP 1244->1245 1252 PP 1244->1252 1246 interbank 1245->1246 1248 offered 1245->1248 1250 rates 1245->1250 1253 for 1252->1253 1255 NP 1252->1255 1256 NP 1255->1256 1261 PP-LOC 1255->1261 1257 dollar 1256->1257 1259 deposits 1256->1259 1262 in 1261->1262 1264 NP 1261->1264 1265 the 1264->1265 1267 London 1264->1267 1269 market 1264->1269 1277 on 1276->1277 1279 NP 1276->1279 1280 NP 1279->1280 1283 PP-LOC 1279->1283 1281 quotations 1280->1281 1284 at 1283->1284 1286 NP 1283->1286 1287 five 1286->1287 1289 major 1286->1289 1291 banks 1286->1291 1297 FOREIGN 1296->1297 1299 PRIME 1296->1299 1301 RATES 1296->1301 1306 NP 1305->1306 1315 ; 1305->1315 1317 NP 1305->1317 1326 ; 1305->1326 1328 NP 1305->1328 1337 ; 1305->1337 1339 NP 1305->1339 1348 ; 1305->1348 1350 NP 1305->1350 1307 NP 1306->1307 1310 NP 1306->1310 1318 NP 1317->1318 1321 NP 1317->1321 1329 NP 1328->1329 1332 NP 1328->1332 1340 NP 1339->1340 1343 NP 1339->1343 1351 NP 1350->1351 1354 NP 1350->1354 1308 Canada 1307->1308 1311 13.50 1310->1311 1313 % 1310->1313 1319 Germany 1318->1319 1322 8.50 1321->1322 1324 % 1321->1324 1330 Japan 1329->1330 1333 4.875 1332->1333 1335 % 1332->1335 1341 Switzerland 1340->1341 1344 8.50 1343->1344 1346 % 1343->1346 1352 Britain 1351->1352 1355 15 1354->1355 1357 % 1354->1357 1363 NP-SBJ 1362->1363 1370 VP 1362->1370 1383 NP-SBJ 1382->1383 1388 VP 1382->1388 1364 These 1363->1364 1366 rate 1363->1366 1368 indications 1363->1368 1371 are 1370->1371 1373 n't 1370->1373 1375 ADJP-PRD 1370->1375 1376 directly 1375->1376 1378 comparable 1375->1378 1384 lending 1383->1384 1386 practices 1383->1386 1389 vary 1388->1389 1391 ADVP-MNR 1388->1391 1394 PP 1388->1394 1392 widely 1391->1392 1395 by 1394->1395 1397 NP 1394->1397 1398 location 1397->1398 1404 TREASURY 1403->1404 1406 BILLS 1403->1406 1411 NP 1410->1411 1501 : 1410->1501 1503 NP 1410->1503 1412 NP 
1411->1412 1415 PP 1411->1415 1504 NP 1503->1504 1517 ; 1503->1517 1519 NP 1503->1519 1413 Results 1412->1413 1416 of 1415->1416 1418 NP 1415->1418 1419 NP 1418->1419 1439 PP 1418->1439 1420 the 1419->1420 1422 NAC-TMP 1419->1422 1437 auction 1419->1437 1440 of 1439->1440 1442 NP 1439->1442 1423 Monday 1422->1423 1425 , 1422->1425 1427 October 1422->1427 1429 16 1422->1429 1431 , 1422->1431 1433 1989 1422->1433 1435 , 1422->1435 1443 NP 1442->1443 1452 , 1442->1452 1454 VP 1442->1454 1444 short-term 1443->1444 1446 U.S. 1443->1446 1448 government 1443->1448 1450 bills 1443->1450 1455 sold 1454->1455 1457 NP 1454->1457 1459 PP-CLR 1454->1459 1476 PP 1454->1476 1460 at 1459->1460 1462 NP 1459->1462 1477 in 1476->1477 1479 NP 1476->1479 1463 NP 1462->1463 1468 PP 1462->1468 1464 a 1463->1464 1466 discount 1463->1466 1469 from 1468->1469 1471 NP 1468->1471 1472 face 1471->1472 1474 value 1471->1474 1480 NP 1479->1480 1483 PP 1479->1483 1481 units 1480->1481 1484 of 1483->1484 1486 NP 1483->1486 1487 QP 1486->1487 1488 $ 1487->1488 1490 10,000 1487->1490 1492 to 1487->1492 1494 $ 1487->1494 1496 1 1487->1496 1498 million 1487->1498 1505 NP 1504->1505 1510 , 1504->1510 1512 NP 1504->1512 1520 NP 1519->1520 1525 , 1519->1525 1527 NP 1519->1527 1506 7.37 1505->1506 1508 % 1505->1508 1513 13 1512->1513 1515 weeks 1512->1515 1521 7.42 1520->1521 1523 % 1520->1523 1528 26 1527->1528 1530 weeks 1527->1530 1536 NP 1535->1536 1549 PRN 1535->1549 1562 NP 1561->1562 1567 PP 1561->1567 1577 PP 1561->1577 1537 FEDERAL 1536->1537 1539 HOME 1536->1539 1541 LOAN 1536->1541 1543 MORTGAGE 1536->1543 1545 CORP 1536->1545 1547 . 
1536->1547 1550 -LRB- 1549->1550 1552 NP 1549->1552 1557 -RRB- 1549->1557 1553 Freddie 1552->1553 1555 Mac 1552->1555 1563 Posted 1562->1563 1565 yields 1562->1565 1568 on 1567->1568 1570 NP 1567->1570 1578 for 1577->1578 1580 NP 1577->1580 1571 30-year 1570->1571 1573 mortgage 1570->1573 1575 commitments 1570->1575 1581 NP 1580->1581 1584 PP 1580->1584 1582 delivery 1581->1582 1585 within 1584->1585 1587 NP 1584->1587 1588 30 1587->1588 1590 days. 1587->1590 1594 NP 1593->1594 1599 , 1593->1599 1601 NP 1593->1601 1613 NP 1612->1613 1618 , 1612->1618 1620 NP 1612->1620 1595 9.83 1594->1595 1597 % 1594->1597 1602 standard 1601->1602 1604 conventional 1601->1604 1606 fixedrate 1601->1606 1608 mortgages 1601->1608 1614 7.875 1613->1614 1616 % 1613->1616 1621 ADJP 1620->1621 1631 one-year 1620->1631 1633 adjustable 1620->1633 1635 rate 1620->1635 1637 mortgages 1620->1637 1622 ADJP 1621->1622 1627 rate 1621->1627 1629 capped 1621->1629 1623 2 1622->1623 1625 % 1622->1625 1643 Source 1642->1643 1648 Telerate 1647->1648 1650 Systems 1647->1650 1652 Inc 1647->1652 1658 NP 1657->1658 1667 PRN 1657->1667 1680 NP 1679->1680 1729 NP 1679->1729 1659 FEDERAL 1658->1659 1661 NATIONAL 1658->1661 1663 MORTGAGE 1658->1663 1665 ASSOCIATION 1658->1665 1668 -LRB- 1667->1668 1670 NP 1667->1670 1675 -RRB- 1667->1675 1671 Fannie 1670->1671 1673 Mae 1670->1673 1681 NP 1680->1681 1686 PP 1680->1686 1698 PP 1680->1698 1713 PRN 1680->1713 1730 NP 1729->1730 1747 ; 1729->1747 1749 NP 1729->1749 1682 Posted 1681->1682 1684 yields 1681->1684 1687 on 1686->1687 1689 NP 1686->1689 1699 for 1698->1699 1701 NP 1698->1701 1714 -LRB- 1713->1714 1716 VP 1713->1716 1727 -RRB- 1713->1727 1690 30 1689->1690 1692 year 1689->1692 1694 mortgage 1689->1694 1696 commitments 1689->1696 1702 NP 1701->1702 1705 PP-TMP 1701->1705 1703 delivery 1702->1703 1706 within 1705->1706 1708 NP 1705->1708 1709 30 1708->1709 1711 days 1708->1711 1717 priced 1716->1717 1719 NP 1716->1719 1721 PP-CLR 1716->1721 1722 at 
1721->1722 1724 NP 1721->1724 1725 par 1724->1725 1731 NP 1730->1731 1736 , 1730->1736 1738 NP 1730->1738 1750 NP 1749->1750 1755 , 1749->1755 1757 NP 1749->1757 1732 .9.82 1731->1732 1734 % 1731->1734 1739 standard 1738->1739 1741 conventional 1738->1741 1743 fixed 1738->1743 1745 rate-mortgages 1738->1745 1751 8.70 1750->1751 1753 % 1750->1753 1758 6/2 1757->1758 1760 ADJP 1757->1760 1765 one-year 1757->1765 1767 adjustable 1757->1767 1769 rate 1757->1769 1771 mortgages 1757->1771 1761 rate 1760->1761 1763 capped 1760->1763 1777 Source 1776->1777 1782 Telerate 1781->1782 1784 Systems 1781->1784 1786 Inc 1781->1786 1792 MERRILL 1791->1792 1794 LYNCH 1791->1794 1796 READY 1791->1796 1798 ASSETS 1791->1798 1800 TRUST 1791->1800 1805 8.49 1804->1805 1807 % 1804->1807 1813 NP 1812->1813 1820 PP 1812->1820 1826 PP-TMP 1812->1826 1832 PP-TMP 1812->1832 1847 NP 1846->1847 1854 PP 1846->1854 1814 Annualized 1813->1814 1816 average 1813->1816 1818 rate 1813->1818 1821 of 1820->1821 1823 NP 1820->1823 1827 after 1826->1827 1829 NP 1826->1829 1833 for 1832->1833 1835 NP 1832->1835 1824 return 1823->1824 1830 expenses 1829->1830 1836 the 1835->1836 1838 past 1835->1838 1840 30 1835->1840 1842 days 1835->1842 1848 not 1847->1848 1850 a 1847->1850 1852 forecast 1847->1852 1855 of 1854->1855 1857 NP 1854->1857 1858 future 1857->1858 1860 returns 1857->1860

In [ ]:
# gorn2subtree(ptb_0003, '1.0')

# %dotstr dg.print_dot(get_sentence_subgraph(ptb_0003, 118))

# ptb_0003.neighbors(118)

In [ ]:
# file paths of document wsj_0001 in both corpora
ptb_0001path, pdtb_0001path = wsjid2filepaths('0001')

# the PTB file as read by discoursegraphs, the PDTB file as parsed by educe
ptb_0001 = dg.read_ptb(ptb_0001path)
pdtb_0001 = read_pdtb(pdtb_0001path)

In [ ]:
# pdtb_info(wsjid2filepaths('0004')[1])

TODO: repair node order in write_dot / print_dot for good


In [ ]:
# %dotstr dg.print_dot(nx.bfs_tree(ptb_0003, 118))

In [ ]:
# %dotstr dg.print_dot(ptb_0003)

In [ ]:
from discoursegraphs.util import find_files

def parse_corpus(corpus_dir=PDTB_ROOT_DIR):
    """
    parses every '*.pdtb' file found below the given directory with educe.
    The parse results are discarded.
    """
    parse_file = pdtb.parse.parse
    for pdtb_path in find_files(corpus_dir, '*.pdtb'):
        parse_file(pdtb_path)

In [ ]:
# %time [pdtb.parse.parse(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')] # 1m45s

In [ ]:
# %timeit parse_corpus() #1 loops, best of 3: 1min 39s per loop

In [ ]:
# pdtb_reader = pdtb.Reader(PDTB_ROOT_DIR)

In [ ]:
# len(pdtb_reader.files()) # 2159

In [ ]:
# pdtb_corpys = pdtb_reader.slurp()

In [ ]:
from math import sqrt
from joblib import Parallel, delayed

In [ ]:
def parse_pdtb_file(pdtb_file):
    """parses a single PDTB file with educe; the parse result is discarded."""
    parse_file = pdtb.parse.parse
    parse_file(pdtb_file)

In [ ]:
# Parallel(n_jobs=4)(delayed(parse_pdtb_file)(pdtb_file) for pdtb_file in find_files(PDTB_ROOT_DIR, '*.pdtb')) # 49.6s