In [1]:
import os
import re
import discoursegraphs as dg

PTB_WSJ_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')
RST_ROOT_DIR = os.path.expanduser('~/repos/rst_discourse_treebank_rs3/tokenized/')

RST_TEST_FILE = os.path.join(RST_ROOT_DIR, 'TEST', 'wsj_1197.rs3')
PTB_WSJ_TEST_FILE = os.path.join(PTB_WSJ_ROOT_DIR, '11/wsj_1197.mrg')

WSJ_SUBDIR_REGEX = re.compile('wsj_(\d{2})')
WSJ_DOCID_REGEX = re.compile('wsj_(\d{4})')

In [2]:
rdg = dg.read_rs3(RST_TEST_FILE)
pdg = dg.read_ptb(PTB_WSJ_TEST_FILE)

In [3]:
dg.get_text(pdg)


Out[3]:
"Homestake Mining Co. , San Francisco , blamed the continued slump in gold prices for an 83 % plunge in third-quarter net income to $ 2 million , or two cents a share , from $ 11.2 million , or 12 cents a share , a year earlier . Revenue rose 5 % to $ 110.4 million from $ 105.4 million . In New York Stock Exchange composite trading , Homestake closed at $ 15.25 , down 25 cents . `` A significant increase in gold sales to 248,279 ounces for the quarter from 188,726 in the third quarter of 1988 was more than offset by the continued decline in average gold price realization to $ 367 from $ 429 per ounce , '' the company said . For the nine months , the mining company posted a 40 % drop in profit to $ 30.1 million , or 31 cents a share , from $ 50.6 million , or 52 cents a share , on a 6 % rise in revenue to $ 323.2 million from $ 305.7 million ."

In [4]:
pdg = dg.read_ptb(PTB_WSJ_TEST_FILE, ignore_traces=False)

In [5]:
dg.get_text(rdg)


Out[5]:
u'Homestake Mining Co. , San Francisco , blamed the continued slump in gold prices for an 83 % plunge in third-quarter net income to $ 2 million , or two cents a share , from $ 11.2 million , or 12 cents a share , a year earlier . Revenue rose 5 % to $ 110.4 million from $ 105.4 million . In New York Stock Exchange composite trading , Homestake closed at $ 15.25 , down 25 cents . A significant increase in gold sales to 248,279 ounces for the quarter from 188,726 in the third quarter of 1988 was more than offset by the continued decline in average gold price realization to $ 367 from $ 429 per ounce , the company said . For the nine months , the mining company posted a 40 % drop in profit to $ 30.1 million , or 31 cents a share , from $ 50.6 million , or 52 cents a share , on a 6 % rise in revenue to $ 323.2 million from $ 305.7 million .'

In [6]:
# rdg.merge_graphs(pdg, verbose=False)

In [11]:
# PTB_WSJ_ROOT_DIR = os.path.expanduser('~/corpora/pennTreebank/parsed/mrg/wsj')
RST_ORIG_ROOT_DIR = os.path.expanduser('~/repos/rst_discourse_treebank_rs3/untokenized/')

def fix_wsj_document(wsj_id):
    assert isinstance(wsj_id, (str, int))
    if isinstance(wsj_id, int):
        wsj_id = str(wsj_id)

    rst_subdirs = ('TEST', 'TRAINING')
    for rst_subdir in rst_subdirs:
        potential_filepath = os.path.join(RST_ORIG_ROOT_DIR, rst_subdir, 'wsj_{}.rs3').format(wsj_id)
        if os.path.exists(potential_filepath):
            rst_filepath = potential_filepath
            break
    if 'rst_filepath' not in locals():
        raise IOError("There's no rs3 file with ID '{}'".format(wsj_id))

    ptb_subdir = wsj_id[:2]
    ptb_filepath = os.path.join(PTB_WSJ_ROOT_DIR, ptb_subdir, 'wsj_{}.mrg').format(wsj_id)
    assert os.path.exists(ptb_filepath)
    return rst_filepath, ptb_filepath

Mismatches between .rs3 (RST-DT) and .mrg (PTB)

  • after tokenizing the untokenized RST-DT rs3 files,
    some minor tokenization issues (e.g. missing quotation marks, escaping characters, bracket names ...) remain
  • major issue: RST segment order often doesn't match the order in PTB sentences

In [12]:
import re
import glob
import sys

for folder in ('TEST', 'TRAINING'):
    for rst_file in glob.glob(os.path.join(RST_ROOT_DIR, folder, '*.rs3')):
        try:
            rdg = dg.read_rs3(rst_file)
            rst_fname = os.path.basename(rst_file).lower()
            try:
                doc_id = WSJ_DOCID_REGEX.match(rst_fname).groups()[0]
                wsj_subdir = WSJ_SUBDIR_REGEX.match(rst_fname).groups()[0]

                ptb_file = os.path.join(PTB_WSJ_ROOT_DIR, wsj_subdir, 'wsj_{}.mrg'.format(doc_id))
                pdg = dg.read_ptb(ptb_file)
                
                rdg.merge_graphs(pdg)
                print "merged: {}\n".format(rst_file)
            except AttributeError as e:
                sys.stderr.write("Error in {}: {}\n".format(os.path.basename(rst_file), e))
            except ValueError as e:
#                 sys.stderr.write("Error in {}: {}\n".format(os.path.basename(rst_file), e))
                print "geany {} {}\n".format(*fix_wsj_document(doc_id))
                try:
                    rdg.merge_graphs(pdg, verbose=True)
                except ValueError as e:
                    print e
                except StopIteration as e:
                    print e
                except IndexError as e:
                    print e
                    
            except StopIteration as e:
                sys.stderr.write("Error in {}: {}\n".format(os.path.basename(rst_file), e))

        except KeyError as e:
            sys.stderr.write("Error in {}: {}\n".format(os.path.basename(rst_file), e))


geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1148.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1148.mrg

Tokenization mismatch:
wsj_1148.mrg (Index: 101): in New Orleans and Denver . [[This]] would be a second round
wsj_1148.rs3 (Index: 101): in New Orleans and Denver . [[Mobil]] 's latest move could signal

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1387.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1387.mrg

Tokenization mismatch:
wsj_1387.mrg (Index: 0):  [[Yet]] another political scandal is racking
wsj_1387.rs3 (Index: 0):  [[But]] this time it 's hurting

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1169.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1169.mrg

Tokenization mismatch:
wsj_1169.mrg (Index: 20): parts , including ceramic condensers . [[The]] Tokyo maker of ceramic capacitors
wsj_1169.rs3 (Index: 20): parts , including ceramic condensers . [[it]] purchased a plant in Plymouth

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0644.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0644.mrg

Tokenization mismatch:
wsj_0644.mrg (Index: 40): Thomas A. Bullock , 66 , [[who]] is retiring as chairman but
wsj_0644.rs3 (Index: 40): Thomas A. Bullock , 66 , [[but]] will continue as a director

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1189.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1189.mrg

Tokenization mismatch:
wsj_1189.mrg (Index: 60): on the financial position of applicants [[and]] thus ca n't determine why
wsj_1189.rs3 (Index: 60): on the financial position of applicants [[why]] blacks are rejected more often

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1183.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1183.mrg

Tokenization mismatch:
wsj_1183.mrg (Index: 0):  [[ALBERTA]] ENERGY Co. , Calgary ,
wsj_1183.rs3 (Index: 0):  [[it]] filed a preliminary prospectus for

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1126.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1126.mrg

Tokenization mismatch:
wsj_1126.mrg (Index: 0):  [[Sheraton]] Corp. and Pan American World
wsj_1126.rs3 (Index: 0):  [[that]] they and two Soviet partners

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2385.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2385.mrg

Tokenization mismatch:
wsj_2385.mrg (Index: 84): common shares , via Salomon Brothers [[Inc]] . Baldwin Technology Co. --
wsj_2385.rs3 (Index: 84): common shares , via Salomon Brothers [[Inc.]] . Baldwin Technology Co. --

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1365.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1365.mrg

Tokenization mismatch:
wsj_1365.mrg (Index: 42): requirements that federal prosecutors avoid disrupting [[``]] the normal business functions ''
wsj_1365.rs3 (Index: 42): requirements that federal prosecutors avoid disrupting [[the]] normal business functions of companies

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0689.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0689.mrg

Tokenization mismatch:
wsj_0689.mrg (Index: 24): return worth getting excited about . [[With]] $ 150 billion of CDs
wsj_0689.rs3 (Index: 24): return worth getting excited about . [[a]] lot of people have been

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1307.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1307.mrg

Tokenization mismatch:
wsj_1307.mrg (Index: 77): assassination . With due respect to [[``]] highly classified correspondence '' and
wsj_1307.rs3 (Index: 77): assassination . With due respect to [[highly]] classified correspondence and other buzzwords

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0627.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0627.mrg

Tokenization mismatch:
wsj_0627.mrg (Index: 14): to be the most confusing . [[On]] the surface , the overall
wsj_0627.rs3 (Index: 14): to be the most confusing . [[But]] the actual head count of

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0654.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0654.mrg

Tokenization mismatch:
wsj_0654.mrg (Index: 189): since the leveraged buy-out , but [[``]] our performance since the -LCB-
wsj_0654.rs3 (Index: 189): since the leveraged buy-out , but [[our]] performance since the -LCB- buy-out

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1346.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1346.mrg

Tokenization mismatch:
wsj_1346.mrg (Index: 119): month after the company projected a [[``]] substantial '' third-period loss ,
wsj_1346.rs3 (Index: 119): month after the company projected a [[substantial]] third-period loss , the stock

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1306.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1306.mrg

Tokenization mismatch:
wsj_1306.mrg (Index: 37): second setback for U.S. Memories . [[Last]] month , Apple Computer Inc.
wsj_1306.rs3 (Index: 37): second setback for U.S. Memories . [[that]] it would n't invest in

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1325.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1325.mrg

Tokenization mismatch:
wsj_1325.mrg (Index: 0):  [[Dana]] Corp. said its third-quarter net
wsj_1325.rs3 (Index: 0):  [[its]] third-quarter net income fell 27

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0602.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0602.mrg

Tokenization mismatch:
wsj_0602.mrg (Index: 33): transactions . PRIME RATE : 10 [[1\/2]] % . The base rate
wsj_0602.rs3 (Index: 33): transactions . PRIME RATE : 10 [[1/2]] % . The base rate

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0607.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0607.mrg

Tokenization mismatch:
wsj_0607.mrg (Index: 19): and one began trading on the [[Nasdaq\/National]] Market System last week .
wsj_0607.rs3 (Index: 19): and one began trading on the [[Nasdaq/National]] Market System last week .

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1376.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1376.mrg

Tokenization mismatch:
wsj_1376.mrg (Index: 52): ring as big as the Ritz [[-LRB-]] `` my day diamond ,
wsj_1376.rs3 (Index: 52): ring as big as the Ritz [[-LCB-]] my day diamond , darling

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2373.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2373.mrg

Tokenization mismatch:
wsj_2373.mrg (Index: 14): piece on how long unemployment lasts [[-LRB-]] People Patterns , Sept. 20
wsj_2373.rs3 (Index: 14): piece on how long unemployment lasts [[-LCB-]] People Patterns , Sept. 20

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1142.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1142.mrg

Tokenization mismatch:
wsj_1142.mrg (Index: 65): behind in the subsequent rally . [[After]] plummeting 1.8 % at one
wsj_1142.rs3 (Index: 65): behind in the subsequent rally . [[the]] composite rebounded a little ,

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0655.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0655.mrg

Tokenization mismatch:
wsj_0655.mrg (Index: 58): here Saturday , saying only that [[``]] if there 's an all-out
wsj_0655.rs3 (Index: 58): here Saturday , saying only that [[that]] 's going to change the

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0616.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0616.mrg

Tokenization mismatch:
wsj_0616.mrg (Index: 0):  [[As]] competition heats up in Spain
wsj_0616.rs3 (Index: 0):  [[Banco]] Exterior de Espana is seeking

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1380.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1380.mrg

Tokenization mismatch:
wsj_1380.mrg (Index: 260): last month was Morgan Stanley & [[Co]] . Of Morgan Stanley 's
wsj_1380.rs3 (Index: 260): last month was Morgan Stanley & [[Co.]] . Of Morgan Stanley 's

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0632.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0632.mrg

Tokenization mismatch:
wsj_0632.mrg (Index: 77): similar go-ahead earlier in October , [[and]] on Friday , Jaguar announced
wsj_0632.rs3 (Index: 77): similar go-ahead earlier in October , [[that]] the No. 2 U.S. auto

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2375.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2375.mrg

Tokenization mismatch:
wsj_2375.mrg (Index: 76): holder of so-called junk bonds . [[New]] federal legislation requires that all
wsj_2375.rs3 (Index: 76): holder of so-called junk bonds . [[that]] all thrifts divest themselves of

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2336.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2336.mrg
Error in wsj_0684.rs3: 'Node 130 is not in the graph'
Error in wsj_0667.rs3: 'Node 409 is not in the graph'
Tokenization mismatch:
wsj_2336.mrg (Index: 27): U.S. patent held by Pharmacia Inc. [[was]] `` without merit . ''
wsj_2336.rs3 (Index: 27): U.S. patent held by Pharmacia Inc. [[.]] was without merit . that

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1146.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1146.mrg

Tokenization mismatch:
wsj_1146.mrg (Index: 0):  [[The]] Polish rat will eat well
wsj_1146.rs3 (Index: 0):  [[Once]] again , the indomitable peasant

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1197.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1197.mrg

Tokenization mismatch:
wsj_1197.mrg (Index: 80): 15.25 , down 25 cents . [[``]] A significant increase in gold
wsj_1197.rs3 (Index: 80): 15.25 , down 25 cents . [[A]] significant increase in gold sales

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2354.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2354.mrg

Tokenization mismatch:
wsj_2354.mrg (Index: 3): Call it the [[``]] we 're too broke to
wsj_2354.rs3 (Index: 3): Call it the [[we]] 're too broke to fight

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_2386.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2386.mrg

Tokenization mismatch:
wsj_2386.mrg (Index: 22): to be taking a philosophical approach [[and]] said they were resigned to
wsj_2386.rs3 (Index: 22): to be taking a philosophical approach [[they]] were resigned to riding out

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1331.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1331.mrg

Tokenization mismatch:
wsj_1331.mrg (Index: 232): in a mistrial after four weeks [[when]] U.S. District Judge Mary Johnson
wsj_1331.rs3 (Index: 232): in a mistrial after four weeks [[that]] a prosecutor improperly , but

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1354.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/13/wsj_1354.mrg

Tokenization mismatch:
wsj_1354.mrg (Index: 44): accessories supplier , said he was [[``]] comfortable '' with analysts '
wsj_1354.rs3 (Index: 44): accessories supplier , said he was [[comfortable]] with analysts ' expectations that

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_1113.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1113.mrg

Tokenization mismatch:
wsj_1113.mrg (Index: 0):  [[Falcon]] Holding Group Inc. said it
wsj_1113.rs3 (Index: 0):  [[it]] agreed to acquire about 54,000

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TEST/wsj_0623.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0623.mrg

Tokenization mismatch:
wsj_0623.mrg (Index: 16): It was down a little . [[The]] next time you hear a
wsj_0623.rs3 (Index: 16): It was down a little . [[consider]] what Congress did Friday .

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_0630.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0630.mrg

Tokenization mismatch:
wsj_0630.mrg (Index: 16): concern , said Friday that Bond [[Corp]] . Holdings Ltd. is ``
wsj_0630.rs3 (Index: 16): concern , said Friday that Bond [[Corp.]] . Holdings Ltd. is committed

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_2321.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2321.mrg

Tokenization mismatch:
wsj_2321.mrg (Index: 0):  [[With]] economic tension between the U.S.
wsj_2321.rs3 (Index: 0):  [[many]] Japanese had feared last week

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_2303.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2303.mrg

Tokenization mismatch:
wsj_2303.mrg (Index: 0):  [[A]] consortium of private investors operating
wsj_2303.rs3 (Index: 0):  [[it]] has made a $ 409

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_1159.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/11/wsj_1159.mrg

Tokenization mismatch:
wsj_1159.mrg (Index: 65): racketeering defendants prior to trial . [[But]] David Runkel , chief Justice
wsj_1159.rs3 (Index: 65): racketeering defendants prior to trial . [[the]] guidelines are a codification and

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_0622.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/06/wsj_0622.mrg

Tokenization mismatch:
wsj_0622.mrg (Index: 0):  [[The]] Treasury Department said the U.S.
wsj_0622.rs3 (Index: 0):  [[the]] U.S. trade deficit may worsen

geany /home/arne/repos/rst_discourse_treebank_rs3/untokenized/TRAINING/wsj_2309.rs3 /home/arne/corpora/pennTreebank/parsed/mrg/wsj/23/wsj_2309.mrg

Tokenization mismatch:
wsj_2309.mrg (Index: 23): more than one billion Canadian dollars [[-LRB-]] US$ 851 million -RRB- ,
wsj_2309.rs3 (Index: 23): more than one billion Canadian dollars [[-LCB-]] US$ 851 million -RCB- ,

Error in wsj_1129.rs3: 'Node 129 is not in the graph'
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-12-eafe25ee41a7> in <module>()
     21             except ValueError as e:
     22 #                 sys.stderr.write("Error in {}: {}\n".format(os.path.basename(rst_file), e))
---> 23                 print "geany {} {}\n".format(*fix_wsj_document(doc_id))
     24                 try:
     25                     rdg.merge_graphs(pdg, verbose=True)

<ipython-input-11-967d55283cf1> in fix_wsj_document(wsj_id)
     15             break
     16     if 'rst_filepath' not in locals():
---> 17         raise IOError("There's no rs3 file with ID '{}'".format(wsj_id))
     18 
     19     ptb_subdir = wsj_id[:2]

IOError: There's no rs3 file with ID '1379'

In [8]: