Tokenization of RST-DT files using off-the-shelf tokenizers


In [ ]:
import os

In [4]:
from stanford_corenlp_pywrapper import sockwrap

CORENLP_PYWRAPPER_DIR = os.path.expanduser('~/repos/stanford_corenlp_pywrapper')
jars = ("stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar",
        "stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar")

p=sockwrap.SockWrap("pos",
                    corenlp_jars=[os.path.join(CORENLP_PYWRAPPER_DIR, jar) for jar in jars])


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-4-b2b6b3b69279> in <module>()
----> 1 from stanford_corenlp_pywrapper import sockwrap
      2 
      3 CORENLP_PYWRAPPER_DIR = os.path.expanduser('~/repos/stanford_corenlp_pywrapper')
      4 jars = ("stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar",
      5         "stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar")

ImportError: No module named stanford_corenlp_pywrapper

In [5]:
import re
import discoursegraphs as dg

# a string enclosed in '_!', possibly with '<P>' before the closing '_!' 
RST_DIS_TEXT_REGEX = re.compile("_!(.*?)(\<P\>)?_!", re.DOTALL)


Couldn't import dot_parser, loading of dot files will not be possible.
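
A quick sanity check of the regex on a hand-made fragment of a *.dis file (the tree structure below is merely illustrative, not copied from the corpus):

In [ ]:
sample_dis = "( Nucleus (leaf 1) (rel2par span) (text _!an elementary discourse unit<P>_!) )"
print RST_DIS_TEXT_REGEX.search(sample_dis).group(1)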

In [7]:
corenlp_result = p.parse_doc("""that its money would be better spent "in areas such as research" and development.""")

print ' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-4ec65294dfcd> in <module>()
----> 1 corenlp_result = p.parse_doc("""that its money would be better spent "in areas such as research" and development.""")
      2 
      3 print ' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])

NameError: name 'p' is not defined

In [8]:
import sys
import glob
import os
import codecs

RSTDT_MAIN_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
RSTDT_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-tokenized')

RSTDT_TEST_FILE = os.path.join(RSTDT_MAIN_ROOT, 'TEST', 'wsj_1306.out.dis')

def tokenize_rst_file(rst_input_path, rst_output_path):
    """Replace each '_!..._!' EDU span in a *.dis file with a double-quoted,
    CoreNLP-tokenized version of its text and write the result to
    rst_output_path; everything between the spans is copied verbatim."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)

        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            corenlp_result = p.parse_doc(doc_untokenized_str)
            corenlp_tokenized_str = u' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(corenlp_tokenized_str))
            input_file_onset = doc_offset
        outfile.write(rstfile_str[input_file_onset:])

In [9]:
# with open(RSTDT_TEST_FILE, 'r') as f:
#     print f.read()[325]

tokenize_rst_file(RSTDT_TEST_FILE, '/tmp/1306.dis')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-6d2e76672065> in <module>()
      2 #     print f.read()[325]
      3 
----> 4 tokenize_rst_file(RSTDT_TEST_FILE, '/tmp/1306.dis')

<ipython-input-8-41190de0ed14> in tokenize_rst_file(rst_input_path, rst_output_path)
     20             doc_offset = edu.end()
     21             doc_untokenized_str = edu.groups()[0]
---> 22             corenlp_result = p.parse_doc(doc_untokenized_str)
     23             corenlp_tokenized_str = u' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
     24             outfile.write(rstfile_str[input_file_onset:doc_onset])

NameError: global name 'p' is not defined

In [ ]:
%%time
for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file(rst_fpath, out_fpath)

Tokenize using nltk.tokenize.treebank.TreebankWordTokenizer


In [10]:
from nltk.tokenize.treebank import TreebankWordTokenizer

In [11]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("""that its money would be better spent "in areas such as research" and development.""")


Out[11]:
['that',
 'its',
 'money',
 'would',
 'be',
 'better',
 'spent',
 '``',
 'in',
 'areas',
 'such',
 'as',
 'research',
 "''",
 'and',
 'development',
 '.']

In [12]:
import re

ENDS_WITH_COMMA = re.compile('(.*),$')
ENDS_WITH_PUNCTUATION = re.compile('(.*)(,|\.|!|:|;)$')

foo = "Cummins Engine Co. , Columbus , Ind.,"
bar = ENDS_WITH_COMMA.sub(r'\1 ,', foo)

BRACKETS = {
    '(': '-LRB-', # round brackets
    ')': '-RRB-',
    '[': '-LSB-', # square brackets
    ']': '-RSB-',
    '{': '-LCB-', # curly brackets
    '}': '-RCB-'
}

def fix_tokenized_sentence(tokenized_sentence):
    # If an EDU ends with punctuation that is still attached to its last
    # token, split it off, e.g. "when it ends," -> "when it ends ,"
    # (the punctuation remains inside the last list element; it becomes a
    # separate token once the tokens are joined with spaces).
    tokenized_sentence[-1] = ENDS_WITH_PUNCTUATION.sub(r'\1 \2', tokenized_sentence[-1])
    # replace brackets with their Penn Treebank escape sequences
    for i, token in enumerate(tokenized_sentence):
        if token in BRACKETS:
            tokenized_sentence[i] = BRACKETS[token]
    return tokenized_sentence
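
A brief check of fix_tokenized_sentence() on two hand-made token lists (the split-off punctuation stays inside the last list element; it becomes a separate token once the list is joined with spaces):

In [ ]:
print fix_tokenized_sentence(['when', 'it', 'ends,'])
print fix_tokenized_sentence(['(', 'investors', ')'])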

In [13]:
# sanity check: a trailing comma is captured as a separate group

ENDS_WITH_PUNCTUATION.match(foo).groups()


Out[13]:
('Cummins Engine Co. , Columbus , Ind.', ',')

In [14]:
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize

TOKENIZER = TreebankWordTokenizer()

def tokenize_rst_file_with_nltk(rst_input_path, rst_output_path, tokenizer):
    """Like tokenize_rst_file(), but splits each EDU into sentences with
    sent_tokenize() and tokenizes them with the given NLTK tokenizer
    instead of Stanford CoreNLP."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)

        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            untokenized_sents = sent_tokenize(doc_untokenized_str)
            tokenized_sents = tokenizer.tokenize_sents(untokenized_sents)
            fixed_tokenized_sents = [fix_tokenized_sentence(sent) for sent in tokenized_sents]
            tokenized_str = u' '.join(tok for sent in fixed_tokenized_sents for tok in sent)

            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(tokenized_str))
            input_file_onset = doc_offset
        outfile.write(rstfile_str[input_file_onset:])
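
As with the CoreNLP variant, the function can be tried on a single document first (the output path below is an arbitrary choice):

In [ ]:
tokenize_rst_file_with_nltk(RSTDT_TEST_FILE, '/tmp/wsj_1306_nltk.dis', TOKENIZER)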

In [15]:
%%time

RSTDT_NLTK_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-nltk-tokenized')

for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file_with_nltk(rst_fpath, out_fpath, TOKENIZER)


CPU times: user 3.18 s, sys: 864 ms, total: 4.04 s
Wall time: 4.05 s
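
A spot check of one of the files just written (re-using the test document referenced above):

In [ ]:
with codecs.open(os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, 'TEST', 'wsj_1306.out.dis'), encoding='utf-8') as tokfile:
    print tokfile.read()[:500]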

In [20]:
TOKENIZER.tokenize("on Monday the small ( investors ) are going to panic and sell")


Out[20]:
['on',
 'Monday',
 'the',
 'small',
 '(',
 'investors',
 ')',
 'are',
 'going',
 'to',
 'panic',
 'and',
 'sell']

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by word_tokenize().
It assumes that the text has already been segmented into sentences, e.g. using sent_tokenize().
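
Note that without prior sentence splitting the Treebank tokenizer only detaches a string-final period, which is why sent_tokenize() is applied first:

In [ ]:
TOKENIZER.tokenize("a tree. You are a ball.")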


In [17]:
from nltk.tokenize import sent_tokenize

In [18]:
sents = sent_tokenize("a tree. You are a ball.")

In [19]:
tokenized_sents = TOKENIZER.tokenize_sents(sents)
u' '.join(tok for sent in tokenized_sents for tok in sent)


Out[19]:
u'a tree . You are a ball .'
