Tokenization of RST-DT files using off-the-shelf tokenizers


In [ ]:
import os

In [4]:
from stanford_corenlp_pywrapper import sockwrap

CORENLP_PYWRAPPER_DIR = os.path.expanduser('~/repos/stanford_corenlp_pywrapper')
jars = ("stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar",
        "stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar")

p=sockwrap.SockWrap("pos",
                    corenlp_jars=[os.path.join(CORENLP_PYWRAPPER_DIR, jar) for jar in jars])


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-4-b2b6b3b69279> in <module>()
----> 1 from stanford_corenlp_pywrapper import sockwrap
      2 
      3 CORENLP_PYWRAPPER_DIR = os.path.expanduser('~/repos/stanford_corenlp_pywrapper')
      4 jars = ("stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar",
      5         "stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar")

ImportError: No module named stanford_corenlp_pywrapper

In [5]:
import re
import discoursegraphs as dg

# a string enclosed in '_!', possibly with '<P>' before the closing '_!' 
RST_DIS_TEXT_REGEX = re.compile("_!(.*?)(\<P\>)?_!", re.DOTALL)


Couldn't import dot_parser, loading of dot files will not be possible.
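
A quick sanity check of the regex on a hand-made fragment of a *.dis file (the tree structure below is merely illustrative, not copied from the corpus):

In [ ]:
sample_dis = "( Nucleus (leaf 1) (rel2par span) (text _!an elementary discourse unit<P>_!) )"
print RST_DIS_TEXT_REGEX.search(sample_dis).group(1)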

In [7]:
corenlp_result = p.parse_doc("""that its money would be better spent "in areas such as research" and development.""")

print ' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-4ec65294dfcd> in <module>()
----> 1 corenlp_result = p.parse_doc("""that its money would be better spent "in areas such as research" and development.""")
      2 
      3 print ' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])

NameError: name 'p' is not defined

In [8]:
import sys
import glob
import os
import codecs

RSTDT_MAIN_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
RSTDT_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-tokenized')

RSTDT_TEST_FILE = os.path.join(RSTDT_MAIN_ROOT, 'TEST', 'wsj_1306.out.dis')

def tokenize_rst_file(rst_input_path, rst_output_path):
    """Replace each '_!..._!' EDU span in a *.dis file with a double-quoted,
    CoreNLP-tokenized version of its text and write the result to
    rst_output_path; everything between the spans is copied verbatim."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)

        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            corenlp_result = p.parse_doc(doc_untokenized_str)
            corenlp_tokenized_str = u' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(corenlp_tokenized_str))
            input_file_onset = doc_offset
        outfile.write(rstfile_str[input_file_onset:])

In [9]:
# with open(RSTDT_TEST_FILE, 'r') as f:
#     print f.read()[325]

tokenize_rst_file(RSTDT_TEST_FILE, '/tmp/1306.dis')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-6d2e76672065> in <module>()
      2 #     print f.read()[325]
      3 
----> 4 tokenize_rst_file(RSTDT_TEST_FILE, '/tmp/1306.dis')

<ipython-input-8-41190de0ed14> in tokenize_rst_file(rst_input_path, rst_output_path)
     20             doc_offset = edu.end()
     21             doc_untokenized_str = edu.groups()[0]
---> 22             corenlp_result = p.parse_doc(doc_untokenized_str)
     23             corenlp_tokenized_str = u' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
     24             outfile.write(rstfile_str[input_file_onset:doc_onset])

NameError: global name 'p' is not defined

In [ ]:
%%time
for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file(rst_fpath, out_fpath)

Tokenize using nltk.tokenize.treebank.TreebankWordTokenizer


In [10]:
from nltk.tokenize.treebank import TreebankWordTokenizer

In [11]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("""that its money would be better spent "in areas such as research" and development.""")


Out[11]:
['that',
 'its',
 'money',
 'would',
 'be',
 'better',
 'spent',
 '``',
 'in',
 'areas',
 'such',
 'as',
 'research',
 "''",
 'and',
 'development',
 '.']

In [12]:
import re

ENDS_WITH_COMMA = re.compile('(.*),$')
ENDS_WITH_PUNCTUATION = re.compile('(.*)(,|\.|!|:|;)$')

foo = "Cummins Engine Co. , Columbus , Ind.,"
bar = ENDS_WITH_COMMA.sub(r'\1 ,', foo)

BRACKETS = {
    '(': '-LRB-', # round brackets
    ')': '-RRB-',
    '[': '-LSB-', # square brackets
    ']': '-RSB-',
    '{': '-LCB-', # curly brackets
    '}': '-RCB-'
}

def fix_tokenized_sentence(tokenized_sentence):
    # If an EDU ends with punctuation that is still attached to its last
    # token, split it off, e.g. "when it ends," -> "when it ends ,"
    # (the punctuation remains inside the last list element; it becomes a
    # separate token once the tokens are joined with spaces).
    tokenized_sentence[-1] = ENDS_WITH_PUNCTUATION.sub(r'\1 \2', tokenized_sentence[-1])
    # replace brackets with their Penn Treebank escape sequences
    for i, token in enumerate(tokenized_sentence):
        if token in BRACKETS:
            tokenized_sentence[i] = BRACKETS[token]
    return tokenized_sentence
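
A brief check of fix_tokenized_sentence() on two hand-made token lists (the split-off punctuation stays inside the last list element; it becomes a separate token once the list is joined with spaces):

In [ ]:
print fix_tokenized_sentence(['when', 'it', 'ends,'])
print fix_tokenized_sentence(['(', 'investors', ')'])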

In [13]:
# sanity check: a trailing comma is captured as a separate group

ENDS_WITH_PUNCTUATION.match(foo).groups()


Out[13]:
('Cummins Engine Co. , Columbus , Ind.', ',')

In [14]:
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize

TOKENIZER = TreebankWordTokenizer()

def tokenize_rst_file_with_nltk(rst_input_path, rst_output_path, tokenizer):
    """Like tokenize_rst_file(), but splits each EDU into sentences with
    sent_tokenize() and tokenizes them with the given NLTK tokenizer
    instead of Stanford CoreNLP."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)

        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            untokenized_sents = sent_tokenize(doc_untokenized_str)
            tokenized_sents = tokenizer.tokenize_sents(untokenized_sents)
            fixed_tokenized_sents = [fix_tokenized_sentence(sent) for sent in tokenized_sents]
            tokenized_str = u' '.join(tok for sent in fixed_tokenized_sents for tok in sent)

            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(tokenized_str))
            input_file_onset = doc_offset
        outfile.write(rstfile_str[input_file_onset:])
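
As with the CoreNLP variant, the function can be tried on a single document first (the output path below is an arbitrary choice):

In [ ]:
tokenize_rst_file_with_nltk(RSTDT_TEST_FILE, '/tmp/wsj_1306_nltk.dis', TOKENIZER)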

In [15]:
%%time

RSTDT_NLTK_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-nltk-tokenized')

for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file_with_nltk(rst_fpath, out_fpath, TOKENIZER)


CPU times: user 3.18 s, sys: 864 ms, total: 4.04 s
Wall time: 4.05 s
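
A spot check of one of the files just written (re-using the test document referenced above):

In [ ]:
with codecs.open(os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, 'TEST', 'wsj_1306.out.dis'), encoding='utf-8') as tokfile:
    print tokfile.read()[:500]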

In [20]:
TOKENIZER.tokenize("on Monday the small ( investors ) are going to panic and sell")


Out[20]:
['on',
 'Monday',
 'the',
 'small',
 '(',
 'investors',
 ')',
 'are',
 'going',
 'to',
 'panic',
 'and',
 'sell']

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by word_tokenize().
It assumes that the text has already been segmented into sentences, e.g. using sent_tokenize().
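
Note that without prior sentence splitting the Treebank tokenizer only detaches a string-final period, which is why sent_tokenize() is applied first:

In [ ]:
TOKENIZER.tokenize("a tree. You are a ball.")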


In [17]:
from nltk.tokenize import sent_tokenize

In [18]:
sents = sent_tokenize("a tree. You are a ball.")

In [19]:
tokenized_sents = TOKENIZER.tokenize_sents(sents)
u' '.join(tok for sent in tokenized_sents for tok in sent)


Out[19]:
u'a tree . You are a ball .'
