In [ ]:
import os
In [4]:
from stanford_corenlp_pywrapper import sockwrap
CORENLP_PYWRAPPER_DIR = os.path.expanduser('~/repos/stanford_corenlp_pywrapper')
jars = ("stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar",
"stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar")
p=sockwrap.SockWrap("pos",
corenlp_jars=[os.path.join(CORENLP_PYWRAPPER_DIR, jar) for jar in jars])
In [5]:
import re
import discoursegraphs as dg
# a string enclosed in '_!', possibly with '<P>' before the closing '_!'
RST_DIS_TEXT_REGEX = re.compile("_!(.*?)(\<P\>)?_!", re.DOTALL)
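As a quick check, the regex can be applied to a made-up fragment in the style of a *.dis file (the fragment itself is invented, not from the corpus); group 1 is the raw EDU text, group 2 the optional '<P>' marker:
In [ ]:
# hypothetical *.dis-style fragment
sample = '( Nucleus (leaf 1) (rel2par span) (text _!But he added:_!) )'
RST_DIS_TEXT_REGEX.search(sample).groups()
# -> ('But he added:', None)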
In [7]:
corenlp_result = p.parse_doc("""that its money would be better spent "in areas such as research" and development.""")
print ' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
In [8]:
import sys
import glob
import os
import codecs
RSTDT_MAIN_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0')
RSTDT_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-tokenized')
RSTDT_TEST_FILE = os.path.join(RSTDT_MAIN_ROOT, 'TEST', 'wsj_1306.out.dis')
def tokenize_rst_file(rst_input_path, rst_output_path):
    """Rewrite an RST-DT *.dis file, replacing each '_!..._!' EDU span with a
    double-quoted, CoreNLP-tokenized version of the EDU text."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)
        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            corenlp_result = p.parse_doc(doc_untokenized_str)
            corenlp_tokenized_str = u' '.join(tok for sent in corenlp_result['sentences'] for tok in sent['tokens'])
            # copy everything between the previous EDU and this one verbatim,
            # then write the tokenized EDU text
            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(corenlp_tokenized_str))
            input_file_onset = doc_offset
        # copy the remainder of the file after the last EDU
        outfile.write(rstfile_str[input_file_onset:])
In [9]:
# with open(RSTDT_TEST_FILE, 'r') as f:
# print f.read()[325]
tokenize_rst_file(RSTDT_TEST_FILE, '/tmp/1306.dis')
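As an optional spot check, the rewritten file from the cell above can be read back and the first few hundred characters printed:
In [ ]:
# eyeball the output written to /tmp/1306.dis above
with codecs.open('/tmp/1306.dis', 'r', encoding='utf-8') as f:
    print f.read()[:300]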
In [ ]:
%%time
for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file(rst_fpath, out_fpath)
In [10]:
from nltk.tokenize.treebank import TreebankWordTokenizer
In [11]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("""that its money would be better spent "in areas such as research" and development.""")
Out[11]:
In [12]:
import re
ENDS_WITH_COMMA = re.compile('(.*),$')
ENDS_WITH_PUNCTUATION = re.compile('(.*)(,|\.|!|:|;)$')  # the '.' must be escaped, otherwise it matches any final character
foo = "Cummins Engine Co. , Columbus , Ind.,"
bar = ENDS_WITH_COMMA.sub(r'\1 ,', foo)
BRACKETS = {
    '(': '-LRB-',  # round brackets
    ')': '-RRB-',
    '[': '-LSB-',  # square brackets
    ']': '-RSB-',
    '{': '-LCB-',  # curly brackets
    '}': '-RCB-'
}
def fix_tokenized_sentence(tokenized_sentence):
    # If an EDU ends with punctuation that is still attached to the last token,
    # split it off, e.g. "when it ends," -> "when it ends ,"
    tokenized_sentence[-1] = ENDS_WITH_PUNCTUATION.sub(r'\1 \2', tokenized_sentence[-1])
    # replace brackets with their PTB-style escape sequences
    for i, token in enumerate(tokenized_sentence):
        if token in BRACKETS:
            tokenized_sentence[i] = BRACKETS[token]
    return tokenized_sentence
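A small sanity check on an invented token list (trailing comma split off, brackets replaced by their PTB escapes):
In [ ]:
fix_tokenized_sentence(['(', 'investors', ')', 'panicked,'])
# -> ['-LRB-', 'investors', '-RRB-', 'panicked ,']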
In [13]:
ENDS_WITH_PUNCTUATION = re.compile('(.*)(,|\.|!|:|;)$')
ENDS_WITH_PUNCTUATION.match(foo).groups()
Out[13]:
In [14]:
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
TOKENIZER = TreebankWordTokenizer()
def tokenize_rst_file_with_nltk(rst_input_path, rst_output_path, tokenizer):
    """Rewrite an RST-DT *.dis file, replacing each '_!..._!' EDU span with a
    double-quoted version that is sentence-split and word-tokenized with NLTK."""
    with open(rst_input_path, 'r') as rstfile, codecs.open(rst_output_path, 'w', encoding='utf-8') as outfile:
        rstfile_str = rstfile.read()
        input_file_onset = 0
        edu_matches = RST_DIS_TEXT_REGEX.finditer(rstfile_str)
        for edu in edu_matches:
            doc_onset = edu.start()
            doc_offset = edu.end()
            doc_untokenized_str = edu.groups()[0]
            untokenized_sents = sent_tokenize(doc_untokenized_str)
            tokenized_sents = tokenizer.tokenize_sents(untokenized_sents)
            fixed_tokenized_sents = [fix_tokenized_sentence(sent) for sent in tokenized_sents]
            tokenized_str = u' '.join(tok for sent in fixed_tokenized_sents for tok in sent)
            # copy everything up to this EDU verbatim, then write the tokenized EDU text
            outfile.write(rstfile_str[input_file_onset:doc_onset])
            outfile.write(u'"{}"'.format(tokenized_str))
            input_file_onset = doc_offset
        # copy the remainder of the file after the last EDU
        outfile.write(rstfile_str[input_file_onset:])
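The same kind of single-file check as for the CoreNLP version works here as well (the output path is arbitrary):
In [ ]:
tokenize_rst_file_with_nltk(RSTDT_TEST_FILE, '/tmp/1306-nltk.dis', TOKENIZER)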
In [15]:
%%time
RSTDT_NLTK_TOKENIZED_ROOT = os.path.expanduser('~/repos/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0-nltk-tokenized')
for folder in ('TEST', 'TRAINING'):
    for rst_fpath in glob.glob(os.path.join(RSTDT_MAIN_ROOT, folder, '*.dis')):
        out_fpath = os.path.join(RSTDT_NLTK_TOKENIZED_ROOT, folder, os.path.basename(rst_fpath))
        out_dir, _fname = os.path.split(out_fpath)
        dg.util.create_dir(out_dir)
        tokenize_rst_file_with_nltk(rst_fpath, out_fpath, TOKENIZER)
In [20]:
TOKENIZER.tokenize("on Monday the small ( investors ) are going to panic and sell")
Out[20]:
The Treebank tokenizer uses regular expressions to tokenize text as in the Penn Treebank.
This is the tokenizer that word_tokenize() invokes.
It assumes that the text has already been segmented into sentences, e.g. using sent_tokenize().
In [17]:
from nltk.tokenize import sent_tokenize
In [18]:
sents = sent_tokenize("a tree. You are a ball.")
In [19]:
tokenized_sents = TOKENIZER.tokenize_sents(sents)
u' '.join(tok for sent in tokenized_sents for tok in sent)
Out[19]: