In [1]:
import codecs
import os
import jursegtok
from jursegtok.corpus import OJCorpus
from jursegtok.utils import get_data
from jursegtok import tokenizer
from jursegtok.tokenizer import JurSentTokenizer
from jursegtok import tools
from jursegtok.utils import find_files
from jursegtok.tools import random_sampling
In [2]:
OJCORPUS_ROOTDIR = os.path.expanduser('~/corpora/ojcorpus_cleaned/')
In [3]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
In [4]:
ojc.corpus_path
Out[4]:
In [5]:
docs = [doc for doc in ojc]
In [6]:
doc0 = docs[0]
In [7]:
doc0.filename
Out[7]:
In [8]:
# print doc0.raw_html
In [9]:
# print doc0.plain_text
In [ ]:
In [10]:
jst = JurSentTokenizer()
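In [ ]:
# Added sketch (not from the original run): smoke-test the tokenizer on a
# made-up two-sentence legal-German string before running it on real documents.
jst.sentence_tokenize(u'Die Klage wird abgewiesen. Der Kläger trägt die Kosten des Verfahrens.')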
In [11]:
# jst.get_abbreviations()
In [12]:
# from segtok import tokenizer as segtoktokenizer
# segtoktokenizer.word_tokenizer(' '.join(tree.xpath('//article//text()')))
In [13]:
# segtoktokenizer.word_tokenizer(doc0.plain_text)
In [14]:
doc0.tokens[:10]
Out[14]:
In [15]:
# list(doc0.sentences)
In [ ]:
In [16]:
random_sampling(OJCORPUS_ROOTDIR, k=4, debug=True)
In [17]:
from jursegtok import tokenizer
from jursegtok.utils import create_dir
from jursegtok.tools import sentencelist2string
def convert2sentences(corpuspath, outputpath, debug=False):
    """
    Converts raw OJC data into sentence-segmented plaintext files.

    :param corpuspath: root directory of the raw OJCorpus data
    :param outputpath: directory the *_sentences.txt files are written to
    :param debug: if True, print the paths being processed
    """
    corpus = OJCorpus(corpuspath)
    output = os.path.abspath(outputpath)
    create_dir(output)
    if debug:
        print "corpuspath:", corpuspath
        print "outputpath:", outputpath
    jst = tokenizer.JurSentTokenizer()
    for document in corpus:
        name = document.filename.split('.')[0]
        outfilepath = os.path.join(output, name + '_sentences.txt')
        if debug:
            print "outfilepath:", outfilepath
        with codecs.open(outfilepath, encoding='utf-8', mode='w') as sentencetokenized:
            # sentence_tokenize expects plain text (cf. In [26] below)
            tokenized = sentencelist2string(jst.sentence_tokenize(document.plain_text))
            sentencetokenized.write(tokenized)
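In [ ]:
# Added usage sketch: the output directory is a placeholder, not a path from
# the original run. Writes one *_sentences.txt file per corpus document.
# convert2sentences(OJCORPUS_ROOTDIR, '/tmp/ojc_sentences', debug=True)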
In [18]:
ojc.regenerate_paths()
In [ ]:
In [19]:
# jst = tokenizer.JurSentTokenizer()
# jst.sentence_tokenize(doc0.plain_text)[:5]
In [20]:
# SUBCORPUS_DIR = os.path.join(OJCORPUS_ROOTDIR, 'rawdata/101')
# convert2sentences(SUBCORPUS_DIR, '/tmp/100', debug=False)
In [21]:
# test_corpus = OJCorpus(SUBCORPUS_DIR)
In [22]:
# !ls $test_corpus.corpus_path
In [23]:
# !ls /home/arne/corpora/ojcorpus_cleaned/rawdata
In [24]:
# test_corpus.next()
In [25]:
jst = tokenizer.JurSentTokenizer()
In [26]:
jst.sentence_tokenize(doc0.plain_text)[:2]
Out[26]:
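In [ ]:
# Added sketch: word-tokenize the first segmented sentence with segtok's
# word_tokenizer (the tokenizer already tried in cells 12/13); assumes the
# segtok package is installed.
from segtok.tokenizer import word_tokenizer
word_tokenizer(jst.sentence_tokenize(doc0.plain_text)[0])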
In [27]:
random_sampling(OJCORPUS_ROOTDIR, '/home/arne/corpora/ojcorpus_cleaned/annotated', debug=True)
In [28]:
anno_dir = '/home/arne/corpora/ojcorpus_cleaned/annotated'
for f in find_files(anno_dir):
    tools.htmlgz2text(f, anno_dir)
In [30]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
broken_docs = []
for doc in ojc:
    try:
        doc.plain_text
    except AssertionError:
        broken_docs.append(doc.document_path)
In [33]:
broken_docs
Out[33]:
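In [ ]:
# Added check: how many documents failed plain-text extraction.
len(broken_docs)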
In [31]:
def get_plaintext(doc):
    """Return None if the document parses, else the path of the broken file."""
    try:
        doc.plain_text
        return None
    except AssertionError:
        return doc.document_path
In [34]:
from joblib import Parallel, delayed
In [38]:
# delayed() wraps the function, which is then called with its arguments:
# delayed(get_plaintext)(doc), not delayed(get_plaintext(doc) ...).
broken_parallel = Parallel(n_jobs=1)(delayed(get_plaintext)(doc) for doc in list(ojc))
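In [ ]:
# For reference (added): with n_jobs=1 the Parallel call above is equivalent
# to this plain comprehension; raising n_jobs distributes it over processes.
# broken_serial = [get_plaintext(doc) for doc in list(ojc)]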
In [36]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
In [ ]: