In [1]:
import codecs
import os

import jursegtok
from jursegtok.corpus import OJCorpus
from jursegtok.utils import get_data
from jursegtok import tokenizer
from jursegtok.tokenizer import JurSentTokenizer
from jursegtok import tools

from jursegtok.utils import find_files
from jursegtok.tools import random_sampling

In [2]:
OJCORPUS_ROOTDIR = os.path.expanduser('~/corpora/ojcorpus_cleaned/')

In [3]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)

In [4]:
ojc.corpus_path


Out[4]:
'/home/arne/corpora/ojcorpus_cleaned'

In [5]:
docs = [doc for doc in ojc]

In [6]:
doc0 = docs[0]

In [7]:
doc0.filename


Out[7]:
'185990.html.gz'

In [8]:
# print doc0.raw_html

In [9]:
# print doc0.plain_text

In [ ]:


In [10]:
jst = JurSentTokenizer()

In [11]:
# jst.get_abbreviations()

In [12]:
# from segtok import tokenizer as segtoktokenizer

# segtoktokenizer.word_tokenizer(' '.join(tree.xpath('//article//text()')))

In [13]:
# segtoktokenizer.word_tokenizer(doc0.plain_text)

In [14]:
doc0.tokens[:10]


Out[14]:
[u'VGH',
 u'Baden-W\xfcrttemberg',
 u'\xb7',
 u'Beschluss',
 u'vom',
 u'3.',
 u'Juni',
 u'1991',
 u'\xb7',
 u'Az.']

In [15]:
# list(doc0.sentences)

In [ ]:


In [16]:
random_sampling(OJCORPUS_ROOTDIR, k=4, debug=True)


708809.html.gz
272114.html.gz
687451.html.gz
648396.html.gz

In [17]:
from jursegtok import tokenizer
from jursegtok.utils import create_dir
from jursegtok.tools import sentencelist2string

def convert2sentences(corpuspath, outputpath, debug=False):
    """
    converts raw ojc data to sentence segmented plaintext files
    :param corpuspath:
    :param outputpath:
    :return:
    """       
    corpus = OJCorpus(corpuspath)
    output = os.path.abspath(outputpath)
    create_dir(output)

    if debug:
        print "corpuspath:", corpuspath
        print "outputpath:", outputpath
        
    jst = tokenizer.JurSentTokenizer()
    for document in corpus:
        name = document.filename.split('.')[0]
        outfilepath = os.path.join(output, name+'_sentences.txt')
        if debug:
            print "outfilepath:", outfilepath

        with codecs.open(outfilepath, encoding='utf-8', mode='w') as sentencetokenized:
            tokenized = sentencelist2string(jst.sentence_tokenize(document))
            sentencetokenized.write(tokenized)

In [18]:
ojc.regenerate_paths()

In [ ]:


In [19]:
# jst = tokenizer.JurSentTokenizer()

# jst.sentence_tokenize(doc0.plain_text)[:5]

In [20]:
# SUBCORPUS_DIR = os.path.join(OJCORPUS_ROOTDIR, 'rawdata/101')

# convert2sentences(SUBCORPUS_DIR, '/tmp/100', debug=False)

In [21]:
# test_corpus = OJCorpus(SUBCORPUS_DIR)

In [22]:
# !ls $test_corpus.corpus_path

In [23]:
# !ls /home/arne/corpora/ojcorpus_cleaned/rawdata

In [24]:
# test_corpus.next()

In [25]:
jst = tokenizer.JurSentTokenizer()

In [26]:
jst.sentence_tokenize(doc0.plain_text)[:2]


Out[26]:
[u'\t\t\n \n \n \n VGH Baden-W\xfcrttemberg  \xb7 Beschluss vom  3. Juni 1991 \xb7 Az. 1 S 1484/91 \n \n \n\n\t\t \n\t\t\n\t\t \n\n\t\t Schnellzugriff: \n\t\t \n\t\t Druckansicht Download Editieren \n\t\t \n\t\t \n\n\t\t \n\n\t\t Informationen zum Urteil \n \n\t\t \n\n\t\t\n\t\t \n\t\t\t \n\t\t\t\t \n\t\t\t\t\t Gericht: \n\t\t\t\t\t VGH Baden-W\xfcrttemberg \n\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t Datum: \n\n\t\t\t\t\t  3. Juni 1991 \n\t\t\t\t \n\t\t\t\t \n\n\t\t\t\t\t Aktenzeichen: \n\t\t\t\t\t 1 S 1484/91 \n\t\t\t\t \n\t\t\t\t \n\n\t\t\t\t\t Typ: \n\t\t\t\t\t Beschluss \n\t\t\t\t \n\t\t\t \n\t\t\t \n\t\t\t\t \n\t\t\t\t\t Fundstelle: \n\n\t\t\t\t\t openJur 2013, 7757 \n\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t Verfahrensgang: \n\t\t\t\t\t   \n\n\t\t\t\t \n\t\t\t \n\t\t\t\n\t\t\t  \n\t\t\t  \n\t   \n\t\t\t  \t\t\t\n\t\t\t \n\t\t\t \n\t\t\t\n\t\t \t\t\n\n\t\t \n\n\t\t \n\t\t  1.',
 u'Lehnt es das Innenministerium durch Sperrerkl\xe4rung in entsprechender Anwendung von \xa7 96 StPO ab, dem Strafgericht auf dessen Ersuchen Name und Anschrift eines verdeckten Ermittlers und einer Vertrauensperson mitzuteilen, weil die Preisgabe der Identit\xe4t die sachgerechte Erf\xfcllung der Aufgaben der Kriminalpolizei bei der Bek\xe4mpfung der Rauschgiftkriminalit\xe4t und au\xdferdem Leib und Leben dieser Personen gef\xe4hrden w\xfcrde, ist f\xfcr die \xdcberpr\xfcfung der Rechtm\xe4\xdfigkeit der Sperrerkl\xe4rung der Verwaltungsrechtsweg (\xa7 40 Abs 1 S 1 VwGO) gegeben.']

TODO: manual SBD on 10 random documents


In [27]:
random_sampling(OJCORPUS_ROOTDIR, '/home/arne/corpora/ojcorpus_cleaned/annotated', debug=True)


711619.html.gz
118122.html.gz
477014.html.gz
661815.html.gz
470659.html.gz
89545.html.gz
108096.html.gz
618019.html.gz
197211.html.gz
240484.html.gz

In [28]:
anno_dir = '/home/arne/corpora/ojcorpus_cleaned/annotated'
for f in find_files(anno_dir):
    tools.htmlgz2text(f, anno_dir)


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-28-a727450785fb> in <module>()
      1 anno_dir = '/home/arne/corpora/ojcorpus_cleaned/annotated'
      2 for f in find_files(anno_dir):
----> 3     tools.htmlgz2text(f, anno_dir)

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/jursegtok-0.1-py2.7.egg/jursegtok/tools.pyc in htmlgz2text(htmlgz_doc_path, output_path)
     36     doc_id = doc.filename.split('.')[0]
     37     with codecs.open(os.path.join(output_path, doc_id+'.txt'), 'w', 'utf-8') as out_file:
---> 38         out_file.write(doc.plain_text)
     39 
     40 

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/jursegtok-0.1-py2.7.egg/jursegtok/corpus.pyc in plain_text(self)
     66     def plain_text(self):
     67         tree = self._get_html_tree()
---> 68         return u' '.join(tree.xpath('//article//text()'))
     69 
     70     @property

src/lxml/lxml.etree.pyx in lxml.etree._ElementTree.xpath (src/lxml/lxml.etree.c:70708)()

src/lxml/lxml.etree.pyx in lxml.etree._ElementTree._assertHasRoot (src/lxml/lxml.etree.c:66083)()

AssertionError: ElementTree not initialized, missing root

In [29]:


In [30]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)

broken_docs = []

for doc in ojc:
    try:
        doc.plain_text
    except AssertionError as e:
        broken_docs.append(doc.document_path)

In [33]:
broken_docs


Out[33]:
['/home/arne/corpora/ojcorpus_cleaned/rawdata/471/471267.html.gz']

In [31]:
def get_plaintext(doc):
    try:
        doc.plain_text
        return None
    except AssertionError as e:
        return doc.document_path

In [34]:
from joblib import Parallel, delayed

In [38]:
broken_parallel = Parallel(n_jobs=1)(delayed(get_plaintext(doc) for doc in list(ojc)))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-38-ac078bea5a94> in <module>()
----> 1 broken_parallel = Parallel(n_jobs=1)(delayed(get_plaintext(doc) for doc in list(ojc)))

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/joblib/parallel.pyc in delayed(function, check_pickle)
    160     # using with multiprocessing:
    161     if check_pickle:
--> 162         pickle.dumps(function)
    163 
    164     def delayed_function(*args, **kwargs):

/home/arne/.virtualenvs/notebook/lib/python2.7/copy_reg.pyc in _reduce_ex(self, proto)
     68     else:
     69         if base is self.__class__:
---> 70             raise TypeError, "can't pickle %s objects" % base.__name__
     71         state = base(self)
     72     args = (self.__class__, base, state)

TypeError: can't pickle generator objects

In [36]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)

In [ ]: