In [1]:
import codecs
import os
import jursegtok
from jursegtok.corpus import OJCorpus
from jursegtok.utils import get_data
from jursegtok import tokenizer
from jursegtok.tokenizer import JurSentTokenizer
from jursegtok import tools
from jursegtok.utils import find_files
from jursegtok.tools import random_sampling
In [2]:
OJCORPUS_ROOTDIR = os.path.expanduser('~/corpora/ojcorpus_cleaned/')
In [3]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
In [4]:
ojc.corpus_path
Out[4]:
In [5]:
docs = [doc for doc in ojc]
In [6]:
doc0 = docs[0]
In [7]:
doc0.filename
Out[7]:
In [8]:
# print doc0.raw_html
In [9]:
# print doc0.plain_text
In [ ]:
In [10]:
jst = JurSentTokenizer()
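In [ ]:
# Added sketch (not from the original run): smoke-test the tokenizer on a
# made-up two-sentence legal-German string before running it on real documents.
jst.sentence_tokenize(u'Die Klage wird abgewiesen. Der Kläger trägt die Kosten des Verfahrens.')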
In [11]:
# jst.get_abbreviations()
In [12]:
# from segtok import tokenizer as segtoktokenizer
# segtoktokenizer.word_tokenizer(' '.join(tree.xpath('//article//text()')))
In [13]:
# segtoktokenizer.word_tokenizer(doc0.plain_text)
In [14]:
doc0.tokens[:10]
Out[14]:
In [15]:
# list(doc0.sentences)
In [ ]:
In [16]:
random_sampling(OJCORPUS_ROOTDIR, k=4, debug=True)
In [17]:
from jursegtok import tokenizer
from jursegtok.utils import create_dir
from jursegtok.tools import sentencelist2string
def convert2sentences(corpuspath, outputpath, debug=False):
    """
    Converts raw OJC data into sentence-segmented plaintext files.

    :param corpuspath: root directory of the raw OJCorpus data
    :param outputpath: directory the *_sentences.txt files are written to
    :param debug: if True, print the paths being processed
    """
    corpus = OJCorpus(corpuspath)
    output = os.path.abspath(outputpath)
    create_dir(output)
    if debug:
        print "corpuspath:", corpuspath
        print "outputpath:", outputpath
    jst = tokenizer.JurSentTokenizer()
    for document in corpus:
        name = document.filename.split('.')[0]
        outfilepath = os.path.join(output, name + '_sentences.txt')
        if debug:
            print "outfilepath:", outfilepath
        with codecs.open(outfilepath, encoding='utf-8', mode='w') as sentencetokenized:
            # sentence_tokenize expects plain text (cf. In [26] below)
            tokenized = sentencelist2string(jst.sentence_tokenize(document.plain_text))
            sentencetokenized.write(tokenized)
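In [ ]:
# Added usage sketch: the output directory is a placeholder, not a path from
# the original run. Writes one *_sentences.txt file per corpus document.
# convert2sentences(OJCORPUS_ROOTDIR, '/tmp/ojc_sentences', debug=True)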
In [18]:
ojc.regenerate_paths()
In [ ]:
In [19]:
# jst = tokenizer.JurSentTokenizer()
# jst.sentence_tokenize(doc0.plain_text)[:5]
In [20]:
# SUBCORPUS_DIR = os.path.join(OJCORPUS_ROOTDIR, 'rawdata/101')
# convert2sentences(SUBCORPUS_DIR, '/tmp/100', debug=False)
In [21]:
# test_corpus = OJCorpus(SUBCORPUS_DIR)
In [22]:
# !ls $test_corpus.corpus_path
In [23]:
# !ls /home/arne/corpora/ojcorpus_cleaned/rawdata
In [24]:
# test_corpus.next()
In [25]:
jst = tokenizer.JurSentTokenizer()
In [26]:
jst.sentence_tokenize(doc0.plain_text)[:2]
Out[26]:
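In [ ]:
# Added sketch: word-tokenize the first segmented sentence with segtok's
# word_tokenizer (the tokenizer already tried in cells 12/13); assumes the
# segtok package is installed.
from segtok.tokenizer import word_tokenizer
word_tokenizer(jst.sentence_tokenize(doc0.plain_text)[0])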
In [27]:
random_sampling(OJCORPUS_ROOTDIR, '/home/arne/corpora/ojcorpus_cleaned/annotated', debug=True)
In [28]:
anno_dir = '/home/arne/corpora/ojcorpus_cleaned/annotated'
for f in find_files(anno_dir):
    tools.htmlgz2text(f, anno_dir)
In [30]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
broken_docs = []
for doc in ojc:
    try:
        doc.plain_text
    except AssertionError:
        broken_docs.append(doc.document_path)
In [33]:
broken_docs
Out[33]:
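In [ ]:
# Added check: how many documents failed plain-text extraction.
len(broken_docs)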
In [31]:
def get_plaintext(doc):
    """Return None if the document parses, else the path of the broken file."""
    try:
        doc.plain_text
        return None
    except AssertionError:
        return doc.document_path
In [34]:
from joblib import Parallel, delayed
In [38]:
# delayed() wraps the function, which is then called with its arguments:
# delayed(get_plaintext)(doc), not delayed(get_plaintext(doc) ...).
broken_parallel = Parallel(n_jobs=1)(delayed(get_plaintext)(doc) for doc in list(ojc))
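In [ ]:
# For reference (added): with n_jobs=1 the Parallel call above is equivalent
# to this plain comprehension; raising n_jobs distributes it over processes.
# broken_serial = [get_plaintext(doc) for doc in list(ojc)]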
In [36]:
ojc = OJCorpus(OJCORPUS_ROOTDIR)
In [ ]: