In [21]:
import os
import codecs
import requests
import simplejson

OPENJUR_TEST_JSON = 'http://openjur.de/u/759909.json'

In [22]:
result = requests.get(OPENJUR_TEST_JSON)

In [23]:
with codecs.open('/tmp/{}'.format(os.path.basename(OPENJUR_TEST_JSON)), 'w', encoding='utf-8') as ojfile:
    ojfile.write(result.text)

In [14]:
openjur_dict = simplejson.loads(result.text)

In [18]:
for k, v in openjur_dict.items():
    print k, type(v)
#     print v
#     print '\n\n'


lizenz_hinweis <type 'unicode'>
permalink <type 'unicode'>
gruende_isexcerpt <type 'bool'>
lizenz <type 'unicode'>
normen <type 'list'>
gruende <type 'unicode'>
datum <type 'unicode'>
tatbestand <type 'unicode'>
fundstelle <type 'unicode'>
permalink_json <type 'unicode'>
leitsaetze <type 'unicode'>
gericht <type 'unicode'>
tenor <type 'unicode'>
vorinstanz <type 'unicode'>
dokumenttyp <type 'unicode'>
aktenzeichen <type 'unicode'>
rechtsgebiete <type 'list'>

In [5]:
import treetagger

In [17]:
simplejson.load?

In [6]:
from nltk.internals import find_binary

In [7]:
find_binary(u'tree-tagger-german-utf8')
# find_binary()


[Found tree-tagger-german-utf8: /home/arne/repos/treetagger_german/cmd/tree-tagger-german-utf8]
Out[7]:
u'/home/arne/repos/treetagger_german/cmd/tree-tagger-german-utf8'

In [10]:
tt = treetagger.TreeTagger()

In [62]:
tenor = tt.tag(openjur_dict['tenor'])

In [75]:
def split_treetagger_output_to_sentences(tt_output):
    split_index = 0
    for i, (word, pos, lemma) in enumerate(tt_output):
        if pos == u'$.':
            last_split_index = split_index
            split_index = i
            yield tt_output[last_split_index+1:i+1]

In [76]:
list(split_treetagger_output_to_sentences(tenor))


Out[76]:
[[[u'die', u'ART', u'die'],
  [u'Revision', u'NN', u'Revision'],
  [u'der', u'ART', u'die'],
  [u'Kl\xe4gerin', u'NN', u'Kl\xe4gerin'],
  [u'wird', u'VAFIN', u'werden'],
  [u'das', u'ART', u'die'],
  [u'Urteil', u'NN', u'Urteil'],
  [u'des', u'ART', u'die'],
  [u'9', u'CARD', u'9'],
  [u'.', u'$.', u'.']],
 [[u'Zivilsenats', u'NN', u'Zivilsenat'],
  [u'des', u'ART', u'die'],
  [u'Oberlandesgerichts', u'NN', u'Oberlandesgericht'],
  [u'Stuttgart', u'NE', u'Stuttgart'],
  [u'vom', u'APPRART', u'von'],
  [u'27', u'CARD', u'27'],
  [u'.', u'$.', u'.']],
 [[u'Juni', u'NN', u'Juni'],
  [u'2012', u'CARD', u'@card@'],
  [u'aufgehoben', u'VVPP', u'aufheben'],
  [u'.', u'$.', u'.']],
 [[u'Die', u'ART', u'die'],
  [u'Sache', u'NN', u'Sache'],
  [u'wird', u'VAFIN', u'werden'],
  [u'zur', u'APPRART', u'zu'],
  [u'neuen', u'ADJA', u'neu'],
  [u'Verhandlung', u'NN', u'Verhandlung'],
  [u'und', u'KON', u'und'],
  [u'Entscheidung', u'NN', u'Entscheidung'],
  [u',', u'$,', u','],
  [u'auch', u'ADV', u'auch'],
  [u'\xfcber', u'APPR', u'\xfcber'],
  [u'die', u'ART', u'die'],
  [u'Kosten', u'NN', u'Kosten'],
  [u'des', u'ART', u'die'],
  [u'Revisionsverfahrens', u'NN', u'Revisionsverfahren'],
  [u',', u'$,', u','],
  [u'an', u'APPR', u'an'],
  [u'das', u'ART', u'die'],
  [u'Berufungsgericht', u'NN', u'Berufungsgericht'],
  [u'zur\xfcckverwiesen', u'VVPP', u'zur\xfcckverweisen'],
  [u'.', u'$.', u'.']],
 [[u'Von', u'APPR', u'von'],
  [u'Rechts', u'NN', u'Recht'],
  [u'wegen', u'APPO', u'wegen'],
  [u'.', u'$.', u'.']]]

In [12]:
tt.tag_sents?

In [81]:
import discoursegraphs as dg
import simplejson

import treetagger

TREE_TAGGER = treetagger.TreeTagger()


class OpenJurDocumentGraph(dg.DiscourseDocumentGraph):
    """
    represents an openJur JSON file as a multidigraph.

    Attributes
    ----------
    ns : str
        the namespace of the graph (default: conano)
    tokens : list of int
        a list of node IDs (int) which represent the tokens in the
        order they occur in the text
    root : str
        name of the document root node ID
        (default: 'openjur:root_node')
    """
    def __init__(self, openjur_filepath, name=None, namespace='openjur',
                 precedence=False, tokenize=False):
        """
        reads a openJur JSON file and converts it into a multidigraph.

        Parameters
        ----------
        openjur_filepath : str
            relative or absolute path to a Conano XML file
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        namespace : str
            the namespace of the graph (default: openjur)
        precedence : bool
            add precedence relation edges (root precedes token1, which precedes
            token2 etc.)
        tokenize : bool
            Tokenize the text using TreeTagger
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(OpenJurDocumentGraph, self).__init__()

        self.name = name if name else os.path.basename(openjur_filepath)
        self.ns = namespace
        self.root = self.ns+':root_node'
        self.add_node(self.root, layers={self.ns})

        self.tokenize = tokenize
        if self.tokenize:
            self.tokens = []
#             self.token_count = 1

        self.sentence_count = 1
            
        with open(openjur_filepath, 'r') as ojfile:
            ojdict = simplejson.load(ojfile)
            for doc_part in ojdict:
                self.add_edge(self.root, doc_part)
                if isinstance(ojdict[doc_part], unicode):
                    parsed_string = TREE_TAGGER.tag(ojdict[doc_part])
                    if tokenize:
                        self.add_tokenized_part(doc_part, parsed_string)
                    else:
                        self.add_untokenized_part(doc_part, parsed_string)

    def add_untokenized_part(self, parsed_string, doc_part):
        current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
        self.add_edge(doc_part, current_sent_node_id)
        sent_token_count = 1
        
        sentence_tokens = []
        for word, pos, lemma in parsed_string:
            sentence_tokens.append(word)
            if pos == u'$.':
                self.sentence_count += 1
                self.node[current_sent_node_id][self.ns+':text'] = u' '.join(sentence_tokens)
                sentence_tokens = []                                
                                
    def add_tokenized_part(self, parsed_string, doc_part):
        for word, pos, lemma in parsed_string:
            token_node_id = 's{}_t{}'.format(self.sentence_count, sent_token_count)
            self.add_node(
                token_node_id,
                attr_dict={'label': word,
                           self.ns+':pos': pos,
                           self.ns+':lemma': lemma})
            self.add_edge(current_sent_node_id, token_node_id)
            sent_token_count += 1
            if pos == u'$.':
                self.sentence_count += 1

In [82]:
ojg = OpenJurDocumentGraph('/tmp/{}'.format(os.path.basename(OPENJUR_TEST_JSON)))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-82-09df9f7e6044> in <module>()
----> 1 ojg = OpenJurDocumentGraph('/tmp/{}'.format(os.path.basename(OPENJUR_TEST_JSON)))

<ipython-input-81-131f14aa2a51> in __init__(self, openjur_filepath, name, namespace, precedence, tokenize)
     66                         self.add_tokenized_part(doc_part, parsed_string)
     67                     else:
---> 68                         self.add_untokenized_part(doc_part, parsed_string)
     69 
     70     def add_untokenized_part(self, parsed_string, doc_part):

<ipython-input-81-131f14aa2a51> in add_untokenized_part(self, parsed_string, doc_part)
     70     def add_untokenized_part(self, parsed_string, doc_part):
     71         current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
---> 72         self.add_edge(doc_part, current_sent_node_id)
     73         sent_token_count = 1
     74 

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/discoursegraph.pyc in add_edge(self, u, v, layers, key, attr_dict, **attr)
    348         for node in (u, v):  # u = source, v = target
    349             if node not in self.nodes_iter():
--> 350                 self.add_node(node, layers={self.ns})
    351 
    352         if v in self.succ[u]:  # if there's already an edge from u to v

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/discoursegraph.pyc in add_node(self, n, layers, attr_dict, **attr)
    151 
    152         # if there's no node with this ID in the graph, yet
--> 153         if n not in self.succ:
    154             self.succ[n] = {}
    155             self.pred[n] = {}

TypeError: unhashable type: 'list'

In [28]:
%load_ext gvmagic

In [59]:
%dotstr dg.print_dot(ojg)


759909.json gruende_isexcerpt gruende_isexcerpt datum datum tatbestand tatbestand sentence_235 sentence_235 tatbestand->sentence_235 tenor tenor sentence_263 sentence_263 tenor->sentence_263 openjur:root_node openjur:root_node openjur:root_node->gruende_isexcerpt openjur:root_node->datum openjur:root_node->tatbestand openjur:root_node->tenor normen normen openjur:root_node->normen aktenzeichen aktenzeichen openjur:root_node->aktenzeichen gericht gericht openjur:root_node->gericht fundstelle fundstelle openjur:root_node->fundstelle gruende gruende openjur:root_node->gruende rechtsgebiete rechtsgebiete openjur:root_node->rechtsgebiete dokumenttyp dokumenttyp openjur:root_node->dokumenttyp permalink_json permalink_json openjur:root_node->permalink_json lizenz_hinweis lizenz_hinweis openjur:root_node->lizenz_hinweis lizenz lizenz openjur:root_node->lizenz vorinstanz vorinstanz openjur:root_node->vorinstanz leitsaetze leitsaetze openjur:root_node->leitsaetze permalink permalink openjur:root_node->permalink sentence_2 sentence_2 gruende->sentence_2 sentence_1 sentence_1 lizenz_hinweis->sentence_1 sentence_261 sentence_261 leitsaetze->sentence_261 discoursegraph:root_node discoursegraph:root_node

In [ ]: