In [21]:
import os
import codecs
import requests
import simplejson
OPENJUR_TEST_JSON = 'http://openjur.de/u/759909.json'
In [22]:
result = requests.get(OPENJUR_TEST_JSON)
In [23]:
with codecs.open('/tmp/{}'.format(os.path.basename(OPENJUR_TEST_JSON)), 'w', encoding='utf-8') as ojfile:
    ojfile.write(result.text)
In [14]:
openjur_dict = simplejson.loads(result.text)
In [18]:
for k, v in openjur_dict.items():
    print k, type(v)
    # print v
    # print '\n\n'
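In [ ]:
# a quick peek at one of the values (an assumption: 'tenor' is one of the
# keys, as it is used further below; the values are unicode strings):
print openjur_dict['tenor'][:200]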
In [5]:
import treetagger
In [17]:
simplejson.load?
In [6]:
from nltk.internals import find_binary
In [7]:
find_binary(u'tree-tagger-german-utf8')
# find_binary()
Out[7]:
In [10]:
tt = treetagger.TreeTagger()
In [62]:
tenor = tt.tag(openjur_dict['tenor'])
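In [ ]:
# a look at TreeTagger's output format (a sketch: the wrapper is assumed to
# return one [word, pos, lemma] triple per token, with sentence-final
# punctuation tagged as u'$.' in the German STTS tagset):
tenor[:5]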
In [75]:
def split_treetagger_output_to_sentences(tt_output):
    """yields one list of (word, pos, lemma) triples per sentence,
    splitting after sentence-final punctuation ($. in the STTS tagset).
    """
    split_index = 0
    for i, (word, pos, lemma) in enumerate(tt_output):
        if pos == u'$.':
            yield tt_output[split_index:i+1]
            split_index = i + 1
In [76]:
list(split_treetagger_output_to_sentences(tenor))
Out[76]:
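In [ ]:
# a minimal usage sketch: count the sentences in the 'tenor' part and
# reconstruct the first one from its word tokens
sentences = list(split_treetagger_output_to_sentences(tenor))
print len(sentences)
print u' '.join(word for word, pos, lemma in sentences[0])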
In [12]:
tt.tag_sents?
In [81]:
import discoursegraphs as dg
import simplejson
import treetagger
TREE_TAGGER = treetagger.TreeTagger()
class OpenJurDocumentGraph(dg.DiscourseDocumentGraph):
    """
    represents an openJur JSON file as a multidigraph.

    Attributes
    ----------
    ns : str
        the namespace of the graph (default: openjur)
    tokens : list of str
        a list of node IDs (str) which represent the tokens in the
        order they occur in the text (only filled if tokenize=True)
    root : str
        name of the document root node ID
        (default: 'openjur:root_node')
    """
    def __init__(self, openjur_filepath, name=None, namespace='openjur',
                 precedence=False, tokenize=False):
        """
        reads an openJur JSON file and converts it into a multidigraph.

        Parameters
        ----------
        openjur_filepath : str
            relative or absolute path to an openJur JSON file
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        namespace : str
            the namespace of the graph (default: openjur)
        precedence : bool
            add precedence relation edges (root precedes token1, which
            precedes token2 etc.)
        tokenize : bool
            tokenize the text using TreeTagger
        """
        # super calls __init__() of the base class DiscourseDocumentGraph
        super(OpenJurDocumentGraph, self).__init__()
        self.name = name if name else os.path.basename(openjur_filepath)
        self.ns = namespace
        self.root = self.ns+':root_node'
        self.add_node(self.root, layers={self.ns})
        self.tokenize = tokenize
        if self.tokenize:
            self.tokens = []
        self.sentence_count = 1

        with open(openjur_filepath, 'r') as ojfile:
            ojdict = simplejson.load(ojfile)

        for doc_part in ojdict:
            self.add_edge(self.root, doc_part)
            if isinstance(ojdict[doc_part], unicode):
                tagged_tokens = TREE_TAGGER.tag(ojdict[doc_part])
                if tokenize:
                    self.add_tokenized_part(doc_part, tagged_tokens)
                else:
                    self.add_untokenized_part(doc_part, tagged_tokens)

    def add_untokenized_part(self, doc_part, tagged_tokens):
        """adds one node per sentence (with the sentence text as an
        attribute) and connects it to the given document part node."""
        current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
        self.add_edge(doc_part, current_sent_node_id)
        sentence_tokens = []
        for word, pos, lemma in tagged_tokens:
            sentence_tokens.append(word)
            if pos == u'$.':  # sentence-final punctuation
                self.node[current_sent_node_id][self.ns+':text'] = u' '.join(sentence_tokens)
                sentence_tokens = []
                self.sentence_count += 1
                current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
                self.add_edge(doc_part, current_sent_node_id)

    def add_tokenized_part(self, doc_part, tagged_tokens):
        """adds one node per token and connects it to a sentence node,
        which in turn is connected to the given document part node."""
        current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
        self.add_edge(doc_part, current_sent_node_id)
        sent_token_count = 1
        for word, pos, lemma in tagged_tokens:
            token_node_id = 's{}_t{}'.format(self.sentence_count, sent_token_count)
            self.add_node(
                token_node_id,
                attr_dict={'label': word,
                           self.ns+':pos': pos,
                           self.ns+':lemma': lemma})
            self.add_edge(current_sent_node_id, token_node_id)
            self.tokens.append(token_node_id)
            sent_token_count += 1
            if pos == u'$.':  # sentence-final punctuation
                self.sentence_count += 1
                sent_token_count = 1
                current_sent_node_id = 'sentence_{}'.format(self.sentence_count)
                self.add_edge(doc_part, current_sent_node_id)
In [82]:
ojg = OpenJurDocumentGraph('/tmp/{}'.format(os.path.basename(OPENJUR_TEST_JSON)))
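In [ ]:
# a quick sanity check (a sketch, assuming the networkx API that
# dg.DiscourseDocumentGraph builds on): list the document parts attached
# to the root node and the number of sentence nodes under each
for doc_part in ojg.neighbors(ojg.root):
    print doc_part, len(list(ojg.neighbors(doc_part)))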
In [28]:
%load_ext gvmagic
In [59]:
%dotstr dg.print_dot(ojg)