In [3]:
import os
import networkx as nx
import discoursegraphs as dg
import pocores as pc
# Directory (relative to the user's home) that holds the parser output files
# used in this notebook.
MATE_PARSED_TUEBA_DIR = os.path.expanduser(
    '~/corpora/tueba/tueba_mate_output')
# The single file from that directory that the cells below run on.
TUEBA_TEST_FILE = os.path.join(
    MATE_PARSED_TUEBA_DIR, 'text_104.bp')
In [4]:
# Run pocores on the test file. input_format='2009' is presumably the
# CoNLL-2009 column layout -- confirm against the pocores documentation.
# The output destination is /tmp/<basename of the test file>.pocores.
pocograph = pc.run_pocores(
    TUEBA_TEST_FILE,
    input_format='2009',
    output_dest='/tmp/{}.pocores'.format(os.path.basename(TUEBA_TEST_FILE)),
)
In [5]:
# Print whatever pc.main.output_with_brackets() produces for the pocores
# result. With a single argument, print(...) behaves exactly like the
# Python 2 print statement.
print(pc.main.output_with_brackets(pocograph))
In [6]:
# Read one TigerXML syntax file into a document graph.
# The path is resolved relative to the current user's home directory (like the
# other corpus paths in this notebook) instead of being hard-coded to one
# specific account's /home directory.
tdg = dg.read_tiger(os.path.expanduser(
    '~/repos/pcc-annis-merged/maz176/syntax/maz-10207.xml'))
In [7]:
# %load_ext gvmagic
In [8]:
# %dotstr dg.print_dot(pocograph.document)
In [9]:
# %dotstr dg.print_dot(dg.read_conll(TUEBA_TEST_FILE))
In [10]:
# Write the document attached to the pocores result to
# /tmp/<document name>.pocores.conll, passing 'pocores:markable' as the
# markable layer.
dg.write_conll(
    pocograph.document,
    '/tmp/{}.pocores.conll'.format(pocograph.document.name),
    markable_layer='pocores:markable',
)
In [11]:
# Home-relative path of the ExportXML corpus file that the cells below read.
TUEBADZ8_FILE = os.path.expanduser(
    '~/corpora/tueba/TuebaDZ8.0/tuebadz-8.0-mit-NE+Anaphern+Diskurs.exml.xml'
)
In [12]:
# Open the ExportXML corpus file named by TUEBADZ8_FILE and keep the resulting
# corpus object as tueba_corpus.
tueba_corpus = dg.read_exportxml(TUEBADZ8_FILE)
In [13]:
def get_specific_tueba_document(tueba_filepath, text_id, debug=False):
    """Return a single text from an ExportXML corpus file.

    The corpus is always read with ``debug=True``; every item it yields is
    treated as an XML element whose namespaced ``id`` attribute is compared
    with ``text_id``. The first match is returned.

    Parameters
    ----------
    tueba_filepath : str
        Path of the ExportXML corpus file to search. For backward
        compatibility, any non-string value (e.g. an already opened corpus
        object) makes the function fall back to the notebook-level
        ``TUEBADZ8_FILE``, which is the file that was read unconditionally
        before this parameter was honoured.
    text_id : str
        ID of the wanted text, e.g. ``'text_29'``.
    debug : bool
        If True, return the matching XML element itself. Otherwise wrap it
        in an ``ExportXMLDocumentGraph``.

    Returns
    -------
    The matching XML element (``debug=True``) or an
    ``ExportXMLDocumentGraph`` built from it (``debug=False``).

    Raises
    ------
    ValueError
        If no text in the corpus file carries the given ID.
    """
    # (str, unicode) on Python 2; (str, str) on Python 3.
    string_types = (type(''), type(u''))
    if isinstance(tueba_filepath, string_types):
        corpus_path = tueba_filepath
    else:
        # Legacy call style: something other than a path was passed in.
        corpus_path = TUEBADZ8_FILE

    id_attrib = dg.readwrite.exportxml.add_ns('id')
    # NOTE: this reader call is deliberately always made with debug=True,
    # independent of this function's own `debug` flag, because the loop below
    # needs elements exposing `.attrib`.
    for text_element in dg.read_exportxml(corpus_path, debug=True):
        if text_element.attrib[id_attrib] != text_id:
            continue
        if debug:
            return text_element
        return dg.readwrite.exportxml.ExportXMLDocumentGraph(text_element)

    raise ValueError(
        "There's no text with ID: {} in the corpus file: {}".format(
            text_id, corpus_path))
In [14]:
# Load text_29 as a document graph. The first argument is the corpus *file
# path* (as in the text_922 cells), not the already opened corpus object.
text29 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_29')
In [ ]:
# Export the text_29 document graph to /tmp/text_29.conll.
dg.write_conll(text29, '/tmp/text_29.conll')
In [ ]:
# Ask NetworkX whether the text_29 graph is a directed acyclic graph; the
# boolean result is the cell's output.
nx.is_directed_acyclic_graph(text29)
In [ ]:
# Ask NetworkX whether the text_29 graph is directed; the boolean result is
# the cell's output.
nx.is_directed(text29)
In [ ]:
# Copy the edges of text_29 into a plain MultiDiGraph and print every cycle
# that nx.simple_cycles() finds in it.
# According to the NetworkX documentation, simple_cycles() returns a generator
# that produces the elementary cycles of the graph; each cycle is a list of
# nodes with the first and last nodes being the same.
text29_multidigraph = nx.MultiDiGraph(text29.edges_iter())
for cycle in nx.simple_cycles(text29_multidigraph):
    print(cycle)
In [ ]:
cycle = ['s503_505', 's503_532', 's503_531', 's503_507', 's503_506']
cycle_graph = []
for node in cycle:
print node, "has out edges: ", [trg for (src, trg) in text29.out_edges(node)]
print node, "has in edges: ", [src for (src, trg) in text29.in_edges(node)]
cycle_graph.extend(text29.out_edges(node))
cycle_graph.extend(text29.in_edges(node))
In [ ]:
# %dotstr dg.print_dot(nx.DiGraph(cycle_graph))
In [ ]:
# %load_ext gvmagic
In [ ]:
# Re-create the corpus object from the same ExportXML file as earlier in the
# notebook (presumably to start from a fresh corpus iterator -- confirm).
tueba_corpus = dg.read_exportxml(TUEBADZ8_FILE)
In [ ]:
# Load text_922 as a document graph (debug is left at its default, False).
text922 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_922')
In [ ]:
# Fetch the same text again with debug=True, which returns the matching XML
# element itself instead of a document graph.
text_element922 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_922', debug=True)
In [ ]:
from lxml import etree

# Serialize the raw text_922 element to /tmp/text_922.xml.
# etree.tostring() returns an encoded byte string by default, so the file is
# opened in binary mode. This avoids newline translation on platforms that
# distinguish text from binary files and keeps working where bytes cannot be
# written to a text-mode file.
with open('/tmp/text_922.xml', 'wb') as tfile:
    tfile.write(etree.tostring(text_element922))
In [ ]:
def get_cyclic_subgraphs(docgraph):
    """Yield one subgraph per simple cycle found among a graph's edges.

    A ``MultiDiGraph`` is built from ``docgraph.edges_iter()``. For every
    cycle that ``nx.simple_cycles()`` reports on it, the subgraph of that
    MultiDiGraph restricted to the cycle's nodes is yielded.

    Parameters
    ----------
    docgraph : graph
        any graph object that provides ``edges_iter()``

    Yields
    ------
    The ``subgraph()`` of the edge-only MultiDiGraph for each cycle.
    """
    edge_graph = nx.MultiDiGraph(docgraph.edges_iter())
    for cycle_nodes in nx.simple_cycles(edge_graph):
        yield edge_graph.subgraph(cycle_nodes)
In [ ]:
for subgraph in get_cyclic_subgraphs(text922):
%dotstr dg.print_dot(subgraph)
In [ ]:
# Select the edges of text_922 whose edge type is
# dg.EdgeTypes.pointing_relation.
pointing_relations = dg.select_edges_by(text922, edge_type=dg.EdgeTypes.pointing_relation)
In [ ]:
# defaultdict is not imported anywhere else in this notebook, so import it
# here to keep the cell runnable on a fresh kernel.
from collections import defaultdict

# Map every pointing-relation source node ID to the set of its target node IDs.
rel_dict = defaultdict(set)
for src_id, trg_id in pointing_relations:
    rel_dict[src_id].add(trg_id)
In [ ]:
# dg.discoursegraph.__walk_chain(rel_dict, 's19293_501')
In [ ]:
# for doc in tueba_corpus:
# dg.write_conll(doc, '/tmp/{}.tueba.conll'.format(doc.name))