notebook.community

Edit and run



In [3]:

    
import os
from collections import defaultdict

import networkx as nx

import discoursegraphs as dg
import pocores as pc

# Directory with the mate-parsed Tueba files (below the user's home directory)
# and the single document from it that the PoCoRes run below operates on.
MATE_PARSED_TUEBA_DIR = os.path.expanduser(
    '~/corpora/tueba/tueba_mate_output')
TUEBA_TEST_FILE = os.path.join(
    MATE_PARSED_TUEBA_DIR,
    'text_104.bp')



In [4]:

    
# Run PoCoRes on the test document; its output is written to /tmp, named
# after the input file. input_format='2009' presumably selects the
# CoNLL-2009 column layout -- TODO confirm against the pocores docs.
pocores_output_path = '/tmp/{}.pocores'.format(os.path.basename(TUEBA_TEST_FILE))
pocograph = pc.run_pocores(
    TUEBA_TEST_FILE,
    input_format='2009',
    output_dest=pocores_output_path)



In [5]:

    
# Show the document text with the markables found by PoCoRes put in
# brackets, each followed by an id subscript (see the output below).
bracketed_text = pc.main.output_with_brackets(pocograph)
print(bracketed_text)









    



[Erneute Kritik am Gleichberechtigungsgesetz]_{s1_t2} 
[Scharfe Kritik am Gleichstellungsgesetz für [Behinderte]_{s2_t6}]_{s1_t2} hat [der Landesgeschäftsführer des Sozialverbandes VDK , Berndt Mayer ,]_{s2_t9} geäußert . 
Das Gesetz sei " zahnlos " , unterstrich [er]_{s2_t9} am Mittwoch abend vor [rund 800 Behinderten]_{s2_t6} in der Kreuzberger Passionskirche . 
Das Diskriminierungsverbot sei lediglich als " Programmsatz " enthalten . 
Deshalb könnten daraus keine Rechte eingeklagt werden . 
ADN



In [6]:

    
# Read one syntax file of the merged PCC corpus (presumably TIGER-XML, going
# by the reader's name). The path is resolved relative to the current user's
# home directory -- like the corpus paths above -- instead of hardcoding one
# particular user's home.
# NOTE(review): `tdg` is not used by any later cell visible in this notebook.
tdg = dg.read_tiger(
    os.path.expanduser('~/repos/pcc-annis-merged/maz176/syntax/maz-10207.xml'))



In [7]:

    
# %load_ext gvmagic



In [8]:

    
# %dotstr dg.print_dot(pocograph.document)



In [9]:

    
# %dotstr dg.print_dot(dg.read_conll(TUEBA_TEST_FILE))



In [10]:

    
# Export the PoCoRes document graph as a CoNLL file in /tmp, named after the
# document. markable_layer names the layer whose markables are to be
# exported -- presumably as the coreference column; verify in discoursegraphs.
pocores_conll_path = '/tmp/{}.pocores.conll'.format(pocograph.document.name)
dg.write_conll(
    pocograph.document,
    pocores_conll_path,
    markable_layer='pocores:markable')



In [11]:

    
# ExportXML release of Tueba-D/Z 8.0 (file name says: with NE, anaphora and
# discourse annotation), located below the user's home directory.
TUEBADZ8_FILE = os.path.expanduser('~/corpora/tueba/TuebaDZ8.0/tuebadz-8.0-mit-NE+Anaphern+Diskurs.exml.xml')



In [12]:

    
# Open the whole Tueba ExportXML corpus file.
# NOTE(review): the result is presumably an iterable over the corpus'
# documents (a commented-out loop at the end of the notebook iterates it);
# confirm against the discoursegraphs documentation.
tueba_corpus = dg.read_exportxml(TUEBADZ8_FILE)



In [13]:

    
def get_specific_tueba_document(tueba_filepath, text_id, debug=False):
    """Fetch one text from a Tueba ExportXML corpus file by its ID.

    Parameters
    ----------
    tueba_filepath : str
        Path to the ExportXML corpus file to search. For backward
        compatibility with an existing call in this notebook that passes
        an already parsed corpus object, any non-string value falls back
        to the module-level ``TUEBADZ8_FILE``.
    text_id : str
        ID of the wanted text, e.g. ``'text_29'``.
    debug : bool
        If True, return the raw text element instead of a document graph.

    Returns
    -------
    The matching text element (``debug=True``) or an
    ``ExportXMLDocumentGraph`` built from it (``debug=False``).

    Raises
    ------
    ValueError
        If the corpus file contains no text with the given ID.
    """
    # (str, unicode) under Python 2, (str, str) under Python 3.
    string_types = (str, type(u''))
    if isinstance(tueba_filepath, string_types):
        corpus_path = tueba_filepath
    else:
        corpus_path = TUEBADZ8_FILE

    exportxml = dg.readwrite.exportxml
    # The namespaced name of the 'id' attribute is the same for every text.
    id_attrib = exportxml.add_ns('id')

    # The corpus is always read with debug=True here, independent of this
    # function's own `debug` flag: the loop needs elements that expose
    # `.attrib` and can be handed to ExportXMLDocumentGraph.
    for text_element in dg.read_exportxml(corpus_path, debug=True):
        if text_element.attrib[id_attrib] != text_id:
            continue
        if debug:
            return text_element
        return exportxml.ExportXMLDocumentGraph(text_element)

    raise ValueError("There's no text with ID: {} in the corpus file: {}".format(text_id, corpus_path))



In [14]:

    
# Extract text_29 as a document graph. The first argument has to be the
# path of the corpus file (as in the text_922 calls further down), not the
# already parsed `tueba_corpus` object.
# NOTE(review): the AttributeError recorded below this cell is raised inside
# discoursegraphs 0.1.2 (add_secedge reads `ignore_seceges`, while the
# constructor parameter is spelled `ignore_secedges`), so this cell also
# needs a library version in which that attribute name is fixed.
text29 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_29')









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-c959e5fadfc9> in <module>()
----> 1 text29 = get_specific_tueba_document(tueba_corpus, 'text_29')

<ipython-input-13-8178c94595c1> in get_specific_tueba_document(tueba_filepath, text_id, debug)
      6                 return text_element
      7             else:
----> 8                 return dg.readwrite.exportxml.ExportXMLDocumentGraph(text_element)
      9     raise ValueError("There's no text with ID: {} in the corpus file: {}".format(text_id, tueba_filepath))

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/exportxml.pyc in __init__(self, text_element, name, namespace, precedence, ignore_relations, ignore_secedges)
    140         }
    141 
--> 142         self.parse_descedant_elements(text_element)
    143 
    144     def parse_child_elements(self, element):

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/exportxml.pyc in parse_descedant_elements(self, element)
    150         '''parses all descendants of an etree element'''
    151         for descendant in element.iterdescendants():
--> 152             self.parsers[descendant.tag](descendant)
    153 
    154     def add_connective(self, connective):

/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/exportxml.pyc in add_secedge(self, secedge)
    420            </node>
    421         """
--> 422         if self.ignore_seceges is False:
    423             edge_source = self.get_parent_id(secedge)
    424             edge_target = self.get_element_id(secedge)

AttributeError: 'ExportXMLDocumentGraph' object has no attribute 'ignore_seceges'



In [ ]:

    
# Export the text_29 document graph as a CoNLL file in /tmp.
dg.write_conll(text29, '/tmp/text_29.conll')



In [ ]:

    
# Is the text_29 graph a DAG? (The cells below go looking for its cycles.)
nx.is_directed_acyclic_graph(text29)



In [ ]:

    
# Sanity check: the document graph should at least be a directed graph.
nx.is_directed(text29)



In [ ]:

    
# Rebuild text_29 as a plain MultiDiGraph from its edges only and print
# every elementary cycle found in it.
#
# nx.simple_cycles() is a generator that produces elementary cycles of the
# graph. Each cycle is a list of nodes with the first and last nodes being
# the same.
text29_multidigraph = nx.MultiDiGraph(text29.edges_iter())
for cycle in nx.simple_cycles(text29_multidigraph):
    print(cycle)



In [ ]:

    
cycle = ['s503_505', 's503_532', 's503_531', 's503_507', 's503_506']

cycle_graph = []
for node in cycle:
    print node, "has out edges: ", [trg for (src, trg) in text29.out_edges(node)]
    print node, "has in edges: ", [src for (src, trg) in text29.in_edges(node)]
    cycle_graph.extend(text29.out_edges(node))
    cycle_graph.extend(text29.in_edges(node))



In [ ]:

    
# %dotstr dg.print_dot(nx.DiGraph(cycle_graph))



In [ ]:

    
# %load_ext gvmagic



In [ ]:

    
# Open the corpus file again; this rebinds the `tueba_corpus` created by an
# identical call earlier in the notebook.
# NOTE(review): presumably needed because the corpus object cannot be
# iterated twice -- confirm, otherwise this cell is redundant.
tueba_corpus = dg.read_exportxml(TUEBADZ8_FILE)



In [ ]:

    
# Extract text_922 from the corpus file as a document graph.
text922 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_922')



In [ ]:

    
# Same text again, but with debug=True: this returns the raw text element
# instead of a document graph (it is serialized with lxml in the next cell).
text_element922 = get_specific_tueba_document(TUEBADZ8_FILE, 'text_922', debug=True)



In [ ]:

    
from lxml import etree

# Dump the raw text_922 element to /tmp as an XML file for inspection.
with open('/tmp/text_922.xml', 'w') as xml_file:
    serialized_text = etree.tostring(text_element922)
    xml_file.write(serialized_text)



In [ ]:

    
def get_cyclic_subgraphs(docgraph):
    """Yield one subgraph per elementary cycle of a document graph.

    The edges of ``docgraph`` are copied into a fresh ``nx.MultiDiGraph``.
    For every cycle that ``nx.simple_cycles`` reports on that copy, the
    subgraph of the copy induced by the cycle's nodes is yielded.

    Parameters
    ----------
    docgraph : graph providing ``edges_iter()``
        The document graph to search for cycles.
    """
    multi_digraph = nx.MultiDiGraph(docgraph.edges_iter())
    for cycle_nodes in nx.simple_cycles(multi_digraph):
        yield multi_digraph.subgraph(cycle_nodes)



In [ ]:

    
for subgraph in get_cyclic_subgraphs(text922):
    %dotstr dg.print_dot(subgraph)



In [ ]:

    
# Select the edges of text_922 whose edge type is `pointing_relation`.
# The next cell unpacks each selected edge as a (source id, target id) pair.
pointing_relations = dg.select_edges_by(text922, edge_type=dg.EdgeTypes.pointing_relation)



In [ ]:

    
# Map the source node of each pointing relation to the set of all nodes it
# points to. `defaultdict` comes from the standard library's `collections`
# module and has to be imported in the notebook's import cell.
rel_dict = defaultdict(set)
for src_id, trg_id in pointing_relations:
    rel_dict[src_id].add(trg_id)



In [ ]:

    
# dg.discoursegraph.__walk_chain(rel_dict, 's19293_501')



In [ ]:

    
# for doc in tueba_corpus:
#     dg.write_conll(doc, '/tmp/{}.tueba.conll'.format(doc.name))