In [1]:
from cStringIO import StringIO
import os
import tempfile

import pattern
from pattern.de import parsetree
from discoursegraphs import DiscourseDocumentGraph, EdgeTypes
import discoursegraphs as dg

In [22]:
class PatternDiscourseGraph(DiscourseDocumentGraph):
    """
    represents a plaintext document as a discourse graph
    (parsed with the ``pattern`` library).

    Attributes
    ----------
    name : str
        name of the graph (defaults to the basename of the input file)
    parsed : pattern.text.tree.Text
        the ``pattern`` parse tree of the document
    ns : str
        namespace prefix used for node IDs and layer names
    sentences : list of str
        IDs of all sentence nodes, in document order
    tokens : list of str
        IDs of all token nodes, in document order
    """
    def __init__(self, plaintext_fpath, name=None, namespace='pattern'):
        # super calls __init__() of base class DiscourseDocumentGraph
        super(PatternDiscourseGraph, self).__init__()

        self.name = name if name else os.path.basename(plaintext_fpath)
        with open(plaintext_fpath) as plaintext_file:
            self.parsed = parsetree(plaintext_file.read())

        self.ns = namespace
        self.root = self.ns+':root_node'
        self.add_node(self.root, layers={self.ns}, label=self.ns+':root_node')

        self.sentences = []
        self.tokens = []
        for i, sentence in enumerate(self.parsed):
            self.add_sentence(sentence, i)

    def add_sentence(self, sentence, index):
        """
        Adds a sentence node (dominated by the root node) to the graph,
        then adds a node for each Word / Chunk the sentence contains.
        """
        token_index = 0
        chunk_index = 0
        sent_id = 'sent_{}'.format(index)
        self.add_node(sent_id, layers={self.ns+':syntax'})
        self.add_edge(self.root, sent_id, edge_type=EdgeTypes.dominance_relation)
        self.sentences.append(sent_id)

        for elem in sentence:
            if isinstance(elem, pattern.text.tree.Word):
                self.add_token(elem, index, token_index)
                token_index += 1
            else:  # Chunk
                self.add_chunk(elem, index, chunk_index)
                chunk_index += 1

    def add_token(self, token, sent_index, token_index):
        """Adds a token node to the graph and records its ID."""
        token_id = 'tok_s{}_{}'.format(sent_index, token_index)
        self.add_node(token_id, layers={self.ns+':token'})
        # FIX: record the token ID in self.tokens; the list was
        # initialized in __init__ but never populated (cf. self.sentences).
        self.tokens.append(token_id)

    def add_chunk(self, chunk, sent_index, chunk_index):
        """Adds a chunk node to the graph. Not implemented yet."""
        raise NotImplementedError

    @classmethod
    def from_string(cls, plaintext):
        """
        Creates a PatternDiscourseGraph from a plaintext string by
        writing it to a temporary file and parsing that file.

        FIX: the temporary file is removed after parsing; with
        ``delete=False`` alone it was leaked on disk.
        """
        tfile = tempfile.NamedTemporaryFile(delete=False)
        try:
            tfile.write(plaintext)
            tfile.close()
            # __init__ reads the file before returning, so it is
            # safe to unlink it in the finally clause below.
            return cls(tfile.name)
        finally:
            os.unlink(tfile.name)

In [23]:
pdg = PatternDiscourseGraph.from_string("Angela Merkel ist gestern in Griechenland gelandet. Bernd ist noch dümmer als Angie.")

In [24]:
# Inspect the elements of the second parsed sentence (all Words here).
for elem in pdg.parsed[1]:
    print type(elem), elem


<class 'pattern.text.tree.Word'> Word(u'Bernd/NN')
<class 'pattern.text.tree.Word'> Word(u'ist/VB')
<class 'pattern.text.tree.Word'> Word(u'noch/IN')
<class 'pattern.text.tree.Word'> Word(u'd\xfcmmer/NN')
<class 'pattern.text.tree.Word'> Word(u'als/IN')
<class 'pattern.text.tree.Word'> Word(u'Angie/NN')
<class 'pattern.text.tree.Word'> Word(u'./.')

In [25]:
ist = pdg.parsed[0][2]

In [26]:
s0 = pdg.parsed[0]

In [27]:
s0.constituents()


Out[27]:
[Chunk('Angela Merkel/NP'),
 Chunk('ist/VP'),
 Chunk('gestern/ADVP'),
 Chunk('in/PP'),
 Chunk('Griechenland/NP'),
 Word(u'gelandet/RP'),
 Word(u'./.')]

In [28]:
s0.lemmata


Out[28]:
[None, None, None, None, None, None, None, None]

In [29]:
# Iterating a Chunk yields its Words.
for elem in s0.constituents()[0]:
    print elem


Word(u'Angela/NNP')
Word(u'Merkel/NNP')

In [32]:
# %load_ext gvmagic

In [33]:
# %dotstr dg.print_dot(pdg)

In [12]:
# Check how the tokenizer handles abbreviations and scores like "3:0"
# (note: "Der 1." ends up in a separate sentence, see output below).
for elem in parsetree("Der 1. FC Wuppertal hat 3:0 verloren.")[1]:
    print elem


Word(u'FC/NN')
Word(u'Wuppertal/NNP')
Word(u'hat/VB')
Word(u'3:0/CD')
Word(u'verloren/JJ')
Word(u'./.')

In [13]:
# Re-parse with grammatical relations and lemmata enabled.
s = parsetree(
    "Der SV Werder Bremen hat gestern 3:0 auf dem Feld verloren. Angela Merkel interressiert das nicht.",
    relations=True, lemmata=True)

In [14]:
for sent in s:
    for constituent in sent.constituents(pnp=True):
        print constituent
        if isinstance(constituent, pattern.text.tree.Chunk):
            print '\t', [elem for elem in constituent]
        
#     for chunk in sent.chunks:
#         print chunk.type, [(w.string, w.type) for w in chunk.words]
    print


Chunk('Der SV Werder Bremen/NP-SBJ-1')
	[Word(u'Der/DT'), Word(u'SV/PRP$'), Word(u'Werder/NN'), Word(u'Bremen/NN')]
Chunk('hat/VP-1')
	[Word(u'hat/VB')]
Chunk('gestern/ADVP')
	[Word(u'gestern/RB')]
Word(u'3:0/CD')
Chunk('auf dem Feld/PNP')
	[Word(u'auf/IN'), Word(u'dem/DT'), Word(u'Feld/NN')]
Chunk('verloren/ADJP')
	[Word(u'verloren/JJ')]
Word(u'./.')

Chunk('Angela Merkel interressiert/NP')
	[Word(u'Angela/NNP'), Word(u'Merkel/NNP'), Word(u'interressiert/NNP')]
Word(u'das/DT')
Chunk('nicht/ADVP')
	[Word(u'nicht/RB')]
Word(u'./.')


In [15]:
print s[0].xml


<sentence id="5" token="word, part-of-speech, chunk, preposition, relation, lemma" language="de">
	<chunk type="NP" relation="SBJ" of="5.1">
		<word type="DT" lemma="der">Der</word>
		<word type="PRP$" lemma="sv">SV</word>
		<word type="NN" lemma="werder">Werder</word>
		<word type="NN" lemma="bremen">Bremen</word>
	</chunk>
	<chunk type="VP" id="5.1">
		<word type="VB" lemma="haben">hat</word>
	</chunk>
	<chunk type="ADVP">
		<word type="RB" lemma="gestern">gestern</word>
	</chunk>
	<chink>
		<word type="CD" lemma="3:0">3:0</word>
	</chink>
	<chunk type="PNP">
		<chunk type="PP">
			<word type="IN" lemma="auf">auf</word>
		</chunk>
		<chunk type="NP">
			<word type="DT" lemma="dem">dem</word>
			<word type="NN" lemma="feld">Feld</word>
		</chunk>
	</chunk>
	<chunk type="ADJP">
		<word type="JJ" lemma="verlor">verloren</word>
	</chunk>
	<chink>
		<word type="." lemma=".">.</word>
	</chink>
</sentence>

In [16]:
sent = s[0]

In [17]:
sent.relations


Out[17]:
{'OBJ': {},
 'SBJ': {1: Chunk('Der SV Werder Bremen/NP-SBJ-1')},
 'VP': {1: Chunk('hat/VP-1')}}

In [18]:
c = sent.relations['SBJ'][1]

In [19]:
c.sentence


Out[19]:
Sentence('Der/DT/B-NP/O/NP-SBJ-1/der SV/PRP$/I-NP/O/NP-SBJ-1/sv Werder/NN/I-NP/O/NP-SBJ-1/werder Bremen/NN/I-NP/O/NP-SBJ-1/bremen hat/VB/B-VP/O/VP-1/haben gestern/RB/B-ADVP/O/O/gestern 3:0/CD/O/O/O/3:0 auf/IN/B-PP/B-PNP/O/auf dem/DT/B-NP/I-PNP/O/dem Feld/NN/I-NP/I-PNP/O/feld verloren/JJ/B-ADJP/O/O/verlor ././O/O/O/.')

In [20]:
sent.pnp


Out[20]:
[Chunk('auf dem Feld/PNP')]

In [21]:
pattern.text.tree.nltk_tree(sent)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-efe8ef8ccf39> in <module>()
----> 1 pattern.text.tree.nltk_tree(sent)

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/pattern/text/tree.pyc in nltk_tree(sentence)
   1586             v.append(ch.pnp)
   1587     T.append(')')
-> 1588     return tree.bracket_parse(' '.join(T))
   1589 
   1590 ### GRAPHVIZ DOT ###################################################################################

/home/arne/.virtualenvs/notebook/local/lib/python2.7/site-packages/nltk-3.0.4-py2.7.egg/nltk/tree.pyc in bracket_parse(s)
   1488     Use Tree.read(s, remove_empty_top_bracketing=True) instead.
   1489     """
-> 1490     raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.")
   1491 
   1492 def sinica_parse(s):

NameError: Use Tree.read(s, remove_empty_top_bracketing=True) instead.

In [40]:
hans_kennt_maria = parsetree("Hans kennt Maria.", relations=True)

In [45]:
# Signature: pattern.text.tree.nltk_tree(sentence)

from nltk.tree import Tree
from pattern.text import Chink

def nltk_tree(sentence):
    """ Returns an NLTK nltk.tree.Tree object from the given Sentence.
        The NLTK module should be on the search path somewhere.
        Adapted from pattern.text.tree.nltk_tree(), but builds the tree
        with Tree.fromstring() instead of the removed bracket_parse().
    """
    def bracket_pnp(pnp):
        # PNPChunk (with its contained Chunks) in NLTK bracket format.
        inner = ' '.join(bracket_chunk(c) for c in pnp.chunks)
        return '(PNP %s)' % inner

    def bracket_chunk(chunk):
        # Chunk in NLTK bracket format; recurses into attached PNPs.
        words = ' '.join('(%s %s)' % (w.pos, w.string) for w in chunk.words)
        attached = ' '.join(bracket_pnp(p) for p in chunk.attachments)
        return '(%s %s)' % (chunk.type, words + attached)

    parts = ['(S']
    emitted_pnps = []  # PNPs already serialized
    for chunk in sentence.chunked():
        if not chunk.pnp and isinstance(chunk, Chink):
            parts.append('(%s %s)' % (chunk.words[0].pos, chunk.words[0].string))
        elif not chunk.pnp:
            parts.append(bracket_chunk(chunk))
        elif chunk.pnp.anchor is None and chunk.pnp not in emitted_pnps:
            # The chunk is part of a PNP without an anchor; emit the
            # whole PNP once, when its first chunk is encountered.
            parts.append(bracket_pnp(chunk.pnp))
            emitted_pnps.append(chunk.pnp)
    parts.append(')')
    return Tree.fromstring(' '.join(parts))

In [46]:
nltk_tree(sent)


Out[46]:

In [47]:
sent.relations


Out[47]:
{'OBJ': {},
 'SBJ': {1: Chunk('Der SV Werder Bremen/NP-SBJ-1')},
 'VP': {1: Chunk('hat/VP-1')}}

In [48]:
sent.chunked()


Out[48]:
[Chunk('Der SV Werder Bremen/NP-SBJ-1'),
 Chunk('hat/VP-1'),
 Chunk('gestern/ADVP'),
 Chink('3:0/O'),
 Chunk('auf/PP'),
 Chunk('dem Feld/NP'),
 Chunk('verloren/ADJP'),
 Chink('./O')]

In [51]:
# Another example sentence converted with the fixed nltk_tree().
parsed = parsetree("Der SV Werder Bremen hat gestern in der Allianz-Arena 3:0 gegen Mannheim verloren.")
sent0 = parsed[0]

nltk_tree(sent0)


Out[51]:

In [52]:
sent0.relations


Out[52]:
{'OBJ': {}, 'SBJ': {}, 'VP': {None: Chunk('hat/VP')}}

In [ ]: