In [1]:
from cStringIO import StringIO
import os
import tempfile
import pattern
from pattern.de import parsetree
from discoursegraphs import DiscourseDocumentGraph, EdgeTypes
import discoursegraphs as dg
In [22]:
class PatternDiscourseGraph(DiscourseDocumentGraph):
    """
    Represents a plaintext document as a discourse graph
    (parsed with the ``pattern`` library).

    Attributes
    ----------
    name : str
        name of the document (defaults to the input file's basename)
    ns : str
        namespace prefix used for all node IDs / layers added by this class
    parsed : pattern parse tree
        result of ``pattern.de.parsetree()`` on the file's content
    root : str
        node ID of the graph's root node
    sentences : list of str
        node IDs of all sentence nodes, in document order
    tokens : list of str
        node IDs of all token nodes, in document order
    """
    def __init__(self, plaintext_fpath, name=None, namespace='pattern'):
        """
        Parameters
        ----------
        plaintext_fpath : str
            path to the plaintext file to be parsed
        name : str or None
            name of the document (default: basename of ``plaintext_fpath``)
        namespace : str
            namespace prefix for nodes and layers (default: ``'pattern'``)
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(PatternDiscourseGraph, self).__init__()
        self.name = name if name else os.path.basename(plaintext_fpath)
        with open(plaintext_fpath) as plaintext_file:
            self.parsed = parsetree(plaintext_file.read())
        self.ns = namespace
        self.root = self.ns+':root_node'
        self.add_node(self.root, layers={self.ns}, label=self.ns+':root_node')
        self.sentences = []
        self.tokens = []
        for i, sentence in enumerate(self.parsed):
            self.add_sentence(sentence, i)

    def add_sentence(self, sentence, index):
        """
        Adds a sentence node to the graph (dominated by the root node) and
        adds all of the sentence's elements (words / chunks) to the graph.

        Parameters
        ----------
        sentence : pattern sentence
            a parsed sentence from ``self.parsed``
        index : int
            index of the sentence within the document
        """
        token_index = 0
        chunk_index = 0
        sent_id = 'sent_{}'.format(index)
        self.add_node(sent_id, layers={self.ns+':syntax'})
        self.add_edge(self.root, sent_id, edge_type=EdgeTypes.dominance_relation)
        self.sentences.append(sent_id)
        for elem in sentence:
            if isinstance(elem, pattern.text.tree.Word):
                self.add_token(elem, index, token_index)
                token_index += 1
            else:  # Chunk
                self.add_chunk(elem, index, chunk_index)
                chunk_index += 1

    def add_token(self, token, sent_index, token_index):
        """
        Adds a token node to the graph.

        Parameters
        ----------
        token : pattern.text.tree.Word
            the token to add
        sent_index : int
            index of the sentence the token belongs to
        token_index : int
            index of the token within its sentence

        NOTE(review): still incomplete -- the token node is not yet connected
        to its sentence node (dominance edge), and the token's surface string
        is not stored on the node. TODO confirm intended representation.
        """
        token_id = 'tok_s{}_{}'.format(sent_index, token_index)
        self.add_node(token_id, layers={self.ns+':token'})
        # fix: ``self.tokens`` was initialized in __init__ but never filled
        self.tokens.append(token_id)

    def add_chunk(self, chunk, sent_index, chunk_index):
        """
        Adds a chunk (e.g. an NP or PP) to the graph. Not implemented yet.

        Raises
        ------
        NotImplementedError
            always -- chunk handling has not been written yet
        """
        raise NotImplementedError

    @classmethod
    def from_string(cls, plaintext):
        """
        Creates a discourse graph from a plaintext string (instead of a
        file path) by writing the string to a temporary file.

        Parameters
        ----------
        plaintext : str
            the document text to parse

        Returns
        -------
        PatternDiscourseGraph
            a discourse graph built from the given text
        """
        # delete=False so the file can be reopened by name in __init__
        # (required on Windows, where an open NamedTemporaryFile cannot
        # be opened a second time)
        tfile = tempfile.NamedTemporaryFile(delete=False)
        try:
            tfile.write(plaintext)
        finally:
            tfile.close()
        try:
            return cls(tfile.name)
        finally:
            os.unlink(tfile.name)  # fix: don't leak the temporary file
In [23]:
# Build a discourse graph from a short two-sentence demo document.
pdg = PatternDiscourseGraph.from_string("Angela Merkel ist gestern in Griechenland gelandet. Bernd ist noch dümmer als Angie.")
In [24]:
# What does iterating over a parsed sentence yield?
# (Word and/or Chunk objects -- cf. the isinstance check in add_sentence.)
for elem in pdg.parsed[1]:
    print type(elem), elem
In [25]:
# grab a single token (3rd element of the 1st sentence) for inspection
ist = pdg.parsed[0][2]
In [26]:
# first parsed sentence
s0 = pdg.parsed[0]
In [27]:
# constituents of the sentence
s0.constituents()
Out[27]:
In [28]:
# lemmas of the sentence's tokens
s0.lemmata
Out[28]:
In [29]:
# iterate over the elements of the first constituent
for elem in s0.constituents()[0]:
    print elem
In [32]:
# %load_ext gvmagic
In [33]:
# %dotstr dg.print_dot(pdg)
In [12]:
# how does pattern handle ordinals ("1.") and scores ("3:0")?
# (index [1] -- presumably the parser splits this into more than one
# sentence at "1."; TODO confirm)
for elem in parsetree("Der 1. FC Wuppertal hat 3:0 verloren.")[1]:
    print elem
In [13]:
# parse with grammatical relations and lemmas enabled
s = parsetree(
    "Der SV Werder Bremen hat gestern 3:0 auf dem Feld verloren. Angela Merkel interressiert das nicht.",
    relations=True, lemmata=True)
In [14]:
# print all constituents (incl. prepositional noun phrases);
# for chunks, also print the list of contained elements
for sent in s:
    for constituent in sent.constituents(pnp=True):
        print constituent
        if isinstance(constituent, pattern.text.tree.Chunk):
            print '\t', [elem for elem in constituent]
        # for chunk in sent.chunks:
        #     print chunk.type, [(w.string, w.type) for w in chunk.words]
    print
In [15]:
# XML serialization of the first sentence
print s[0].xml
In [16]:
sent = s[0]
In [17]:
# grammatical relations extracted from the sentence
# (available because the tree was parsed with relations=True)
sent.relations
Out[17]:
In [18]:
# the chunk stored under key 1 of the 'SBJ' relation mapping
c = sent.relations['SBJ'][1]
In [19]:
# a chunk keeps a back-reference to the sentence it belongs to
c.sentence
Out[19]:
In [20]:
# prepositional noun phrases of the sentence
sent.pnp
Out[20]:
In [21]:
# pattern's built-in conversion of a Sentence to an NLTK tree
pattern.text.tree.nltk_tree(sent)
In [40]:
hans_kennt_maria = parsetree("Hans kennt Maria.", relations=True)
In [45]:
# Signature: pattern.text.tree.nltk_tree(sentence)
from nltk.tree import Tree
from pattern.text import Chink
def nltk_tree(sentence):
    """ Returns an NLTK nltk.tree.Tree object from the given Sentence.
        The NLTK module should be on the search path somewhere.

        NOTE(review): patched copy of pattern.text.tree.nltk_tree();
        the only change is the PNP condition (see comment below).
    """
    def do_pnp(pnp):
        # Returns the PNPChunk (and the contained Chunk objects) in NLTK bracket format.
        s = ' '.join([do_chunk(ch) for ch in pnp.chunks])
        return '(PNP %s)' % s

    def do_chunk(ch):
        # Returns the Chunk in NLTK bracket format. Recurse attached PNP's.
        s = ' '.join(['(%s %s)' % (w.pos, w.string) for w in ch.words])
        s += ' '.join([do_pnp(pnp) for pnp in ch.attachments])
        return '(%s %s)' % (ch.type, s)

    T = ['(S']  # bracket-format fragments; joined into one string at the end
    v = []      # PNP's already visited.
    for ch in sentence.chunked():
        if not ch.pnp and isinstance(ch, Chink):
            # chink (word outside any chunk): emit a single (POS word) node
            T.append('(%s %s)' % (ch.words[0].pos, ch.words[0].string))
        elif not ch.pnp:
            T.append(do_chunk(ch))
        #elif ch.pnp not in v:
        elif ch.pnp.anchor is None and ch.pnp not in v:
            # The chunk is part of a PNP without an anchor.
            # NOTE(review): changed from pattern's original condition (the
            # commented-out line above) -- presumably because anchored PNPs
            # are already emitted via do_chunk()'s recursion into
            # ch.attachments, and emitting them here would duplicate them.
            # TODO confirm against pattern's anchor semantics.
            T.append(do_pnp(ch.pnp))
            v.append(ch.pnp)
    T.append(')')
    tree_string = ' '.join(T)
    return Tree.fromstring(tree_string)
In [46]:
# try the patched conversion on the Werder Bremen sentence from In [13]
nltk_tree(sent)
Out[46]:
In [47]:
sent.relations
Out[47]:
In [48]:
# flat chunk/chink segmentation -- the same iteration nltk_tree() uses
sent.chunked()
Out[48]:
In [51]:
# another example sentence; parsed without relations/lemmata this time
parsed = parsetree("Der SV Werder Bremen hat gestern in der Allianz-Arena 3:0 gegen Mannheim verloren.")
sent0 = parsed[0]
nltk_tree(sent0)
Out[51]:
In [52]:
sent0.relations
Out[52]:
In [ ]: