In [3]:
location = [('Omnicom','IN','New York'), ('DBB','IN','New York'), ('BBDO','IN','Atlanta')]
[e1 for (e1, rel, e2) in location if rel == 'IN' and e2 == 'New York']
Out[3]:
In [1]:
import nltk
from nltk.corpus import ieer
Information extraction is generally broken into a pipeline of steps: sentence segmentation, tokenization, part-of-speech tagging, entity detection, and relation detection.
The first three steps are not the focus of this chapter, so they are combined into a single function below. Next, proper names and definite noun phrases are identified as candidate entities. Finally, likely relation patterns are extracted from pairs of entities.
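As a rough orientation before the details, here is a minimal end-to-end sketch of that pipeline. The example sentence is hypothetical, and NLTK's pre-trained ne_chunk stands in for the entity-detection step that the rest of the chapter builds by hand:
In [ ]:
import nltk

text = "Omnicom opened a new office in New York."  # hypothetical example text
# Steps 1-3: sentence segmentation, tokenization, POS tagging
sents = [nltk.pos_tag(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(text)]
# Step 4: entity detection, here using NLTK's pre-trained named entity chunker
for sent in sents:
    print(nltk.ne_chunk(sent))
# Step 5 (relation detection) is shown at the end of the chapter with nltk.sem.extract_rels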
In [66]:
def preprocess(document, tagset="universal"):
    sent = nltk.sent_tokenize(document)
    sent = [nltk.word_tokenize(s) for s in sent]
    sent = [nltk.pos_tag(s, tagset=tagset) for s in sent]
    return sent
In [3]:
sentence = preprocess('the little yellow dog barked at the cat')[0]
sentence
Out[3]:
In [4]:
# RegexpParser: combine POS sequences that match the pattern into a new chunk node
grammar = "NP: {<DET>?<ADJ>*<NOUN>}"
cp = nltk.RegexpParser(grammar)
cp.parse(sentence)
Out[4]:
In [5]:
# Improved rules:
# (determiner or possessive pronoun) + adjectives + noun, e.g. "The black fat cat"
# consecutive (proper) nouns, e.g. "Mary Brown"
grammar = r"""
NP: {<DET|PRON>?<ADJ>*<NOUN>}
{<NOUN>+}
"""
cp = nltk.RegexpParser(grammar)
sentence = preprocess("Rapunzel let down her long golden hair")[0]
cp.parse(sentence)
Out[5]:
In [6]:
def find_chunks(user_rule='{<V.*><TO><V.*>}'):
    cp = nltk.RegexpParser('CHUNK: ' + user_rule)
    for sent in nltk.corpus.brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                yield subtree
In [7]:
v_to_v = list(find_chunks())
In [8]:
v_to_v[1]
Out[8]:
In [9]:
n4 = list(find_chunks('{<N.*><N.*><N.*><N.*>+}'))
In [10]:
n4[2]
Out[10]:
In [11]:
grammar = r"""
NP: {<.*>+}          # chunk everything
    }<VERB|ADP>+{    # chink sequences of VERB and ADP
"""
sentence = preprocess('the little yellow dog barked at the cat')[0]
cp = nltk.RegexpParser(grammar)
cp.parse(sentence)
Out[11]:
In [12]:
conll_example = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''
In [13]:
nltk.chunk.conllstr2tree(conll_example)
Out[13]:
In [14]:
# conll2000 contains Wall Street Journal text that is already annotated with IOB tags
from nltk.corpus import conll2000
conll2000.chunked_sents()[99]
Out[14]:
conll2000 contains three types of chunks: NP chunks (e.g. the black dog), VP chunks (e.g. has already known), and PP chunks (e.g. because of). If only NP chunks are of interest, the chunk types to read can be specified:
In [15]:
conll2000.chunked_sents(chunk_types=['NP'])[99]
Out[15]:
In [16]:
# The corpus is already split into two datasets (train and test)
conll2000.fileids()
Out[16]:
In [17]:
# First try an empty parser as a baseline
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
Since the empty parser never tags anything, it is still correct wherever there is no NP, so accuracy is already about 43%.
But because there are no true positives at all, precision and recall are both 0.
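For reference, evaluate() returns a ChunkScore object, so the numbers behind the printed summary can also be read individually. A minimal sketch, assuming the cp and test_sents defined above:
In [ ]:
score = cp.evaluate(test_sents)
print(score.accuracy())   # roughly 0.43 for the empty parser
print(score.precision())  # 0.0: no chunks are ever proposed
print(score.recall())     # 0.0
print(score.f_measure())  # 0.0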
In [18]:
# Try the simplest possible parser: chunk any run of tags starting with C, D, J, N, or P
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
To design your own chunker, subclass nltk.ChunkParserI and implement the __init__ and parse methods. For example, the chunker below uses a unigram model:
In [20]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        """
        train_sents: the training data, which must be given as chunk trees
        """
        # tree2conlltags converts each tree into (word, pos_tag, chunk_tag) tuples
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        # predict the chunk_tag from the pos_tag
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """
        sentence: a sentence as a list of (word, pos) pairs
        return: a conll chunk tree
        """
        pos_tags = [pos for word, pos in sentence]
        # predict the chunk_tag from the pos_tag alone, using no other information
        iob_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for pos, chunktag in iob_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
In [21]:
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
In [29]:
# preprocess above uses the universal tagset by default,
# but this chunker was trained on the original (non-universal) tags,
# so pass an empty string as the second argument to skip the tag mapping
sentence = preprocess('the little yellow dog barked at the cat', '')[0]
unigram_chunker.parse(sentence)
Out[29]:
In [40]:
# This is what a conll chunk tree looks like
print(train_sents[99])
In [41]:
# And this is the tree converted into (word, pos, chunk) tuples, the format used as training data
nltk.chunk.tree2conlltags(train_sents[99])
Out[41]:
In [42]:
def npchunk_features(sentence, i, history):
    """
    Step 1: define the features.
    Start with exactly the same feature as the unigram chunker, as a sanity check:
    if the classifier is set up correctly, its results should be close to the unigram chunker's.
    sentence -- the sentence to extract features from, as a list of (word, pos_tag)
    i -- index of the current word in the sentence
    history -- the tags predicted for the preceding words
    """
    word, pos = sentence[i]
    return {"pos": pos}
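To make the feature representation concrete, here is a quick illustration on a tiny hypothetical sentence; in this first version only the POS tag of the current word is used:
In [ ]:
npchunk_features([('the', 'DT'), ('dog', 'NN')], 1, ['B-NP'])
# -> {'pos': 'NN'}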
In [58]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """
    Step 2: define a class that takes training data and trains a classifier.
    """
    def __init__(self, train_sents):
        """Train a Maxent classifier from the training sentences.
        train_sents -- list of list of ((word, pos_tag), chunk_tag)
        """
        train_set = []
        for tagged_sent in train_sents:
            # tagged_sent   -- list of ((word, pos_tag), chunk_tag)
            # untagged_sent -- list of (word, pos_tag)
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_, chunk_tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                # record the mapping from feature set to IOB tag in train_set
                train_set.append((featureset, chunk_tag))
                history.append(chunk_tag)
        # let the MaxentClassifier learn the mapping from feature sets to tags
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0)

    def tag(self, sentence):
        """Chunk-tag a sentence.
        sentence -- list of (word, pos_tag)
        """
        history = []
        for i, _ in enumerate(sentence):
            # build the feature set for this position
            featureset = npchunk_features(sentence, i, history)
            # let the classifier pick a tag from the feature set
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
In [49]:
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """
    Step 3: a thin wrapper that converts conll-tree sentences into ((word, pos_tag), chunk_tag)
    and hands them to ConsecutiveNPChunkTagger to train the classifier.
    """
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)
In [50]:
chunker = ConsecutiveNPChunker(train_sents)
In [52]:
# Since the features are exactly the same as the unigram chunker's, the results are also about the same
print(chunker.evaluate(test_sents))
Try a second version of the feature set, adding the previous POS tag as a feature:
In [53]:
def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag)
    i -- index of the current word in the sentence
    history -- the tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}
In [59]:
chunker = ConsecutiveNPChunker(train_sents)
In [60]:
print(chunker.evaluate(test_sents))
Try a third version of the feature set, adding the current word itself as a feature.
In [61]:
def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag)
    i -- index of the current word in the sentence
    history -- the tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos": prevpos}
In [62]:
chunker = ConsecutiveNPChunker(train_sents)
In [63]:
print(chunker.evaluate(test_sents))
For the final version, also add the next word's POS tag, the pairings of adjacent POS tags, and the set of tags seen since the most recent determiner.
In [64]:
def tags_since_dt(sentence, i):
    # collect the set of POS tags seen since the most recent determiner (DT)
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag)
    i -- index of the current word in the sentence
    history -- the tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos, "word": word, "prevpos": prevpos, "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
In [65]:
chunker = ConsecutiveNPChunker(train_sents)
In [67]:
print(chunker.evaluate(test_sents))
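To see what this final feature set actually produces for one position, a small illustration on a hypothetical sentence:
In [ ]:
sent = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN')]
npchunk_features(sent, 3, ['B-NP', 'I-NP', 'I-NP'])
# -> {'pos': 'NN', 'word': 'dog', 'prevpos': 'JJ', 'nextpos': '<END>',
#     'prevpos+pos': 'JJ+NN', 'pos+nextpos': 'NN+<END>', 'tags-since-dt': 'JJ'}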
In [79]:
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = "Mary saw the cat sit on the mat"
preprocess(sentence, "")
Out[79]:
In [80]:
# This sentence is handled correctly
cp.parse(preprocess(sentence, "")[0])
Out[80]:
In [82]:
# But once an embedding clause is added there is a problem: thinks is not handled correctly
sentence = "John thinks Mary saw the cat sit on the mat"
cp.parse(preprocess(sentence, "")[0])
Out[82]:
In the example above, adding John thinks introduces an extra level of nesting, so thinks is mishandled and never merged into a VP. This can be solved with the loop=2 parameter, which runs the cascade of rules a second time.
In [83]:
cp = nltk.RegexpParser(grammar, loop=2)
cp.parse(preprocess(sentence, "")[0])
Out[83]:
Once NP chunks can be identified correctly, the next step is to decide which NP chunks are named entities; this process is called NER (named entity recognition).
Named entities (NE) fall into types such as ORGANIZATION, PERSON, LOCATION, DATE, TIME, MONEY, PERCENT, FACILITY, and GPE.
Difficulties of the NER problem: the set of possible names is open-ended, many names are multi-word expressions whose boundaries are hard to identify, and the same string can refer to different entity types.
The conll2002 corpus includes training data for the NER task.
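A minimal sketch of peeking at that corpus (the Spanish fileid below is an assumption about the installed data); each token comes as a (word, pos, IOB-NER-tag) triple:
In [ ]:
from nltk.corpus import conll2002
print(conll2002.fileids())
print(conll2002.iob_sents('esp.train')[0][:5])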
In [89]:
# With binary=True, only the positions of NEs are marked, not their types
# ne_chunk expects (word, pos) pairs, so the chunk tag from tree2conlltags is dropped here
nltk.ne_chunk([(w, t) for w, t, c in nltk.chunk.tree2conlltags(train_sents[99])], binary=True)
Out[89]:
In [91]:
# Without binary=True, the NE type is also marked (e.g. PERSON)
nltk.ne_chunk([(w, t) for w, t, c in nltk.chunk.tree2conlltags(train_sents[99])])
Out[91]:
In [95]:
import re
# match "in" used as a relation word, excluding cases where it is followed by a gerund (e.g. "in supervising ...")
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))