In [3]:
    
# A toy example of relation extraction: given (entity, relation, entity) triples,
# list the organizations that stand in the IN relation to 'New York'
location = [('Omnicom','IN','New York'), ('DBB','IN','New York'), ('BBDO','IN','Atlanta')]
[e1 for (e1, rel, e2) in location if rel == 'IN' and e2 == 'New York']
    
    Out[3]:
In [1]:
    
import nltk
from nltk.corpus import ieer
    
A typical information extraction pipeline consists of several steps: sentence segmentation, tokenization, part-of-speech tagging, entity detection, and relation detection.
The first three steps are not the focus of this chapter, so they are combined into a single function below. The next step is to find proper names and definite noun phrases as candidate entities. Finally, we look for possible relation patterns between pairs of entities.
In [66]:
    
def preprocess(document, tagset="universal"):
    sent = nltk.sent_tokenize(document)
    sent = [nltk.word_tokenize(s) for s in sent]
    sent = [nltk.pos_tag(s, tagset=tagset) for s in sent]
    return sent
    
In [3]:
    
sentence = preprocess('the little yellow dog barked at the cat')[0]
sentence
    
    Out[3]:
In [4]:
    
# RegexpParser: groups POS-tag sequences that match the grammar into a new chunk
grammar = "NP: {<DET>?<ADJ>*<NOUN>}"
cp = nltk.RegexpParser(grammar)
cp.parse(sentence)
    
    Out[4]:
In [5]:
    
# Improved rules:
#   (determiner|possessive pronoun) + adjectives + noun, e.g. "The black fat cat"
#   consecutive nouns (including proper nouns), e.g. "Mary Brown"
grammar = r"""
    NP: {<DET|PRON>?<ADJ>*<NOUN>}
        {<NOUN>+}
"""
cp = nltk.RegexpParser(grammar)
sentence = preprocess("Rapunzel let down her long golden hair")[0]
cp.parse(sentence)
    
    Out[5]:
In [6]:
    
def find_chunks(user_rule = '{<V.*><TO><V.*>}'):
    # search the Brown corpus for chunks matching the user-supplied rule
    # (the default rule matches verb + "to" + verb sequences, e.g. "combined to achieve")
    cp = nltk.RegexpParser('CHUNK: ' + user_rule)
    for sent in nltk.corpus.brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                yield subtree
    
In [7]:
    
v_to_v = list(find_chunks())
    
In [8]:
    
v_to_v[1]
    
    Out[8]:
In [9]:
    
# four or more consecutive nouns
n4 = list(find_chunks('{<N.*><N.*><N.*><N.*>+}'))
    
In [10]:
    
n4[2]
    
    Out[10]:
In [11]:
    
grammar = r"""
    NP: {<.*>+}      # chunk everything
        }<VERB|ADP>+{  # chink sequences of VERB and ADP
"""
sentence = preprocess('the little yellow dog barked at the cat')[0]
cp = nltk.RegexpParser(grammar)
cp.parse(sentence)
    
    Out[11]:
In [12]:
    
# a sample sentence in CoNLL 2000 format: word, POS tag, IOB chunk tag
conll_example = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''
    
In [13]:
    
nltk.chunk.conllstr2tree(conll_example)
    
    Out[13]:
In [14]:
    
# conll2000 contains Wall Street Journal text already annotated with IOB chunk tags
from nltk.corpus import conll2000
conll2000.chunked_sents()[99]
    
    Out[14]:
The conll2000 corpus contains three types of chunks: NP chunks (e.g. the black dog), VP chunks (e.g. has already known), and PP chunks (e.g. because of). If we are only interested in NP chunks, we can specify which chunk types to load:
In [15]:
    
conll2000.chunked_sents(chunk_types=['NP'])[99]
    
    Out[15]:
In [16]:
    
# the corpus is already split into two data sets (train and test)
conll2000.fileids()
    
    Out[16]:
In [17]:
    
# start with an empty parser as a baseline
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print cp.evaluate(test_sents)
    
    
Since the empty parser does not chunk anything, every token that lies outside an NP chunk counts as correct, so accuracy is still about 43%.
However, because there are no true positives at all, precision and recall are both 0.
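The 43% figure is just the proportion of tokens tagged O in the test data; this can be checked directly (a quick sketch, not part of the original notebook):
In [ ]:
    
# fraction of tokens that lie outside any NP chunk in the test data
tags = [c for sent in test_sents for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
print float(tags.count('O')) / len(tags)
    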
In [18]:
    
# try the simplest possible parser: chunk any sequence of tags beginning with C, D, J, N, or P
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print cp.evaluate(test_sents)
    
    
To design your own chunker, define a class that inherits from nltk.ChunkParserI and implements the __init__ and parse methods. For example, the chunker below uses a unigram model:
In [20]:
    
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        """
        train_sents: training data, given as a list of chunk trees
        """
        # tree2conlltags converts each tree into (word, pos_tag, chunk_tag) tuples
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        # learn to predict the chunk_tag from the pos_tag
        self.tagger = nltk.UnigramTagger(train_data)
    
    def parse(self, sentence):
        """
        sentence: a sentence as a list of (word, pos) pairs
        return: a chunk tree
        """
        pos_tags = [pos for word, pos in sentence]
        # predict the chunk_tag from the pos_tag alone, with no other information
        iob_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for pos, chunktag in iob_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
    
In [21]:
    
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print unigram_chunker.evaluate(test_sents)
    
    
In [29]:
    
# preprocess defaults to the universal tagset,
# but the chunker was trained on the original Penn Treebank tags, so pass an empty string to disable the mapping
sentence = preprocess('the little yellow dog barked at the cat', '')[0]
unigram_chunker.parse(sentence)
    
    Out[29]:
In [40]:
    
# this is what a chunk tree looks like
print train_sents[99]
    
    
In [41]:
    
# the same tree converted into (word, pos_tag, chunk_tag) tuples, the format of our training data
nltk.chunk.tree2conlltags(train_sents[99])
    
    Out[41]:
In [42]:
    
def npchunk_features(sentence, i, history):
    """
    Step 1: define the features.
    Start with exactly the same feature as the unigram chunker, so we can check that the classifier is set up correctly.
    If it is, the results should be about the same as the unigram chunker's.
    
    sentence -- the sentence to extract features from, as a list of (word, pos_tag) pairs
    i -- index of the current word in the sentence
    history -- chunk tags predicted for the preceding words
    """
    word, pos = sentence[i]
    return {"pos": pos}
    
In [58]:
    
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """
    Step 2: define a class that takes the training data and trains a classifier.
    """
    def __init__(self, train_sents):
        """輸入訓練用的句子,得到一個Maxent Classifier
        train_sents -- list of list of ((word, pos_tag), chunk_tag)
        """
        train_set = []
        for tagged_sent in train_sents:
            # tagged_sent -- list of ((word, pos_tag), chunk_tag)
            # untagged_sent -- list of (word, pos_tag)
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_, chunk_tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) 
                # pair each feature set with its IOB tag and add it to train_set
                train_set.append( (featureset, chunk_tag) )
                history.append(chunk_tag)
        # use a MaxentClassifier to learn the mapping from feature sets to chunk tags
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0)
        
    def tag(self, sentence):
        """輸入要作chunk tagging的句子
        sentence -- list of (word, pos_tag)
        """
        history = []
        for i, _ in enumerate(sentence):
            # build the feature set for the current word
            featureset = npchunk_features(sentence, i, history)
            # feed the feature set to the classifier to get the chunk tag
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
    
In [49]:
    
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """
    Step 3: a thin wrapper that converts chunk-tree sentences into ((word, pos_tag), chunk_tag) tuples
    and hands them to ConsecutiveNPChunkTagger to train the classifier.
    """
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)
    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)
    
In [50]:
    
chunker = ConsecutiveNPChunker(train_sents)
    
    Out[50]:
In [52]:
    
# since the features are identical to the unigram chunker's, the results are about the same
print chunker.evaluate(test_sents)
    
    
Try a second version of the feature set, adding the previous word's POS tag to the features:
In [53]:
    
def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag) pairs
    i -- index of the current word in the sentence
    history -- chunk tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}
    
In [59]:
    
chunker = ConsecutiveNPChunker(train_sents)
    
In [60]:
    
print chunker.evaluate(test_sents)
    
    
Try a third version of the feature set, adding the current word to the features.
In [61]:
    
def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag) pairs
    i -- index of the current word in the sentence
    history -- chunk tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos": prevpos}
    
In [62]:
    
chunker = ConsecutiveNPChunker(train_sents)
    
In [63]:
    
print chunker.evaluate(test_sents)
    
    
For the final version, add the next word's POS tag, the combinations of adjacent POS tags, and the set of tags seen since the most recent determiner.
In [64]:
    
def tags_since_dt(sentence, i):
    """Return the POS tags seen since the most recent determiner (DT), joined with '+'."""
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))
def npchunk_features(sentence, i, history):
    """
    sentence -- the sentence to extract features from, as a list of (word, pos_tag) pairs
    i -- index of the current word in the sentence
    history -- chunk tags predicted for the preceding words
    """
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos, "word": word, "prevpos": prevpos, "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}
    
In [65]:
    
chunker = ConsecutiveNPChunker(train_sents)
    
In [67]:
    
print chunker.evaluate(test_sents)
    
    
In [79]:
    
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
sentence = "Mary saw the cat sit on the mat"
preprocess(sentence, "")
    
    Out[79]:
In [80]:
    
# this sentence parses fine
cp.parse(preprocess(sentence, "")[0])
    
    Out[80]:
In [82]:
    
# but once a clause is added, 'thinks' is not handled correctly
sentence = "John thinks Mary saw the cat sit on the mat"
cp.parse(preprocess(sentence, "")[0])
    
    Out[82]:
In the example above, adding 'John thinks' introduces another level of nesting, so 'thinks' is handled incorrectly and never gets merged into a VP. This can be fixed with the loop=2 parameter, which makes the parser run through its patterns twice.
In [83]:
    
cp = nltk.RegexpParser(grammar, loop=2)
cp.parse(preprocess(sentence, "")[0])
    
    Out[83]:
Once we can correctly identify NP chunks, the next step is to decide which NP chunks are named entities; this process is called NER (named entity recognition).
Named entities (NEs) fall into types such as ORGANIZATION, PERSON, LOCATION, DATE, TIME, MONEY, PERCENT, FACILITY, and GPE.
NER is harder than it looks: named entities cannot simply be looked up in a gazetteer, since many names are ambiguous (the same word can be a location, a person, or an ordinary noun) and multi-word names have to be segmented correctly.
The conll2002 corpus provides training data for the NER task.
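For example, the Spanish and Dutch NER data can be loaded through the same corpus-reader interface as conll2000 (a minimal sketch, not part of the original notebook, assuming the conll2002 corpus has been downloaded with nltk.download):
In [ ]:
    
from nltk.corpus import conll2002
# the corpus is split into Spanish (esp.*) and Dutch (ned.*) train/test files
print conll2002.fileids()
# sentences are available as NE-chunked trees, just like conll2000
print conll2002.chunked_sents('esp.train')[0]
    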
In [89]:
    
# with binary=True, only the positions of NEs are marked, not their types
# ne_chunk expects a POS-tagged sentence, i.e. a list of (word, pos) pairs
nltk.ne_chunk(train_sents[99].leaves(), binary=True)
    
    Out[89]:
In [91]:
    
# without binary=True, the NE types (e.g. PERSON) are labelled as well
nltk.ne_chunk(train_sents[99].leaves())
    
    Out[91]:
In [95]:
    
# relation extraction: find (ORG, LOC) pairs connected by the preposition 'in',
# excluding cases like 'in producing' via the negative lookahead
import re
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print nltk.sem.rtuple(rel)
    
    
In [ ]: