Mars Target Encyclopedia - NER

Thamme Gowda (Thamme.Gowda@jpl.nasa.gov)

Named Entity Recognition / Sequence Tagging. This notebook performs NER tagging using CRFSuite.

Notes:

  • Use python3, Reason: we need unicode strings, which is default in python3
  • install Python-crfsuite
  • Start CoreNLP Server

In [1]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from pycorenlp import StanfordCoreNLP
from codecs import open as copen
from collections import defaultdict as ddict
from csv import DictWriter
import sys
from copy import copy
import time
from pprint import pprint
import re

import os, glob
import pickle

print(sklearn.__version__)


0.18.1

In [6]:
# Entity types kept for training/evaluation; all other brat labels are ignored.
# Earlier experiments also used: 'Material', 'Locality', 'Site'.
accept_labels = {'Target', 'Mineral', 'Element'}

class BratToCRFSuitFeaturizer(object):
    """
    Converts brat standoff annotations (.txt + .ann pairs) into
    CRFSuite-style token sequences, using a Stanford CoreNLP server for
    tokenization, sentence splitting, lemmas, POS and (generic) NER tags.
    """

    def __init__(self, corenlp_url='http://localhost:9000', iob=False):
        '''
        Create Converter for converting brat annotations to Core NLP NER CRF
        classifier training data.
        @param corenlp_url: URL to corenlp server.
                To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        @param iob: set 'True' for IOB encoding
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)
        self.iob = iob

    def convert(self, text_file, ann_file):
        '''
        Tokenize `text_file` via CoreNLP and align the brat annotations in
        `ann_file` onto the tokens.
        Yields one list per sentence; each element is
        [word, lemma, POS, corenlp_ner, label].
        '''
        text, tree = self.parse(text_file, ann_file)
        props = { 'annotators': 'tokenize,ssplit,lemma,pos,ner', 'outputFormat': 'json'}
        if text and text[0].isspace():
            # Some tools trim/strip leading whitespace, which would shift all
            # character offsets; replacing the first char with '.' keeps the
            # string length (and hence all offsets) unchanged.
            text = '.' + text[1:]
        output = self.corenlp.annotate(text, properties=props)
        for sentence in output['sentences']:
            sent_features = []
            # Tracks an annotation that spans multiple tokens:
            # its label and the character offset where it ends.
            # BUG FIX: was `continue_ann_en` (typo) in the original.
            continue_ann, continue_ann_end = None, None
            for tok in sentence['tokens']:
                begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd']
                label = 'O'
                if begin in tree:
                    node = tree[begin]
                    if len(node) > 1:
                        print("WARN: multiple starts at ", begin, node)
                        if tok_end in node:
                            node = {tok_end: node[tok_end]} # picking one
                            print("Chose:", node)

                    ann_end, labels = list(node.items())[0]
                    if not len(labels) == 1:
                        print("WARN: Duplicate labels for token: %s, label:%s.\
                              Using the first one!" % (tok['word'], str(labels)))
                    if accept_labels is not None and labels[0] in accept_labels:
                        label = labels[0]

                    if tok_end == ann_end: # annotation ends where token ends
                        continue_ann = None
                    elif tok_end < ann_end and label != 'O':
                        # annotation extends past this token; remember it
                        continue_ann = label
                        continue_ann_end = ann_end
                    if label != 'O' and self.iob:
                        label = "B-" + label
                elif continue_ann is not None and tok_end <= continue_ann_end:
                    label = continue_ann            # previous label is this label
                    if continue_ann_end == tok_end: # continuation ends here
                        continue_ann = None
                    if self.iob:
                        label = "I-" + label
                sent_features.append([tok['word'], tok['lemma'], tok['pos'], tok['ner'], label])
            yield sent_features

    def parse(self, txt_file, ann_file):
        '''
        Read a brat .txt/.ann pair.
        @return: (texts, tree) where `texts` is the full document string and
                 `tree` maps begin_offset -> {end_offset: [entity_types]}.
        Annotations with fewer than 3 tab-separated fields, or with
        discontinuous spans (';' in the offset spec), are skipped.
        '''
        with copen(txt_file, 'r', encoding='utf-8') as text_f:
            texts = text_f.read()
        with copen(ann_file, 'r', encoding='utf-8') as ann_f:
            anns = map(lambda x: x.strip().split('\t'), ann_f)
            anns = filter(lambda x: len(x) > 2, anns)
            # FIXME: ignoring the annotations which are complex

            anns = filter(lambda x: ';' not in x[1], anns)
            # FIXME: some annotations' spread have been split into many, separated by ; ignoring them

            def __parse_ann(ann):
                # ann = [id, "Type begin end", surface_text]
                spec = ann[1].split()
                name = spec[0]
                markers = [int(x) for x in spec[1:]]
                t = texts[markers[0]:markers[1]]
                if not t == ann[2]:
                    # offsets do not reproduce the annotated surface form
                    print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann)))
                    return None
                return (name, markers, t)
            anns = map(__parse_ann, anns) # format
            anns = filter(lambda x: x, anns) # skip None

            # Build an offset-index for O(1) lookup while aligning tokens.
            # NOTE: the lazy map/filter chain above reads ann_f here, so this
            # loop must stay inside the `with` block.
            tree = {}
            for entity_type, pos, name in anns:
                if entity_type not in accept_labels:
                    continue
                begin, end = pos[0], pos[1]
                tree.setdefault(begin, {}).setdefault(end, []).append(entity_type)
        # The original re-read the text file here; that was redundant —
        # `texts` is already the fully decoded document.
        return texts, tree

def scan_dir(dir_name):
    """
    Find brat annotation pairs in a directory.
    @param dir_name: directory containing matching .ann/.txt files
    @return: sorted list of (ann_path, txt_path) tuples.
             Returns a list (re-iterable, unlike the original one-shot map)
             in deterministic order (glob's order is filesystem-dependent).
    """
    ann_files = sorted(glob.glob(os.path.join(dir_name, "*.ann")))
    # splitext only swaps the final extension, so a ".ann" substring
    # elsewhere in the path is left untouched.
    return [(f, os.path.splitext(f)[0] + ".txt") for f in ann_files]

def preprocess_all(list_file, out_file):
    """
    Featurize every (txt, ann) pair listed in `list_file` and pickle the
    tokenized corpus to `out_file`.
    @param list_file: text file with one "txt_path,ann_path" pair per line
    @param out_file: destination pickle path
    Requires a running CoreNLP server (see BratToCRFSuitFeaturizer).
    """
    featzr = BratToCRFSuitFeaturizer(iob=True)
    tokenized = []
    with open(list_file) as f:
        # skip blank lines (trailing newline in the list file used to crash
        # the unpack below)
        examples = [line.strip().split(',') for line in f if line.strip()]
    for txt_file, ann_file in examples:
        sents = featzr.convert(txt_file, ann_file)
        tokenized.append(list(sents))

    # use a context manager so the handle is closed promptly
    # (the original leaked the file object passed to pickle.dump)
    with open(out_file, 'wb') as out:
        pickle.dump(tokenized, out)
    print("Dumped %d docs to %s" % (len(tokenized), out_file))

#######################
# Evaluates the model
def evaluate(tagger, corpus_file):
    """
    Tag every document in a pickled corpus and build a confusion table.
    @param tagger: an opened pycrfsuite.Tagger
    @param corpus_file: pickle produced by preprocess_all()
    @return: (table, row_keys, col_keys) ready for printtable().
             table[truth][pred] is a count; the 'total' row/column hold
             marginals, and 'Precision'/'Recall' rows hold formatted strings.
    """
    # close the handle promptly (the original leaked the open() result)
    with open(corpus_file, 'rb') as f:
        corpus = pickle.load(f)
    y_pred = []
    y_true = []
    for doc in corpus:
        seq = merge_sequences(doc)
        truth = seq2labels(seq)
        preds = tagger.tag(seq2features(seq))
        assert len(truth) == len(preds)
        y_true.extend(truth)
        y_pred.extend(preds)
    assert len(y_true) == len(y_pred)
    table = ddict(lambda: ddict(int))
    for truth, pred in zip(y_true, y_pred):
        table[truth][pred] += 1
        table[truth]['total'] += 1
        table['total'][pred] += 1
        table['total']['total'] += 1
    keys = []
    for label in accept_labels:
        keys.append('B-%s' % label)
        keys.append('I-%s' % label)
    col_keys = copy(keys)
    for k in set(keys):
        tot_preds = table['total'][k]
        tot_truth = table[k]['total']
        # formatted string when the denominator is non-zero, else the int 0
        # (0 matches printtable's restval for missing cells)
        table['Precision'][k] = "%.4f" % (float(table[k][k]) / tot_preds) if tot_preds else 0
        table['Recall'][k] = "%.4f" % (float(table[k][k]) / tot_truth) if tot_truth else 0
    col_keys.extend(['O', 'total'])
    # '' entries become blank separator rows in printtable()
    keys.extend(['', 'Precision', 'Recall', '', 'O', 'total'])
    return table, keys, col_keys


def printtable(table, row_keys, col_keys, delim=','):
    """
    Dump the evaluation table to stdout as delimited text, suitable for
    pasting into a spreadsheet. Blank entries in `row_keys` are rendered
    as empty separator lines; missing cells default to 0.
    """
    stream = sys.stdout
    writer = DictWriter(stream, delimiter=delim, restval=0, fieldnames=col_keys)
    # corner cell for the row-label column, then the header row
    stream.write("%s%s" % ("***", delim))
    writer.writeheader()
    for row_key in row_keys:
        if not row_key.strip():
            stream.write("\n")
        else:
            stream.write("%s%s" % (row_key, delim))
            writer.writerow(table[row_key])
    stream.write("\n")

Parse and store the corpus

In this step, we pass the text through the CoreNLP pipeline to tokenize and POS-tag it. In addition, we look up the annotations file and match the target annotations with the tokens.

Since this step is expensive, we store the results in pickle file, so that we can later load and resume our analysis for feature engineering.


In [ ]:
# NOTE(review): hardcoded absolute local path — consider reading it from an
# environment variable or a config cell so the notebook runs on other machines.
p_dir = "/Users/thammegr/work/mte/data/newcorpus/workspace"
train_list = p_dir + "/train_62r15_685k14_384k15.list"
dev_list= p_dir + "/development.list"
test_list = p_dir + "/test.list"

# Featurize and cache each split (expensive: requires a running CoreNLP server).
train_corpus_file = 'mte-corpus-train.pickle'
preprocess_all(train_list, train_corpus_file)

# Test and Development set
dev_corpus_file = 'mte-corpus-dev.pickle'
preprocess_all(dev_list, dev_corpus_file)
test_corpus_file = 'mte-corpus-test.pickle'
preprocess_all(test_list, test_corpus_file)

Load the corpus

Here we load the corpus from pickle file


In [4]:
# Load the cached training corpus and peek at one sentence.
corpus_file = 'mte-corpus-train.pickle'
# use a context manager so the handle is closed (the original leaked it)
with open(corpus_file, 'rb') as f:
    corpus = pickle.load(f)
corpus[0][10]


Out[4]:
[['Hollow', 'hollow', 'JJ', 'O', 'O'],
 ['spherical', 'spherical', 'JJ', 'O', 'O'],
 ['feature', 'feature', 'NN', 'O', 'O'],
 ['observed', 'observe', 'VBN', 'O', 'O'],
 ['on', 'on', 'IN', 'O', 'O'],
 ['sol', 'sol', 'NN', 'O', 'O'],
 ['122', '122', 'CD', 'NUMBER', 'O'],
 ['in', 'in', 'IN', 'O', 'O'],
 ['the', 'the', 'DT', 'O', 'O'],
 ['Yellowknife', 'Yellowknife', 'NNP', 'LOCATION', 'O'],
 ['Bay', 'Bay', 'NNP', 'LOCATION', 'O'],
 ['area', 'area', 'NN', 'O', 'O'],
 ['.', '.', '.', 'O', 'O']]

Next, we start playing with the features of CRF Suite to build a sequence tagger.


In [11]:
#%%time
# Feature switches consumed by word2features(); flip these to experiment
# with different CRF feature sets.
config = {
    'POS': False,                     # full POS tag as a feature
    'gen_POS': True, # generalized POS: first two chars of the tag (NN/NNS -> NN)
    'bias': True,                     # constant per-token bias feature
    'max_suffix_chars': 3,            # suffix features of length 1..3
    'is_lower': True,                 # token.islower() flag
    'is_upper': True,                 # token.isupper() flag
    'is_title': True,                 # token.istitle() flag
    'text': True,                     # raw token text as a feature
    'wordshape': 'sound',             # one of: 'general' | 'sound' | 'sound_case'
    'NER': False, # CoreNLP's default NER tag as a feature
    'context': list(range(-1, 2))     # relative offsets of the context window (-1..1)
}

def get_wordshape_general(word):
    """
    Map a word to its case/digit shape: lowercase letters become 'x',
    uppercase letters 'X', digits 'd' (e.g. 'Ab1' -> 'Xxd').
    """
    # Substitution order matters: digits must be replaced last so that
    # the introduced 'x'/'X' markers are never re-substituted.
    shape = re.sub("[a-z]", 'x', word)
    shape = re.sub("[A-Z]", 'X', shape)
    shape = re.sub("[0-9]", 'd', shape)
    return shape

def get_wordshape_sound(word):
    """
    Map a word to a consonant/vowel/digit shape: consonants (either case)
    become 'c', vowels 'v', digits 'd' (e.g. 'were' -> 'cvcv').
    Characters outside these classes are kept as-is.
    """
    consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ"
    vowels = "AEIOUaeiou"
    digits = "0123456789"
    # a single-pass character translation is equivalent to the original
    # three sequential regex substitutions
    mapping = str.maketrans(
        consonants + vowels + digits,
        'c' * len(consonants) + 'v' * len(vowels) + 'd' * len(digits))
    return word.translate(mapping)

def get_wordshape_sound_case(word):
    """
    Case-sensitive sound shape: lowercase consonants -> 'c', uppercase -> 'C',
    lowercase vowels -> 'v', uppercase -> 'V', and whole numbers (optional
    sign, optional decimal part) -> a single 'N' (e.g. 'Fe2O3' -> 'CvNVN').
    """
    # Order matters: letters first, then the number pattern collapses
    # whole signed/decimal numerals into one marker.
    substitutions = (
        ("[bcdfghjklmnpqrstvwxyz]", 'c'),   # lowercase consonants
        ("[BCDFGHJKLMNPQRSTVWXYZ]", 'C'),   # uppercase consonants
        ("[aeiou]", 'v'),                   # lowercase vowels
        ("[AEIOU]", 'V'),                   # uppercase vowels
        (r"[+-]?[0-9]+(\.[0-9]+)?", 'N'),   # signed / decimal numbers
    )
    for pattern, marker in substitutions:
        word = re.sub(pattern, marker, word)
    return word

def word2features(sent, idx):
    """
    Build the CRFSuite feature strings for the token at position `idx` of
    sentence `sent`.

    Each token record is [text, lemma, POS, ner, label]. Features are
    generated for every token in the configured context window; each
    feature name is prefixed with that token's relative offset
    (e.g. '-1', '0', '1'). Feature selection is driven by the
    module-level `config` dict.
    @raise ValueError: if config['wordshape'] is an unknown shape name
    """
    words = []
    feats = []

    # Collect (relative-offset, token) pairs for the context window.
    context = set(config.get('context', []))
    context.add(0)  # current word is always included
    for ctx in sorted(context):
        pos = ctx + idx
        if pos >= 0 and pos < len(sent):
            words.append((str(ctx), sent[pos]))

    if idx == 0:
        feats.append('BOS') # begin of sequence
    if idx == len(sent) - 1:
        feats.append('EOS') # end of sequence
    for prefix, word in words:
        assert len(word) == 5
        txt, lemma, POS, ner, label = word
        if config.get('bias'):
            feats.append('%sword.bias'% (prefix))
        if config.get('POS'):
            feats.append('%sword.pos=%s' %(prefix, POS))
        if config.get('gen_POS'):
            # first two tag chars generalize, e.g. NN/NNS/NNP -> NN
            feats.append('%sword.genpos=%s' %(prefix, POS[:2]))
        if config.get('max_suffix_chars'):
            # suffixes of length 1..max_suffix_chars (stop once longer than the word)
            for i in range(1, config.get('max_suffix_chars', -1) + 1):
                if len(txt) < i:
                    break
                feats.append('%sword[-%d:]=%s' % (prefix, i, txt[-i:]))
        if config.get('is_lower'):
            feats.append('%sword.islower=%s' % (prefix, txt.islower()))
        if config.get('is_upper'):
            feats.append('%sword.isupper=%s' % (prefix, txt.isupper()))
        if config.get('is_title'):
            feats.append('%sword.istitle=%s' % (prefix, txt.istitle()))
        if config.get('wordshape'):
            shape = config['wordshape']
            if shape == 'general':
                shape_val = get_wordshape_general(txt)
            elif shape == 'sound':
                shape_val = get_wordshape_sound(txt)
            elif shape == 'sound_case':
                shape_val = get_wordshape_sound_case(txt)
            else:
                # BUG FIX: the original raised the undefined name `Error`
                # (a NameError at runtime); raise a proper ValueError instead.
                raise ValueError("Word Shape spec unknown '%s'" % config['wordshape'])
            feats.append('%sword.shape=%s' % (prefix, shape_val))
        if config.get('NER'):
            feats.append('%sword.ner=%s' % (prefix, ner))
        if config.get('text'):
            feats.append('%sword.text=%s' % (prefix, txt))
    return feats

def seq2features(sent):
    """Feature-ize every token position of the sequence via word2features()."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def seq2labels(sent):
    """Extract the gold labels — the last element of every token record."""
    labels = []
    for token in sent:
        labels.append(token[-1])
    return labels

def merge_sequences(doc):
    '''
    Flatten a document (a list of sentences) into one long token sequence,
    inserting a fresh '|' marker record after each sentence.
    '''
    merged = []
    for sentence in doc:
        merged += sentence
        merged += [['|', '|', '|', 'O', 'O']]  # sentence end marker
    return merged
  

def train(corpus, model_file):
    """
    Train a CRFSuite sequence model on the featurized corpus and write it
    to `model_file`. For the first document only, prints a feature sample
    around an arbitrary position for eyeballing; also prints the trainer's
    parameter names, the feature config, and the training time.
    """
    trainer = pycrfsuite.Trainer(verbose=False)
    show_sample = True
    for doc in corpus:
        seq = merge_sequences(doc)
        x_seq = seq2features(seq)
        if show_sample:
            p = 403  # arbitrary position used for the one-off debug dump
            print("Sample features:")
            print("\n".join(map(str, seq[p-6:p+6])))
            print("\n".join(x_seq[p]))
            show_sample = False
        trainer.append(x_seq, seq2labels(seq))

    trainer.set_params({
        'c1': 0.5,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    start = time.time()
    pprint(trainer.params())
    pprint(config)
    trainer.train(model_file)
    print("Training Time: %.3fs" % (time.time() - start))

# Train on the loaded training corpus and persist the model to disk.
model_file = 'jpl-mars-target-ner-model.crfsuite'
train(corpus, model_file)

# Reload the persisted model and evaluate on the held-out sets.
tagger = pycrfsuite.Tagger()
tagger.open(model_file)
print("\nEvaluating on Development Set\n")
dev_corpus_file = 'mte-corpus-dev.pickle'
printtable(*evaluate(tagger, dev_corpus_file))

print("\nEvaluating on Test Set\n")
test_corpus_file = 'mte-corpus-test.pickle'
printtable(*evaluate(tagger, test_corpus_file))


Sample features:
['hollow', 'hollow', 'JJ', 'O', 'O']
['.', '.', '.', 'O', 'O']
['|', '|', '|', 'O', 'O']
['No', 'no', 'DT', 'O', 'O']
['such', 'such', 'JJ', 'O', 'O']
['features', 'feature', 'NNS', 'O', 'O']
['were', 'be', 'VBD', 'O', 'O']
['observed', 'observe', 'VBN', 'O', 'O']
['across', 'across', 'IN', 'O', 'O']
['Bradbury', 'Bradbury', 'NNP', 'O', 'O']
['Rise', 'rise', 'NN', 'O', 'O']
[',', ',', ',', 'O', 'O']
-1word.bias
-1word.genpos=NN
-1word[-1:]=s
-1word[-2:]=es
-1word[-3:]=res
-1word.islower=True
-1word.isupper=False
-1word.istitle=False
-1word.shape=cvvcvcvc
-1word.text=features
0word.bias
0word.genpos=VB
0word[-1:]=e
0word[-2:]=re
0word[-3:]=ere
0word.islower=True
0word.isupper=False
0word.istitle=False
0word.shape=cvcv
0word.text=were
1word.bias
1word.genpos=VB
1word[-1:]=d
1word[-2:]=ed
1word[-3:]=ved
1word.islower=True
1word.isupper=False
1word.istitle=False
1word.shape=vccvccvc
1word.text=observed
['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']
{'NER': False,
 'POS': False,
 'bias': True,
 'context': [-1, 0, 1],
 'gen_POS': True,
 'is_lower': True,
 'is_title': True,
 'is_upper': True,
 'max_suffix_chars': 3,
 'text': True,
 'wordshape': 'sound'}
Training Time: 133.967s

Evaluating on Development Set

***,B-Mineral,I-Mineral,B-Target,I-Target,B-Element,I-Element,O,total
B-Mineral,195,0,0,0,1,0,44,240
I-Mineral,0,0,0,0,0,0,0,0
B-Target,0,0,36,0,0,0,111,147
I-Target,0,0,0,8,0,0,6,14
B-Element,0,0,0,0,331,0,44,375
I-Element,0,0,0,0,0,0,0,0

Precision,0.9701,0,0.9730,0.8889,0.8874,0,0,0
Recall,0.8125,0,0.2449,0.5714,0.8827,0,0,0

O,6,0,1,1,41,0,34145,34194
total,201,0,37,9,373,0,34350,34970


Evaluating on Test Set

***,B-Mineral,I-Mineral,B-Target,I-Target,B-Element,I-Element,O,total
B-Mineral,265,0,0,0,0,0,47,312
I-Mineral,1,0,0,0,0,0,2,3
B-Target,0,0,57,0,0,0,137,194
I-Target,0,0,0,8,0,0,12,20
B-Element,0,0,0,0,431,0,27,458
I-Element,0,0,0,0,0,0,0,0

Precision,0.9707,0,0.9048,0.8889,0.8725,0,0,0
Recall,0.8494,0.0000,0.2938,0.4000,0.9410,0,0,0

O,7,0,6,1,63,0,59566,59643
total,273,0,63,9,494,0,59791,60630

Using the model to predict


In [220]:
# Error analysis: print the context window around every token the model
# labeled B-Element while the gold label is 'O' (false positives).
tagger = pycrfsuite.Tagger()
tagger.open(model_file)

with open(dev_corpus_file, 'rb') as f:
    dev_corpus = pickle.load(f)

ctx = (-3, 4)  # context window: 3 tokens before through 3 after
c = 0          # number of mismatches printed
print("idx, Truth, Predicted, Word, Comment ")
for doc in dev_corpus:
    seq = merge_sequences(doc)
    y = seq2labels(seq)                  # gold labels
    y_ = tagger.tag(seq2features(seq))   # predicted labels
    
    for idx in range(len(seq)):
        a, p, tok = y[idx], y_[idx], seq[idx]
        if a == 'O' and p == 'B-Element':
            # print the surrounding tokens, marking the center position
            for pos in filter(lambda p: 0 <= p < len(seq), range(idx+ctx[0], idx+ctx[1])):
                if idx == pos:
                    label = "<CORR>" if a == p else "<ERR>"
                else:
                    label = "%d" % (pos - idx)  # relative offset of the context token
                print("%4d %9s %9s %8s %s" % (pos, y[pos], y_[pos], label, str(seq[pos])))
            print("")
            if a != p:
                c += 1
print(c)


idx, Truth, Predicted, Word, Comment 
 500         O         O       -3 ['Cooperstown', 'Cooperstown', 'NNP', 'LOCATION', 'O']
 501         O         O       -2 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 502         O         O       -1 ['-4.62', '-4.62', 'CD', 'NUMBER', 'O']
 503         O B-Element    <ERR> ['N', 'n', 'NN', 'O', 'O']
 504         O         O        1 [',', ',', ',', 'O', 'O']
 505         O         O        2 ['137.42', '137.42', 'CD', 'NUMBER', 'O']
 506         O         O        3 ['E', 'e', 'NN', 'O', 'O']

 550         O         O       -3 ['Kimberley', 'kimberley', 'NN', 'PERSON', 'O']
 551         O         O       -2 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 552         O         O       -1 ['-4.64', '-4.64', 'CD', 'NUMBER', 'O']
 553         O B-Element    <ERR> ['N', 'n', 'NN', 'O', 'O']
 554         O         O        1 [',', ',', ',', 'O', 'O']
 555         O         O        2 ['137.4', '137.4', 'CD', 'NUMBER', 'O']
 556         O         O        3 ['E', 'e', 'NN', 'O', 'O']

1536         O         O       -3 ['.', '.', '.', 'O', 'O']
1537         O         O       -2 ['|', '|', '|', 'O', 'O']
1538         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1539         O B-Element    <ERR> ['B', 'b', 'NN', 'O', 'O']
1540         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1541         O         O        2 ['Na2O', 'na2o', 'NN', 'O', 'O']
1542         O         O        3 ['versus', 'versus', 'CC', 'O', 'O']

 412         O         O       -3 ['using', 'use', 'VBG', 'O', 'O']
 413         O         O       -2 ['a', 'a', 'DT', 'O', 'O']
 414         O         O       -1 ['1064-nm', '1064-nm', 'JJ', 'O', 'O']
 415         O B-Element    <ERR> ['Nd', 'nd', 'NN', 'O', 'O']
 416         O         O        1 [':', ':', ':', 'O', 'O']
 417         O         O        2 ['YAG', 'yag', 'NN', 'O', 'O']
 418         O         O        3 ['q-switched', 'q-switched', 'JJ', 'O', 'O']

 951         O         O       -3 ['The', 'the', 'DT', 'O', 'O']
 952         O         O       -2 ['ratio', 'ratio', 'NN', 'O', 'O']
 953         O         O       -1 ['of', 'of', 'IN', 'O', 'O']
 954         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
 955         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 956         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
 957         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

 994         O         O       -3 ['|', '|', '|', 'O', 'O']
 995         O         O       -2 ['The', 'the', 'DT', 'O', 'O']
 996         O         O       -1 ['flat', 'flat', 'JJ', 'O', 'O']
 997         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
 998         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 999         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
1000         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1050         O         O       -3 ['.', '.', '.', 'O', 'O']
1051         O         O       -2 ['|', '|', '|', 'O', 'O']
1052         O         O       -1 ['Decreasing', 'decrease', 'VBG', 'O', 'O']
1053         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
1054         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1055         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
1056         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1068         O         O       -3 ['collection', 'collection', 'NN', 'O', 'O']
1069         O         O       -2 ['suggest', 'suggest', 'VBP', 'O', 'O']
1070         O         O       -1 ['that', 'that', 'IN', 'O', 'O']
1071         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
1072         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1073         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
1074         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1073         O         O       -3 ['II', 'II', 'NNP', 'O', 'O']
1074         O         O       -2 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1075         O         O       -1 ['and', 'and', 'CC', 'O', 'O']
1076         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
1077         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1078         O         O        2 ['I', 'I', 'PRP', 'O', 'O']
1079         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1086         O         O       -3 ['plasma', 'plasma', 'NN', 'O', 'O']
1087         O         O       -2 [',', ',', ',', 'O', 'O']
1088         O         O       -1 ['and', 'and', 'CC', 'O', 'O']
1089         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'MISC', 'O']
1090         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1091         O         O        2 ['I', 'I', 'PRP', 'O', 'O']
1092         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1111         O         O       -3 ['who', 'who', 'WP', 'O', 'O']
1112         O         O       -2 ['found', 'find', 'VBD', 'O', 'O']
1113         O         O       -1 ['that', 'that', 'IN', 'O', 'O']
1114         O B-Element    <ERR> ['Al', 'Al', 'NNP', 'O', 'O']
1115         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1116         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
1117         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1118         O         O       -3 ['propagates', 'propagate', 'VBZ', 'O', 'O']
1119         O         O       -2 ['vertically', 'vertically', 'RB', 'O', 'O']
1120         O         O       -1 ['while', 'while', 'IN', 'O', 'O']
1121         O B-Element    <ERR> ['Al', 'Al', 'NNP', 'O', 'O']
1122         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1123         O         O        2 ['I', 'I', 'PRP', 'O', 'O']
1124         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

 457         O         O       -3 ['telescope', 'telescope', 'NN', 'O', 'O']
 458         O         O       -2 [',', ',', ',', 'O', 'O']
 459         O         O       -1 ['a', 'a', 'DT', 'O', 'O']
 460         O B-Element    <ERR> ['Nd', 'nd', 'NN', 'O', 'O']
 461         O         O        1 [':', ':', ':', 'O', 'O']
 462         O         O        2 ['YAG', 'yag', 'NN', 'O', 'O']
 463         O         O        3 ['laser', 'laser', 'NN', 'O', 'O']

  26         O         O       -3 ['T.', 'T.', 'NNP', 'O', 'O']
  27         O         O       -2 ['Dequaire1', 'Dequaire1', 'NNP', 'O', 'O']
  28         O         O       -1 [',', ',', ',', 'O', 'O']
  29         O B-Element    <ERR> ['P', 'p', 'NN', 'O', 'O']
  30         O         O        1 [',', ',', ',', 'O', 'O']
  31         O         O        2 ['Y.', 'Y.', 'NNP', 'O', 'O']
  32         O         O        3 ['Meslin2', 'Meslin2', 'NNP', 'O', 'O']

  79         O         O       -3 [',', ',', ',', 'O', 'O']
  80         O         O       -2 ['1LISA', '1lisa', 'NN', 'O', 'O']
  81         O         O       -1 [',', ',', ',', 'O', 'O']
  82         O B-Element    <ERR> ['Cr', 'cr', 'NN', 'O', 'O']
  83         O         O        1 ['teilParis', 'teilparis', 'NN', 'O', 'O']
  84         O         O        2 [',', ',', ',', 'O', 'O']
  85         O         O        3 ['France', 'France', 'NNP', 'LOCATION', 'O']

1699         O         O       -3 ['reveal', 'reveal', 'VBP', 'O', 'O']
1700         O         O       -2 ['detections', 'detection', 'NNS', 'O', 'O']
1701         O         O       -1 ['of', 'of', 'IN', 'O', 'O']
1702         O B-Element    <ERR> ['N', 'n', 'NN', 'O', 'O']
1703         O         O        1 [',', ',', ',', 'O', 'O']
1704         O B-Element        2 ['H', 'h', 'NN', 'O', 'O']
1705         O         O        3 ['and', 'and', 'CC', 'O', 'O']

1701         O         O       -3 ['of', 'of', 'IN', 'O', 'O']
1702         O B-Element       -2 ['N', 'n', 'NN', 'O', 'O']
1703         O         O       -1 [',', ',', ',', 'O', 'O']
1704         O B-Element    <ERR> ['H', 'h', 'NN', 'O', 'O']
1705         O         O        1 ['and', 'and', 'CC', 'O', 'O']
1706         O         O        2 ['C.', 'C.', 'NNP', 'O', 'O']
1707         O         O        3 ['1364', '1364', 'CD', 'DATE', 'O']

  30         O         O       -3 ['.', '.', '.', 'O', 'O']
  31         O         O       -2 ['|', '|', '|', 'O', 'O']
  32         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
  33         O B-Element    <ERR> ['B', 'b', 'NN', 'O', 'O']
  34         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
  35         O         O        2 ['MAHLI', 'mahli', 'NN', 'O', 'O']
  36         O         O        3 ['image', 'image', 'NN', 'O', 'O']

 662         O         O       -3 ['are', 'be', 'VBP', 'O', 'O']
 663         O         O       -2 ['armored', 'armored', 'JJ', 'O', 'O']
 664         O         O       -1 ['by', 'by', 'IN', 'O', 'O']
 665         O B-Element    <ERR> ['medium', 'medium', 'NN', 'O', 'O']
 666         O         O        1 ['to', 'to', 'TO', 'O', 'O']
 667         O         O        2 ['coarse', 'coarse', 'JJ', 'O', 'O']
 668         O         O        3 ['sand', 'sand', 'NN', 'O', 'O']

1155         O         O       -3 ['absorptions', 'absorption', 'NNS', 'O', 'O']
1156         O         O       -2 ['indicating', 'indicate', 'VBG', 'O', 'O']
1157         O         O       -1 ['dominantly', 'dominantly', 'RB', 'O', 'O']
1158         O B-Element    <ERR> ['Fe', 'Fe', 'NNP', 'O', 'O']
1159         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1160         O         O        2 ['II', 'II', 'NNP', 'O', 'O']
1161         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1161         O         O       -3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1162         O         O       -2 ['rather', 'rather', 'RB', 'O', 'O']
1163         O         O       -1 ['than', 'than', 'IN', 'O', 'O']
1164         O B-Element    <ERR> ['Fe', 'Fe', 'NNP', 'O', 'O']
1165         O         O        1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1166         O         O        2 ['III', 'III', 'NNP', 'O', 'O']
1167         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1052         O         O       -3 ['have', 'have', 'VB', 'O', 'O']
1053         O         O       -2 ['a', 'a', 'DT', 'O', 'O']
1054         O         O       -1 ['high', 'high', 'JJ', 'O', 'O']
1055         O B-Element    <ERR> ['Ca', 'ca', 'NN', 'O', 'O']
1056         O         O        1 ['+', '+', 'CC', 'O', 'O']
1057         O         O        2 ['Na', 'na', 'NN', 'O', 'O']
1058         O         O        3 ['+', '+', 'CC', 'O', 'O']

1333 B-Mineral         O       -3 ['andesine', 'andesine', 'NN', 'O', 'B-Mineral']
1334         O         O       -2 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1335         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1336         O B-Element    <ERR> ['Ca', 'ca', 'NN', 'O', 'O']
1337         O         O        1 [',', ',', ',', 'O', 'O']
1338         O B-Element        2 ['Na', 'na', 'NN', 'MISC', 'O']
1339         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1335         O         O       -3 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1336         O B-Element       -2 ['Ca', 'ca', 'NN', 'O', 'O']
1337         O         O       -1 [',', ',', ',', 'O', 'O']
1338         O B-Element    <ERR> ['Na', 'na', 'NN', 'MISC', 'O']
1339         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1340         O         O        2 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1341         O B-Element        3 ['Al', 'Al', 'NNP', 'O', 'O']

1338         O B-Element       -3 ['Na', 'na', 'NN', 'MISC', 'O']
1339         O         O       -2 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1340         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1341         O B-Element    <ERR> ['Al', 'Al', 'NNP', 'O', 'O']
1342         O         O        1 [',', ',', ',', 'O', 'O']
1343         O B-Element        2 ['Si', 'Si', 'NNP', 'O', 'O']
1344         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

1340         O         O       -3 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
1341         O B-Element       -2 ['Al', 'Al', 'NNP', 'O', 'O']
1342         O         O       -1 [',', ',', ',', 'O', 'O']
1343         O B-Element    <ERR> ['Si', 'Si', 'NNP', 'O', 'O']
1344         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
1345         O         O        2 ['4O8', '4o8', 'NN', 'O', 'O']
1346         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

 280         O         O       -3 [',', ',', ',', 'O', 'O']
 281         O         O       -2 ['USA', 'USA', 'NNP', 'LOCATION', 'O']
 282         O         O       -1 [',', ',', ',', 'O', 'O']
 283         O B-Element    <ERR> ['13Oregon', '13Oregon', 'NNP', 'O', 'O']
 284         O         O        1 ['State', 'State', 'NNP', 'ORGANIZATION', 'O']
 285         O         O        2 ['University', 'University', 'NNP', 'ORGANIZATION', 'O']
 286         O         O        3 [',', ',', ',', 'O', 'O']

  83         O         O       -3 ['3CalTech', '3CalTech', 'NNP', 'O', 'O']
  84         O         O       -2 [',', ',', ',', 'O', 'O']
  85         O         O       -1 ['4Washington', '4Washington', 'NNP', 'O', 'O']
  86         O B-Element    <ERR> ['U.', 'U.', 'NNP', 'O', 'O']
  87         O         O        1 [',', ',', ',', 'O', 'O']
  88         O         O        2 ['5MSSS', '5msss', 'NN', 'O', 'O']
  89         O         O        3 [',', ',', ',', 'O', 'O']

 205         O         O       -3 ['to', 'to', 'TO', 'O', 'O']
 206         O         O       -2 ['estimate', 'estimate', 'VB', 'O', 'O']
 207         O         O       -1 ['the', 'the', 'DT', 'O', 'O']
 208         O B-Element    <ERR> ['halogen', 'halogen', 'NN', 'O', 'O']
 209         O         O        1 ['budget', 'budget', 'NN', 'O', 'O']
 210         O         O        2 ['of', 'of', 'IN', 'O', 'O']
 211         O         O        3 ['Mars', 'Mars', 'NNP', 'O', 'O']

 778         O         O       -3 ['1', '1', 'CD', 'NUMBER', 'O']
 779         O         O       -2 ['Ca', 'ca', 'NN', 'O', 'O']
 780         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 781         O B-Element    <ERR> ['F', 'f', 'NN', 'O', 'O']
 782         O         O        1 [',', ',', ',', 'O', 'O']
 783         O B-Element        2 ['Cl', 'cl', 'NN', 'O', 'O']
 784         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

 780         O         O       -3 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 781         O B-Element       -2 ['F', 'f', 'NN', 'O', 'O']
 782         O         O       -1 [',', ',', ',', 'O', 'O']
 783         O B-Element    <ERR> ['Cl', 'cl', 'NN', 'O', 'O']
 784         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
 785         O         O        2 ['2', '2', 'CD', 'NUMBER', 'O']
 786         O         O        3 ['Ca10', 'ca10', 'NN', 'O', 'O']

 789         O         O       -3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
 790         O         O       -2 ['6', '6', 'CD', 'NUMBER', 'O']
 791         O         O       -1 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 792         O B-Element    <ERR> ['F', 'f', 'NN', 'O', 'O']
 793         O         O        1 [',', ',', ',', 'O', 'O']
 794         O B-Element        2 ['Cl', 'cl', 'NN', 'O', 'O']
 795         O         O        3 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']

 791         O         O       -3 ['-LRB-', '-lrb-', '-LRB-', 'O', 'O']
 792         O B-Element       -2 ['F', 'f', 'NN', 'O', 'O']
 793         O         O       -1 [',', ',', ',', 'O', 'O']
 794         O B-Element    <ERR> ['Cl', 'cl', 'NN', 'O', 'O']
 795         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
 796         O         O        2 ['2', '2', 'CD', 'NUMBER', 'O']
 797         O         O        3 ['These', 'these', 'DT', 'O', 'O']

 866         O         O       -3 ['of', 'of', 'IN', 'O', 'O']
 867 B-Element B-Element       -2 ['F', 'f', 'NN', 'O', 'B-Element']
 868         O         O       -1 ['and', 'and', 'CC', 'O', 'O']
 869         O B-Element    <ERR> ['Cl', 'cl', 'NN', 'O', 'O']
 870         O         O        1 ['-RRB-', '-rrb-', '-RRB-', 'O', 'O']
 871         O         O        2 ['.', '.', '.', 'O', 'O']
 872         O         O        3 ['|', '|', '|', 'O', 'O']

1301         O         O       -3 ['Anderson', 'Anderson', 'NNP', 'PERSON', 'O']
1302         O         O       -2 [',', ',', ',', 'O', 'O']
1303         O         O       -1 ['R.', 'R.', 'NNP', 'O', 'O']
1304         O B-Element    <ERR> ['B.', 'B.', 'NNP', 'O', 'O']
1305         O         O        1 [',', ',', ',', 'O', 'O']
1306         O         O        2 ['et', 'et', 'FW', 'O', 'O']
1307         O         O        3 ['al.', 'al.', 'FW', 'O', 'O']

1330         O         O       -3 ['Spectrochimica', 'Spectrochimica', 'NNP', 'O', 'O']
1331         O         O       -2 ['Acta', 'Acta', 'NNP', 'O', 'O']
1332         O         O       -1 ['Part', 'Part', 'NNP', 'O', 'O']
1333         O B-Element    <ERR> ['B', 'B', 'NNP', 'O', 'O']
1334         O         O        1 [':', ':', ':', 'O', 'O']
1335         O         O        2 ['Atomic', 'atomic', 'JJ', 'O', 'O']
1336         O         O        3 ['Spectroscopy', 'spectroscopy', 'NN', 'O', 'O']

1384         O         O       -3 ['-RSB-', '-rsb-', '-RRB-', 'O', 'O']
1385         O         O       -2 ['Schr', 'Schr', 'NNP', 'O', 'O']
1386         O         O       -1 ['der', 'der', 'NNP', 'O', 'O']
1387         O B-Element    <ERR> ['S.', 'S.', 'NNP', 'O', 'O']
1388         O         O        1 [',', ',', ',', 'O', 'O']
1389         O         O        2 ['et', 'et', 'FW', 'O', 'O']
1390         O         O        3 ['al.', 'al.', 'FW', 'O', 'O']

37

Evaluate

Interpretation of matrix

Each row sum gives the total number of true labels for that class; each column sum gives the total number of predicted labels.


In [112]:
# NOTE(review): this cell unpacks the result of evaluate() into printtable(),
# which assumes an earlier evaluate() variant that returned a tuple
# (e.g. a confusion matrix plus label list). The evaluate() defined in
# cell In [33] below returns a single report string, so this cell would
# break on a fresh Restart-and-Run-All — confirm which evaluate() is intended.
print("\nTest Set")
test_corpus_file = 'mte-corpus-test.pickle'
printtable(*evaluate(tagger, test_corpus_file))


Test Set
***,B-Mineral,I-Mineral,B-Target,I-Target,B-Element,I-Element,O,total
B-Mineral,0,312,0,0,0,0,0,312
I-Mineral,0,3,0,0,0,0,0,3
B-Target,0,194,0,0,0,0,0,194
I-Target,0,20,0,0,0,0,0,20
B-Element,0,458,0,0,0,0,0,458
I-Element,0,0,0,0,0,0,0,0
Precision,0,0.0000,0,0,0,0,0,0
Recall,0.0000,1.0000,0.0000,0.0000,0.0000,0,0,0

O,0,59643,0,0,0,0,0,59643
total,0,60630,0,0,0,0,0,60630


In [ ]:


In [33]:
def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for BIO-encoded label sequences.

    Flattens the per-document sequences, binarizes both with a single
    LabelBinarizer (fit on the gold labels), and reports precision/recall/F1
    for every tag except the outside tag 'O'.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    @param y_true: list of gold label sequences (one list of tags per document)
    @param y_pred: list of predicted label sequences, aligned with y_true
    @return: formatted report string from sklearn.metrics.classification_report
    """
    binarizer = LabelBinarizer()
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    true_bin = binarizer.fit_transform(flat_true)
    pred_bin = binarizer.transform(flat_pred)

    # Drop 'O' from the report and order tags by entity type, then B/I prefix,
    # so B-X and I-X rows appear next to each other.
    tags = sorted(set(binarizer.classes_) - {'O'},
                  key=lambda tag: tag.split('-', 1)[::-1])
    index_of = {label: pos for pos, label in enumerate(binarizer.classes_)}

    return classification_report(
        true_bin,
        pred_bin,
        labels=[index_of[tag] for tag in tags],
        target_names=tags,
    )

def evaluate(tagger, corpus_file):
    """
    Tag every document in a pickled corpus and report token-level metrics.

    @param tagger: a trained pycrfsuite.Tagger (must be opened on a model)
    @param corpus_file: path to a pickled corpus — presumably a list of
        documents consumable by merge_sequences() (see the corpus-building
        cells earlier in the notebook)
    @return: report string from bio_classification_report()
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original `pickle.load(open(...))` leaked the handle).
    # NOTE: pickle.load executes arbitrary code — only load trusted corpora.
    with open(corpus_file, 'rb') as corpus_fh:
        corpus = pickle.load(corpus_fh)
    y_pred = []
    y_true = []
    for doc in corpus:
        seq = merge_sequences(doc)
        y_true.append(seq2labels(seq))
        y_pred.append(tagger.tag(seq2features(seq)))
    return bio_classification_report(y_true, y_pred)


# Evaluate the trained CRF tagger on the held-out dev and test corpora
# (pickled earlier in the notebook) and print the per-tag reports.
dev_corpus_file = 'mte-corpus-dev.pickle'
test_corpus_file = 'mte-corpus-test.pickle'
print("Development")
print(evaluate(tagger, dev_corpus_file))

print("Testing")
print(evaluate(tagger, test_corpus_file))


Development
             precision    recall  f1-score   support

  B-Element       0.85      0.80      0.82       375
  B-Mineral       0.80      0.46      0.59       240
   B-Target       0.85      0.16      0.26       147
   I-Target       0.83      0.36      0.50        14

avg / total       0.99      0.99      0.99     34970

Testing
             precision    recall  f1-score   support

  B-Element       0.87      0.86      0.86       458
  B-Mineral       0.76      0.54      0.64       312
  I-Mineral       0.00      0.00      0.00         3
   B-Target       0.93      0.21      0.34       194
   I-Target       0.78      0.35      0.48        20

avg / total       0.84      0.62      0.68       987

/usr/local/lib/python3.5/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Learning: State Transitions


In [221]:
from collections import Counter
# Pull the learned model parameters (transition weights and state-feature
# weights) from the trained tagger for inspection in the cells below.
info = tagger.info()

def print_transitions(trans_features):
    """
    Pretty-print CRF label-transition weights.

    @param trans_features: iterable of ((from_label, to_label), weight) pairs,
        e.g. items from Counter(info.transitions).most_common()
    """
    for labels, weight in trans_features:
        src, dst = labels
        print("%-6s -> %-7s %0.6f" % (src, dst, weight))

# Show the strongest and weakest learned label-to-label transitions:
# most_common() sorts by weight, so the head is the most likely transitions
# and the tail the most unlikely ones.
print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])


Top likely transitions:
B-Target -> B-Target 2.386108
B-Target -> I-Target 2.208480
I-Mineral -> I-Mineral 2.114082
I-Target -> B-Target 2.111899
B-Mineral -> I-Mineral 1.460643
O      -> O       1.213288
B-Element -> I-Element 0.872641
I-Target -> I-Target 0.816825
B-Mineral -> B-Mineral 0.156233
O      -> B-Mineral 0.151186
B-Mineral -> O       0.115508
I-Target -> O       0.043102
O      -> B-Target 0.039404
I-Mineral -> O       0.035210
B-Element -> O       -0.018350

Top unlikely transitions:
B-Element -> I-Mineral -0.294718
B-Target -> B-Mineral -0.296192
I-Element -> O       -0.393103
I-Target -> B-Element -0.516234
B-Element -> B-Element -0.554523
B-Mineral -> B-Element -0.687627
B-Target -> O       -0.748083
B-Mineral -> B-Target -0.835439
B-Mineral -> I-Target -0.917494
B-Element -> B-Mineral -1.130964
B-Target -> B-Element -1.475913
O      -> I-Element -2.597084
B-Element -> I-Target -4.038288
O      -> I-Mineral -7.106403
O      -> I-Target -8.379464

Learning: State Features


In [222]:
def print_state_features(state_features):
    """
    Pretty-print CRF state-feature weights, one '<weight> <label> <attribute>'
    row per line.

    @param state_features: iterable of ((attribute, label), weight) pairs,
        e.g. items from Counter(info.state_features).most_common()
    """
    for feature, weight in state_features:
        attr, label = feature
        print("%0.6f %-6s %s" % (weight, label, attr))

# Show the highest- and lowest-weighted state features: the head of
# most_common() is the most positive evidence for a label, the tail the
# most negative.
print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])


Top positive:
8.058676 O      0word[-2:]=be
6.673286 B-Mineral 0word.shape=ccvcvcccvcvcv
6.495373 O      0word.genpos=DT
6.414756 B-Element 0word[-2:]=Ho
6.412245 B-Element 0word[-2:]=Sn
6.403895 O      0word[-2:]=We
5.984024 O      0word[-1:]=A
5.793493 B-Target 0word[-2:]=CB
5.733161 B-Element 0word[-2:]=Dy
5.418787 B-Target 0word[-2:]=JK
5.315773 B-Element 0word[-2:]=Ag
5.286066 O      0word.shape=vcvvcvccvvc
5.140279 O      0word[-1:]=Q
5.037365 O      0word[-2:]=Ma
5.005682 B-Element 0word[-2:]=Eu
4.958816 B-Mineral 0word.shape=vcvcvcv
4.941674 O      0word.shape=ccvcvcvcvccvcv
4.894968 O      0word[-1:]=T
4.866134 O      0word[-1:]=G
4.829303 O      -1word.shape=dddd.ccc

Top negative:
-2.700686 O      0word.shape=v
-2.731780 O      0word[-3:]=aat
-2.736311 O      0word.shape=cvccvcvc
-2.753581 B-Element 0word[-2:]=ed
-2.758393 O      0word[-3:]=fur
-2.770790 B-Element 0word.ner=LOCATION
-2.814486 O      0word[-3:]=gen
-2.876500 I-Target 0word.islower=True
-3.049026 O      0word.shape=cccccvcvcvcvcv
-3.206710 O      0word.shape=cvccccvcc
-3.278055 O      0word[-2:]=Mo
-3.305678 O      0word[-2:]=La
-3.319855 O      0word[-3:]=bon
-3.344219 B-Target -1word.ner=PERSON
-3.418571 O      0word[-2:]=Lu
-3.455216 O      0word.shape=ccvcvvccvcv
-3.716405 O      0word[-3:]=kel
-3.971070 O      0word.shape=cccccvcvcvcvcvc
-5.492894 O      0word[-3:]=for
-5.822459 O      0word.shape=cccvcvcv

In [ ]:


In [213]:
arr = ['a', 'b', 'c']
a, b, c =  *arr
print(a,b,c)


  File "<ipython-input-213-64e54bd3b239>", line 2
    a, b, c =  *arr
              ^
SyntaxError: can't use starred expression here

In [197]:
s = "hellow 124.45 -65.7623"
get_wordshape_sound_case("hellow 124.45 -65.7623")
#get_wordshape_sound(s)


Out[197]:
'cvcvc N N'

In [157]:
"abcd"[:2]


Out[157]:
'ab'

In [ ]: