Named Entity Recognizer - CONLL 2003 English

Use the CRF Implementation of sklearn-crfsuite to build a NER for the CONLL-2003 English Task


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import pandas as pd 
import numpy as np

In [12]:
import os

def read_raw_input(filename):
    """Read a train/test file and return the contents as a list of list of lists. 
    
    The innermost list is a record of 4 items, one per word.
    The middle-level list contains all the records in one sentence.
    """
    raw_train_file = os.path.join('CoNLL-2003', filename)

    all_items = []

    with open(raw_train_file) as fh:
        current_item = []
        all_items.append(current_item)

        for line in fh:
            tags = line.strip().split()
            if len(tags) == 0 or tags[0] == '-DOCSTART-':
                continue
            current_item.append(tags)
            if tags[0] == '.' and tags[1] == '.':
                current_item = []
                all_items.append(current_item)
                
    return all_items

In [73]:
%%time
train_sents = read_raw_input('eng.train')
test_sents = read_raw_input('eng.testA')


Wall time: 991 ms

In [74]:
from IPython.display import display
display(train_sents[0])


[['EU', 'NNP', 'I-NP', 'I-ORG'],
 ['rejects', 'VBZ', 'I-VP', 'O'],
 ['German', 'JJ', 'I-NP', 'I-MISC'],
 ['call', 'NN', 'I-NP', 'O'],
 ['to', 'TO', 'I-VP', 'O'],
 ['boycott', 'VB', 'I-VP', 'O'],
 ['British', 'JJ', 'I-NP', 'I-MISC'],
 ['lamb', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

In [75]:
display(test_sents[0])


[['CRICKET', 'NNP', 'I-NP', 'O'],
 ['-', ':', 'O', 'O'],
 ['LEICESTERSHIRE', 'NNP', 'I-NP', 'I-ORG'],
 ['TAKE', 'NNP', 'I-NP', 'O'],
 ['OVER', 'IN', 'I-PP', 'O'],
 ['AT', 'NNP', 'I-NP', 'O'],
 ['TOP', 'NNP', 'I-NP', 'O'],
 ['AFTER', 'NNP', 'I-NP', 'O'],
 ['INNINGS', 'NNP', 'I-NP', 'O'],
 ['VICTORY', 'NN', 'I-NP', 'O'],
 ['.', '.', 'O', 'O']]

In [76]:
def make_df_for_sent(single_sent):
    """Make a DataFrame out of all the records for a single sentence."""
    df = pd.DataFrame(data=single_sent, columns=['word', 'pos', 'parse', 'ner'])
    df.index.name = 'word_seq_num'
    return df
    
def all_sentences(sents):
    """Convert the list of list of lists to a list of DataFrames."""
    total_df = [make_df_for_sent(s) for s in sents]
    return total_df

In [77]:
%%time
train_sents = all_sentences(train_sents)
test_sents  = all_sentences(test_sents)


Wall time: 8.68 s

In [78]:
train_sents[0]


Out[78]:
word pos parse ner
word_seq_num
0 EU NNP I-NP I-ORG
1 rejects VBZ I-VP O
2 German JJ I-NP I-MISC
3 call NN I-NP O
4 to TO I-VP O
5 boycott VB I-VP O
6 British JJ I-NP I-MISC
7 lamb NN I-NP O
8 . . O O

In [89]:
def get_labels(all_sents):
    """Return the labels for all the words in a collection of sentences."""
    all_labels = []
    
    for s_df in all_sents:
        labels = s_df.loc[:, 'ner'].tolist()
        all_labels.append(labels)
        
    return all_labels 


def word2features(i, single_sent_df):
    """
    Return a dictionary of feature names and values for the word at ``word_idx`` 
    in a single sentence represented as a ``DataFrame``."""
    
    word, postag = single_sent_df.iloc[i].loc[['word', 'pos']]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1, postag1 = single_sent_df.iloc[i-1].loc[['word', 'pos']]
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < (single_sent_df.shape[0] - 1):
        word1, postag1 = single_sent_df.iloc[i+1].loc[['word', 'pos']]
        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(s_df):
    """
    Return the feature values extracted from a single sentence.
    """    
    features = s_df.index.map(lambda word_idx: word2features(word_idx, s_df))
    return features.tolist()

def get_feature_values(all_sents):
    """Get the feature values for all the sentences in train/test dataset."""
    
    all_features = [sent2features(s) for s in all_sents]    
    return all_features

In [94]:
%%time
X_train = get_feature_values(train_sents)


Wall time: 7min 52s

In [93]:
%%time
X_test = get_feature_values(test_sents)


Wall time: 2min 3s

In [92]:
%%time
y_train, y_test = get_labels(train_sents), get_labels(test_sents)


Wall time: 938 ms

In [95]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    verbose=False,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)


Wall time: 1min 20s

In [96]:
labels = list(crf.classes_)
labels.remove('O')
labels


Out[96]:
['I-ORG', 'I-MISC', 'I-PER', 'I-LOC', 'B-LOC', 'B-MISC', 'B-ORG']

In [97]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)


d:\Anaconda3\envs\crf\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
d:\Anaconda3\envs\crf\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
Out[97]:
0.8848009691129132

In [98]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
report = metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3)
print(report)


d:\Anaconda3\envs\crf\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
d:\Anaconda3\envs\crf\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
             precision    recall  f1-score   support

      B-LOC      0.000     0.000     0.000         0
      I-LOC      0.910     0.862     0.885      2094
     B-MISC      0.500     0.500     0.500         4
     I-MISC      0.896     0.813     0.853      1264
      B-ORG      0.000     0.000     0.000         0
      I-ORG      0.864     0.825     0.844      2092
      I-PER      0.926     0.924     0.925      3149

avg / total      0.902     0.868     0.885      8603


In [ ]: