In this notebook I'll follow the example presented in *Named Entities and Random Fields* to train a conditional random field to recognize named entities in Twitter data. The data and some of the code below are taken from a programming assignment in the excellent Natural Language Processing class offered on Coursera. In that assignment we were shown how to build a named entity recognizer using deep learning with a bidirectional LSTM, which is a fairly complicated approach, and I wanted a baseline model to see what sort of accuracy should be expected on this data.
In [1]:
def read_data(file_path):
    """Read a CoNLL-style token/tag file into parallel per-tweet lists.

    Each non-blank line holds one whitespace-separated "token tag" pair;
    blank lines separate tweets.  URLs are normalized to the <URL> token
    and @-mentions to the <USR> token so the model does not memorize
    individual links or handles.

    Returns a pair (tokens, tags) where tokens[i] and tags[i] are the
    parallel token and tag lists for the i-th tweet.
    """
    tokens = []
    tags = []
    tweet_tokens = []
    tweet_tags = []
    # 'with' guarantees the file handle is closed (the original left it open).
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line ends a tweet: flush what we collected, if anything.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                    tweet_tokens = []
                    tweet_tags = []
            else:
                token, tag = line.split()
                # Replace all urls with <URL> token
                # Replace all users with <USR> token
                if token.startswith("http://") or token.startswith("https://"):
                    token = "<URL>"
                elif token.startswith("@"):
                    token = "<USR>"
                tweet_tokens.append(token)
                tweet_tags.append(tag)
    # Flush the final tweet when the file does not end with a blank line
    # (the original silently dropped it).
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)
    return tokens, tags
# Load the three splits; each call returns (token lists, tag lists),
# one inner list per tweet.
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')
The CRF model uses part of speech tags as features so we'll need to add those to the datasets.
In [2]:
%%time
import nltk
def build_sentence(tokens, tags):
    """Attach NLTK POS tags to one tweet, producing (token, pos, tag) triples."""
    pos_tags = [pos for _, pos in nltk.pos_tag(tokens)]
    return list(zip(tokens, pos_tags, tags))
def build_sentences(tokens_set, tags_set):
    """POS-tag every tweet in a split; returns one triple list per tweet."""
    return [build_sentence(toks, tgs) for toks, tgs in zip(tokens_set, tags_set)]
# Add POS tags to every split; each sentence becomes a list of
# (token, pos_tag, ner_tag) triples.
train_sents = build_sentences(train_tokens, train_tags)
validation_sents = build_sentences(validation_tokens, validation_tags)
test_sents = build_sentences(test_tokens, test_tags)
In [22]:
def word2features(sent, i):
    """Build the CRF feature dict for position i of a sentence.

    sent is a list of (token, pos_tag, label) triples; only the token and
    POS tag are consulted.  Features describe the current token plus a
    one-token window on each side, with BOS/EOS flags at the boundaries.
    """
    token, pos = sent[i][0], sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }
    if i == 0:
        # No left neighbor: mark beginning of sentence.
        features['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        features.update({
            '-1:word.lower()': prev_token.lower(),
            '-1:word.istitle()': prev_token.istitle(),
            '-1:word.isupper()': prev_token.isupper(),
            '-1:postag': prev_pos,
            '-1:postag[:2]': prev_pos[:2],
        })
    if i == len(sent) - 1:
        # No right neighbor: mark end of sentence.
        features['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        features.update({
            '+1:word.lower()': next_token.lower(),
            '+1:word.istitle()': next_token.istitle(),
            '+1:word.isupper()': next_token.isupper(),
            '+1:postag': next_pos,
            '+1:postag[:2]': next_pos[:2],
        })
    return features
def sent2features(sent):
    """Featurize every position of a sentence for the CRF."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]
def sent2labels(sent):
    """Extract the gold NER labels from a (token, pos, label) sentence."""
    return [triple[2] for triple in sent]
def sent2tokens(sent):
    """Extract the raw tokens from a (token, pos, label) sentence."""
    return [triple[0] for triple in sent]
# Featurize each split for sklearn-crfsuite: X is a list (per tweet) of
# lists of feature dicts, y is the parallel list of label sequences.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_validation = [sent2features(s) for s in validation_sents]
y_validation = [sent2labels(s) for s in validation_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
In [4]:
import sklearn_crfsuite
In [5]:
# L-BFGS training with elastic-net regularization: c1 weighs the L1 term,
# c2 the L2 term.  all_possible_transitions=True also learns weights for
# label transitions never observed in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.12,
    c2=0.01,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
Out[5]:
In [20]:
from evaluation import precision_recall_f1
def eval_conll(model, tokens, tags, short_report=True):
    """Computes NER quality measures using CONLL shared task script."""
    predictions = model.predict(tokens)
    # Flatten per-tweet label sequences into single token-level lists.
    flat_true = [label for sentence in tags for label in sentence]
    flat_pred = [label for sentence in predictions for label in sentence]
    return precision_recall_f1(flat_true, flat_pred,
                               print_results=True, short_report=short_report)
In [23]:
# Report quality on all three splits; a large gap between train and
# validation/test scores would indicate overfitting.
print('-' * 20 + ' Train set quality: ' + '-' * 20)
train_results = eval_conll(crf, X_train, y_train, short_report=False)
print('-' * 20 + ' Validation set quality: ' + '-' * 20)
validation_results = eval_conll(crf, X_validation, y_validation, short_report=False)
print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(crf, X_test, y_test, short_report=False)
I tried tuning the regularization parameters c1 and c2 of the model using randomized grid search, but was not able to improve the results that way. I plan to try Bayesian optimization with GPyOpt to see if it does better, but don't have time to do so here.
In [ ]: