In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pandas as pd
import numpy as np
In [12]:
import os
def read_raw_input(filename):
    """Read a train/test file and return the contents as a list of list of lists.

    The innermost list is one record: the whitespace-separated fields of a
    single line of the file (one line per word).
    The middle-level list contains all the records in one sentence.

    Sentence boundaries come from the file layout: a blank line or a
    ``-DOCSTART-`` marker line closes the sentence currently being
    collected.  Guessing the boundary from a ``. .`` token line instead would
    merge every sentence that does not end with a period (headlines, bylines,
    datelines) into the following one.

    Empty sentences are never emitted, so an empty file yields ``[]``.

    :param filename: name of the file inside the ``CoNLL-2003`` directory.
    :return: list of sentences, each a list of per-word field lists.
    :raises OSError: if the file cannot be opened.
    """
    raw_file = os.path.join('CoNLL-2003', filename)
    all_items = []
    current_item = []
    with open(raw_file) as fh:
        for line in fh:
            tags = line.split()
            if not tags or tags[0] == '-DOCSTART-':
                # Separator line: flush the sentence collected so far, if any.
                if current_item:
                    all_items.append(current_item)
                    current_item = []
                continue
            current_item.append(tags)
    # The last sentence is not necessarily followed by a blank line.
    if current_item:
        all_items.append(current_item)
    return all_items
In [73]:
%%time
# Parse the two dataset splits. After this cell each name holds the raw
# list-of-lists structure returned by read_raw_input (no DataFrames yet).
train_sents = read_raw_input('eng.train')
# 'eng.testA' is the split that is later turned into X_test / y_test.
test_sents = read_raw_input('eng.testA')
In [74]:
from IPython.display import display
# Sanity check: show the first parsed training sentence in its raw list form.
display(train_sents[0])
In [75]:
# Same sanity check for the first parsed sentence of the test split.
display(test_sents[0])
In [76]:
def make_df_for_sent(single_sent):
    """Build one DataFrame from the per-word records of a single sentence.

    Each record becomes a row with the columns ``word``, ``pos``, ``parse``
    and ``ner``; the row index is named ``word_seq_num``.
    """
    column_names = ['word', 'pos', 'parse', 'ner']
    sentence_frame = pd.DataFrame(single_sent, columns=column_names)
    return sentence_frame.rename_axis('word_seq_num')
def all_sentences(sents):
    """Turn every sentence (a list of records) into its own DataFrame.

    Returns a list with one DataFrame per input sentence, in input order.
    """
    frames = []
    for sentence_records in sents:
        frames.append(make_df_for_sent(sentence_records))
    return frames
In [77]:
%%time
# NOTE: both names are rebound here from the raw list-of-lists form to a
# list of DataFrames (one per sentence); the raw lists are no longer
# reachable under these names once this cell has run.
train_sents = all_sentences(train_sents)
test_sents = all_sentences(test_sents)
In [78]:
# Show the first training sentence again, now as a DataFrame.
train_sents[0]
Out[78]:
In [89]:
def get_labels(all_sents):
    """Collect the ``ner`` column of every sentence DataFrame.

    Returns a list with one entry per sentence; each entry is the list of
    labels of that sentence's words, in row order.
    """
    return [sentence_df['ner'].tolist() for sentence_df in all_sents]
def word2features(i, single_sent_df):
    """Build the feature dictionary for one word of a sentence.

    ``i`` is the row position of the word inside ``single_sent_df``, a
    DataFrame with (at least) the columns ``word`` and ``pos``.

    The result always describes the word itself; it additionally describes
    the previous word (keys prefixed ``-1:``) or carries ``BOS`` when there
    is none, and describes the next word (keys prefixed ``+1:``) or carries
    ``EOS`` when there is none.
    """
    current_row = single_sent_df.iloc[i]
    word, postag = current_row['word'], current_row['pos']

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_row = single_sent_df.iloc[i - 1]
        prev_word, prev_postag = prev_row['word'], prev_row['pos']
        features['-1:word.lower()'] = prev_word.lower()
        features['-1:word.istitle()'] = prev_word.istitle()
        features['-1:word.isupper()'] = prev_word.isupper()
        features['-1:postag'] = prev_postag
        features['-1:postag[:2]'] = prev_postag[:2]

    # Right context, or the end-of-sentence marker.
    last_position = single_sent_df.shape[0] - 1
    if i >= last_position:
        features['EOS'] = True
    else:
        next_row = single_sent_df.iloc[i + 1]
        next_word, next_postag = next_row['word'], next_row['pos']
        features['+1:word.lower()'] = next_word.lower()
        features['+1:word.istitle()'] = next_word.istitle()
        features['+1:word.isupper()'] = next_word.isupper()
        features['+1:postag'] = next_postag
        features['+1:postag[:2]'] = next_postag[:2]

    return features
def sent2features(s_df):
    """
    Return the feature values extracted from a single sentence.

    The result is a list with one ``word2features`` dictionary per row of
    ``s_df``, in row order.

    Rows are addressed by *position* (``0 .. len(s_df) - 1``) because
    ``word2features`` looks words up with ``iloc``.  Passing index labels
    instead is only correct while the DataFrame carries the default 0..n-1
    index, and mapping over the index also builds a throwaway Index of dicts.
    """
    return [word2features(position, s_df) for position in range(len(s_df))]
def get_feature_values(all_sents):
    """Extract the per-word features of every sentence in a dataset.

    Returns a list with one entry per sentence, each being whatever
    ``sent2features`` produces for that sentence.
    """
    collected = []
    for sentence_df in all_sents:
        collected.append(sent2features(sentence_df))
    return collected
In [94]:
%%time
# Per-sentence lists of per-word feature dicts for the training split.
X_train = get_feature_values(train_sents)
In [93]:
%%time
# Same feature extraction for the test split.
X_test = get_feature_values(test_sents)
In [92]:
%%time
# Per-sentence label lists (the 'ner' column) for both splits.
y_train, y_test = get_labels(train_sents), get_labels(test_sents)
In [95]:
%%time
# Configure and train the CRF tagger on the training features/labels.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation
# coefficients of the L-BFGS trainer -- confirm against the sklearn-crfsuite
# documentation before tuning them.
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=200,
verbose=False,
all_possible_transitions=True
)
# Last expression of the cell, so the fitted estimator is also displayed.
crf.fit(X_train, y_train)
In [96]:
# Labels known to the fitted model, minus 'O'; this list is what the metric
# cells below pass as ``labels=``.
labels = list(crf.classes_)
# list.remove raises ValueError if 'O' is not among the model's classes.
labels.remove('O')
labels
Out[96]:
In [97]:
# Tag the test split and show the weighted F1 restricted to ``labels``
# (i.e. every class except 'O').
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
Out[97]:
In [98]:
# Sort by everything after the first character, then by that first
# character, so the B-/I- variants of one entity type end up next to each other.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)
In [ ]: