In [ ]:
# Notebook setup (IPython magics; keep comments on their own lines so they are
# not passed to the magic as arguments).
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [ ]:
from lxmls import DATA_PATH
import lxmls
import lxmls.sequences.crf_online as crfo
import lxmls.readers.pos_corpus as pcc
import lxmls.sequences.id_feature as idfc
import lxmls.sequences.extended_feature as exfc
from lxmls.readers import pos_corpus
In [ ]:
# Load the three CoNLL splits through a single PostagCorpus instance, so that
# they all share corpus.word_dict / corpus.tag_dict (used later by the CRF).
# Every split is read with the same caps: max_sent_len=10, max_nr_sent=1000.
# Splits are read in the order train, test, dev; keep this order in case the
# reader grows the shared dictionaries as it reads (not visible from here).
corpus = pcc.PostagCorpus()
train_seq, test_seq, dev_seq = (
    corpus.read_sequence_list_conll(
        DATA_PATH + conll_file, max_sent_len=10, max_nr_sent=1000
    )
    for conll_file in ("/train-02-21.conll", "/test-23.conll", "/dev-22.conll")
)
In [ ]:
# Create the extended feature mapper over the training sequences and build its
# features; the mapper is handed to the CRF in the next cell.
# NOTE(review): build_features() is called for its effect on the mapper; its
# return value (if any) is not used here.
feature_mapper = exfc.ExtendedFeatures(train_seq)
feature_mapper.build_features()
In [ ]:
# Build the online CRF from the corpus dictionaries and the extended feature
# mapper, then train it on the training split.
crf_online = crfo.CRFOnline(corpus.word_dict, corpus.tag_dict, feature_mapper)
# Set before training; presumably train_supervised() reads this attribute to
# decide how many passes to run -- TODO confirm against CRFOnline.
crf_online.num_epochs = 20
crf_online.train_supervised(train_seq)
In [ ]:
# Viterbi-decode every split with the trained CRF (train, dev, test order).
pred_train, pred_dev, pred_test = (
    crf_online.viterbi_decode_corpus(split)
    for split in (train_seq, dev_seq, test_seq)
)

# Score each set of predictions against its own gold sequences; the results
# are formatted as accuracies in the report cell.
eval_train, eval_dev, eval_test = (
    crf_online.evaluate_corpus(gold, predicted)
    for gold, predicted in (
        (train_seq, pred_train),
        (dev_seq, pred_dev),
        (test_seq, pred_test),
    )
)
In [ ]:
# Report the accuracy on each split, to three decimals, on a single line.
print(
    "CRF - Extended Features Accuracy Train: %.3f Dev: %.3f Test: %.3f"
    % (eval_train, eval_dev, eval_test)
)
Compare the errors obtained with the two different feature sets.
Do some error analysis: what errors were corrected by using more features?
Can you think of other features to use to solve the errors found?
In [ ]: