In [2]:
%cd '/Users/max/Projects/Coreference/'
In [3]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
#%load_ext autoreload
#%autoreload 2
# Path to the CoNLL reference coreference scorer used by the evaluation code.
scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'
# Shared seed for all stochastic sklearn estimators in this notebook.
random_state = 42
In [4]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import export_graphviz
In [5]:
rucoref_train = rueval.RuCorefCorpus(multeast, rueval)
rucoref_test = rueval.RuCorefCorpus(multeast, rueval)
In [6]:
exp_utils.load_corpus(rucoref_train, 'Corpus-2015/Tokens.train.fixmorph.txt.parsed', 'Corpus-2015/Groups.train.txt')
In [7]:
exp_utils.load_corpus(rucoref_test, 'Corpus-2015/Tokens.test.fixmorph.txt.parsed', 'Corpus-2015/Groups.test.txt')
In [8]:
# A group is a mention candidate if it is a noun group, or a pronoun group
# whose head lemma is one of the coreference-capable pronouns in the tagset.
group_ok = lambda g: g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in multeast.coref_pronouns)
In [9]:
# TEST-corpus mention sets:
#  - gold-standard mentions and their group ids,
#  - predicted mentions (system boundaries),
#  - predicted mentions restricted to gold boundaries.
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
gs_groups = gs_mentions
pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups
pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
# NOTE(review): pred_groups (above) and pred_groups_gold_bound both read
# rucoref_test.groups; correctness assumes each get_pred_* call rebinds
# corpus.groups rather than mutating the same list in place -- verify.
pred_groups_gold_bound = rucoref_test.groups
In [10]:
# Same mention sets for the TRAIN corpus.
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
gs_groups_train = gs_mentions_train
pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups
# BUGFIX: the ids used to be stored in `pred_gold_bounds_ids`, silently
# clobbering the TEST-corpus variable of the same name from the previous cell.
pred_mentions_gold_bound_train, pred_gold_bounds_ids_train = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
pred_groups_gold_bound_train = rucoref_train.groups
In [11]:
class MLMentionPairClassifier(mentionpair.MentionPairClassifier):
    """Mention-pair coreference classifier trained on gold chains.

    Training examples follow the Soon et al. (2001) sampling scheme:
    positives are adjacent mention pairs within each gold chain; negatives
    pair every automatically extracted mention lying between the two
    members of a positive pair with that pair's anaphor.
    """
    NEEDS_TRAINING = True

    def __init__(self, scorer_path=None):
        # Path to the CoNLL reference scorer (scorer.pl); used by .score().
        self.scorer_path = scorer_path

    def train(self, clf, corpus, mentions):
        """Build the training set from the gold chains of `corpus` and fit `clf`.

        `mentions` is unused here but kept for interface compatibility with
        the other classifiers in the experiment framework.
        """
        self.data_x = []
        self.data_y = []
        self.appositives = []
        self.tagset = corpus.tagset
        for i, text in enumerate(corpus.texts):
            all_mentions = utils.find_mentions(corpus.groups[i], corpus.tagset)
            gs = corpus.gs[i]
            words_index = corpus.words_index[i]
            for chain_id in gs['chains']:
                chain = gs['chains'][chain_id]
                # Adjacent pairs inside a chain are the positive examples.
                # (Generator expression: its loop variable does not leak,
                # so the outer text index `i` is preserved.)
                for pair in ((chain[i], chain[i + 1]) for i in range(len(chain) - 1)):
                    text_groups = []
                    for pair_elem in pair:
                        gs_group = gs['groups'][pair_elem]
                        words = [text[words_index[shift]] for shift in gs_group['tokens_shifts']]
                        head = text[words_index[gs_group['head_shift'][0]]]
                        text_groups.append(coref_utils.create_gs_group(gs_group, words, head))
                    self.data_x.append(self.get_feature_vector(corpus.texts[i],
                                                               corpus.parses[i] if corpus.parses else None,
                                                               *text_groups))
                    self.data_y.append(True)
                    # Locate both pair members among the automatically extracted
                    # mentions so negatives can be sampled between them.
                    neg_first = None
                    neg_last = None
                    for i_mention, mention in enumerate(all_mentions):
                        if mention.offset == text_groups[0].offset:
                            neg_first = i_mention
                        if mention.offset == text_groups[1].offset:
                            neg_last = i_mention
                        # BUGFIX: compare against None explicitly; the former
                        # truthiness test treated a mention found at index 0
                        # as "not found" and skipped its negative examples.
                        if neg_first is not None and neg_last is not None:
                            break
                    if neg_first is None or neg_last is None:
                        continue
                    neg_text_groups = all_mentions[neg_first + 1:neg_last]
                    # Each mention between the pair, combined with the anaphor,
                    # yields a negative example.
                    for neg_group in neg_text_groups:
                        self.data_x.append(self.get_feature_vector(corpus.texts[i],
                                                                   corpus.parses[i] if corpus.parses else None,
                                                                   neg_group, text_groups[1]))
                        self.data_y.append(False)
        self.clf = clf
        self.clf.fit(self.data_x, self.data_y)

    def pair_coreferent(self, pair, groups, words, parse):
        """Predict whether `pair` (antecedent, anaphor) is coreferent."""
        vctr = self.get_feature_vector(words, parse, *pair)
        return self.clf.predict([vctr])[0]

    def get_feature_vector(self, words, parse, group_1, group_2):
        """Base implementation produces no features; subclasses override.

        group_1 -- possible antecedent, group_2 -- anaphor.
        """
        vctr = []
        feat_names = []
        self.feat_names = feat_names
        return vctr
In [83]:
import re
class MLMentionPairMoreFeatures(MLMentionPairClassifier):
NEEDS_TRAINING = True
def __init__(self, scorer_path, feat_zones=None):
create_pro_rx = lambda strings: re.compile(ur'(^|\b){}\b'.format(u'|'.join(strings)))
self.scorer_path = scorer_path
self.feat_zones = feat_zones if feat_zones else tuple()
self.modif = set()
self.relatives = []
self.deictic_pronouns = []
self.str_match = []
self.modif_pairs = []
self.abbrs = []
self.synt_roles = []
self.rx_lat = re.compile('[A-Za-z]')
self.rx_endings = re.compile(u'(?<=[А-ЯЁа-яё])(а|ы|ой|е)(?= |$)')
self.rx_pro_deictic = create_pro_rx((u'я', u'ты', u'мы', u'вы'))
self.rx_pro_personal = create_pro_rx((u'мой', u'твой', u'наш', u'ваш'))
self.rx_pro_reflexive = create_pro_rx((u"свое",
u"своё", u"своего", u"своей", u"своем", u"своём", u"своему", u"своею", u"свой", u"свои",
u"своим", u"своими", u"своих", u"свою", u"своя", u"себе", u"себя", u"собой", u"собою"))
self.rx_pro_possessive = create_pro_rx((u"его", u"ее", u"её", u"ей", u"ему", u"ею", u"им", u"ими", u"их"))
def get_feature_vector(self, words, parse, group_1, group_2):
# group_1 — possible antecedent
# group_2 — anaphor
head_1 = group_1.words[group_1.head] if group_1.type != 'word' else group_1
head_2 = group_2.words[group_2.head] if group_2.type != 'word' else group_2
is_appo = False
is_pronoun = lambda w: len(w.lemma) == 1 and w.lemma[0] in self.tagset.coref_pronouns
is_deictic_pronoun = lambda w: is_pronoun and self.tagset.extract_feature('person', w) in ('1', '2')
is_proper = lambda w: self.tagset.extract_feature('proper', w) == 'p'
group_1_proper = is_proper(group_1)
group_2_proper = is_proper(group_2)
is_pronoun_1 = is_pronoun(group_1)
is_pronoun_2 = is_pronoun(group_2)
number_agrees = lambda p: same_grammemmes('number', p, self.tagset)
gender_agrees = lambda p: same_grammemmes('gender', p, self.tagset)
animacity_agrees = lambda p: same_grammemmes('animate', p, self.tagset)
person_1 = self.tagset.extract_feature('person', group_1)
person_2 = self.tagset.extract_feature('person', group_2)
pronoun_persons = {u'мой': '1', u'наш': '1', u'я': '1', u'мы': '1', u'твой': 2, u'ваш': '2', u'ты': '2', u'вы': '2'}
is_demonstrative = lambda w: [tag.startswith('Pd') or w.lemma[i] == u'этот' for i, tag in enumerate(w.tags)]
demonstr_1 = is_demonstrative(group_1) if len(group_1.lemma) > 1 else [0]
demonstr_2 = is_demonstrative(group_2) if len(group_2.lemma) > 1 else [0]
filtered_lemmas_1 = [lemma for (i, lemma) in enumerate(group_1.lemma) if not demonstr_1[i]]
filtered_lemmas_2 = [lemma for (i, lemma) in enumerate(group_2.lemma) if not demonstr_2[i]]
filtered_lemma_1 = ' '.join(filtered_lemmas_1)
filtered_lemma_2 = ' '.join(filtered_lemmas_2)
filtered_wforms_1 = [wf for (i, wf) in enumerate(group_1.wordform) if not demonstr_1[i]]
filtered_wforms_2 = [wf for (i, wf) in enumerate(group_2.wordform) if not demonstr_2[i]]
filtered_wf_1 = ' '.join(filtered_wforms_1)
filtered_wf_2 = ' '.join(filtered_wforms_2)
modifiers_1 = [group_1.lemma[i] for i in range(len(group_1.tags)) if i != group_1.head and group_1.tags[i][0] == 'N'
and len(lemma) > 1
and self.tagset.extract_feature('case', group_1) == self.tagset.extract_feature('case', group_1.words[i])
and self.tagset.extract_feature('case', group_1.words[i]) != 'g'
]
modifiers_2 = [group_2.lemma[i] for i in range(len(group_2.tags)) if i != group_2.head and group_2.tags[i][0] == 'N'
and self.tagset.extract_feature('case', group_2) == self.tagset.extract_feature('case', group_2.words[i])
and self.tagset.extract_feature('case', group_2.words[i]) != 'g'
]
if filtered_lemma_1 in pronoun_persons:
person_1 = pronoun_persons[filtered_lemma_1]
if filtered_lemma_2 in pronoun_persons:
person_2 = pronoun_persons[filtered_lemma_2]
person_agr = person_1 == person_2 if person_1 in {'1','2'} or person_2 in {'1', '2'} else -1
n_sentences = -1
self.modif.update(modifiers_1)
self.modif.update(modifiers_2)
j = i = 0
if not head_1 in words or not head_2 in words:
n_sentences = -1
n_nouns = -1
dist_words = -1
print 'no alignment found'
else:
i = words.index(head_1)
j = words.index(head_2)
gr1_end = i + len(group_1.lemma) - group_1.head - 1
gr2_start = j - group_2.head
between_groups = words[gr1_end+1:gr2_start]
n_sentences = sum(1 for gr in between_groups if gr.tag == 'SENT')
n_nouns = sum(1 for gr in between_groups if gr.tag.startswith('N'))
dist_words = j - i
if gr2_start - gr1_end == 2 and words[gr1_end+1].tag.startswith(',') \
and same_grammemmes('case', (group_1, group_2), self.tagset) \
and same_grammemmes('number', (group_1, group_2), self.tagset) \
and same_grammemmes('gender', (group_1, group_2), self.tagset) \
and group_1.tag.startswith('N') and group_2.tag.startswith('N'):
is_appo = True
self.appositives.append((group_1, group_2, i, j))
# Capital letter heuristic
#if i > 0:
# group_1_proper |= any(w[0].isupper() for w in group_1.wordform) and words[i-1].tag != 'SENT'
#if j > 0:
# group_2_proper |= any(w[0].isupper() for w in group_2.wordform) and words[j-1].tag != 'SENT'
# Endings heuristic
#if group_1_proper and group_2_proper:
# str_match = self.rx_endings.sub(u'', filtered_lemma_1) == self.rx_endings.sub(u'', filtered_lemma_2)
#else:
# str_match = filtered_lemma_1 == filtered_lemma_2
# No endings heuristic:
str_match = filtered_lemma_1 == filtered_lemma_2
vctr = []
self.feat_names = []
if 'soon' in self.feat_zones:
if not 'dist' in self.feat_zones:
vctr.append(n_sentences == 1)
self.feat_names.append('dist==1')
vctr.append(not is_pronoun_1 and not is_pronoun_2 and str_match)
self.feat_names.append('str_match')
is_animate_1 = self.tagset.extract_feature('animate', group_1) in ('y', 'a')
is_animate_2 = self.tagset.extract_feature('animate', group_2) in ('y', 'a')
sem_class_agreement = (is_animate_1 and is_animate_2) or (not is_animate_1 and not is_animate_2)
# Semantic similarity heuristic (head match)
if not is_pronoun_1:
sem_class_agreement &= group_1.lemma[group_1.head] == group_2.lemma[group_2.head]
vctr.append(sem_class_agreement)
self.feat_names.append('sem_class_agreement')
if not 'morpho' in self.feat_zones:
vctr.append(is_pronoun_1)
vctr.append(is_pronoun_2)
self.feat_names.extend(('i_pronoun', 'j_pronoun'))
vctr.append(is_pronoun_1 and is_pronoun_2)
self.feat_names.append('both_pronouns')
vctr.append(number_agrees((group_1, group_2)))
vctr.append(gender_agrees((group_1, group_2)))
self.feat_names.extend(('number-agr', 'gender-agr'))
vctr.append(group_1_proper and group_2_proper)
self.feat_names.append('both-proper')
vctr.append(any(demonstr_2[:group_2.head+1]))
self.feat_names.append('anaphor-is-demonstrative')
vctr.append(is_appo)
self.feat_names.append('appositive')
if 'morpho' in self.feat_zones:
#vctr.append(is_pronoun_1)
#vctr.append(is_pronoun_2)
#self.feat_names.extend(('i_pronoun', 'j_pronoun'))
#vctr.append(is_pronoun_1 and is_pronoun_2)
#self.feat_names.append('both_pronouns')
vctr.append(self.rx_pro_deictic.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_deictic.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.extend(('deictic_pronouns2', 'deictic_pronouns1'))
#if vctr[-1] == True and vctr[-2] == True and filtered_lemma_1 == filtered_lemma_2:
# self.deictic_pronouns.append((filtered_lemma_1, filtered_lemma_2))
#vctr.append(vctr[-1] == True and vctr[-2] == True and filtered_lemma_1 == filtered_lemma_2)
#self.feat_names.append('same_deictic_pronouns')
vctr.append(self.rx_pro_personal.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_personal.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('pers_poss_pronouns2')
self.feat_names.append('pers_poss_pronouns1')
#vctr.append(person_agr)
#vctr.append(((vctr[-1] == True and vctr[-3] == True) or (vctr[-2] == True and vctr[-4] == True)) and person_agr)
#self.feat_names.append('person_agr')
vctr.append((filtered_lemma_2 in (u'который',)
and (words[j-1].tag[0] == ',' or words[j-2].tag[0] == ',')
and dist_words < 4) if is_pronoun_1 else -1)
if is_pronoun_2 and filtered_lemma_2 in (u'который',):
self.relatives.append((filtered_lemma_1, filtered_lemma_2, words[j-1], dist_words))
vctr.append(filtered_lemma_1 in (u'который',) if is_pronoun_2 else -1)
self.feat_names.append('rel_pronouns1')
self.feat_names.append('rel_pronouns2')
vctr.append(self.rx_pro_reflexive.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_reflexive.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('refl_pronouns2')
self.feat_names.append('refl_pronouns1')
vctr.append(self.rx_pro_possessive.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_possessive.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('poss_pronouns2')
self.feat_names.append('poss_pronouns1')
#vctr.append(not is_pronoun_1 and not is_pronoun_2
# and (filtered_lemma_1.startswith(filtered_lemma_2)
# or filtered_lemma_2.startswith(filtered_lemma_1)))
#self.feat_names.append('substring')
if 'dist' in self.feat_zones:
vctr.append(n_sentences == 1)
self.feat_names.append('dist==1')
vctr.append(n_sentences > 2)
self.feat_names.append('dist>2')
vctr.append(n_nouns > 3)
self.feat_names.append('nouns>3')
if 'lexical' in self.feat_zones:
vctr.append(not is_pronoun_1 and not is_pronoun_2
and ((len(modifiers_1) and filtered_lemma_2 in modifiers_1)) or ((len(modifiers_2) and filtered_lemma_1 in modifiers_2)))
#vctr.append(not is_pronoun_1 and not is_pronoun_2 and len(modifiers_1) and filtered_lemma_2 in modifiers_1)
self.feat_names.append('modif-fullNP')
if vctr[-1] == True:
self.modif_pairs.append(('1', ' '.join(modifiers_1), head_1, head_2, '+'.join(group_1.lemma), '+'.join(group_2.lemma)))
self.modif_pairs.append(('2', ' '.join(modifiers_2), head_1, head_2, '+'.join(group_1.lemma), '+'.join(group_2.lemma)))
is_abbr = (len(filtered_lemmas_1) > 1 or len(filtered_lemmas_2) > 1) and \
(''.join([w[0] for w in filtered_lemmas_1]) == filtered_lemma_2 or \
''.join([w[0] for w in filtered_lemmas_2]) == filtered_lemma_1)
vctr.append(not is_pronoun_1 and not is_pronoun_2 and is_abbr)
if is_abbr:
self.abbrs.append((filtered_lemma_1, filtered_lemma_2))
self.feat_names.append('is_abbr')
if 'synt' in self.feat_zones:
if parse:
synt_roles = []
for group, word_ind in ((group_1, i), (group_2, j)):
#word_offset = group.words[group.head].offset if len(group.wordform) > 1 else group.offset
synt_roles.append(parse[word_ind][1])
is_subj_1 = synt_roles[0] == u'предик'
is_subj_2 = synt_roles[1] == u'предик'
is_obj_1 = synt_roles[0] in (u'1-компл', u'2-компл', u'предл')
is_obj_2 = synt_roles[1] in (u'1-компл', u'2-компл', u'предл')
vctr.append(is_subj_1 and is_obj_2 and n_sentences == 0)
self.feat_names.append('subj_and_obj')
#self.synt_roles.append((group_1, synt_roles[0], group_2, synt_roles[1]))
else:
is_subj_1 = self.tagset.extract_feature('case', group_1) == 'n' and words[i-1].tag == 'SENT'
is_subj_2 = self.tagset.extract_feature('case', group_2) == 'n' and words[j-1].tag == 'SENT'
#vctr.append(is_subj_1)
#vctr.append(is_subj_2)
#self.feat_names.append('subj1')
#self.feat_names.append('subj2')
vctr.append(words[i-1].tag == 'SENT')
vctr.append(words[j-1].tag == 'SENT')
self.feat_names.append('sent_start_1')
self.feat_names.append('sent_start_2')
vctr.append(is_subj_1 == is_subj_2 and not is_pronoun_1 and is_pronoun_2
and self.tagset.extract_feature('person', group_2) == '3')
self.feat_names.append('subj_parallel')
#self.feat_names.append('both_subj')
#vctr.append(is_appo)
#self.feat_names.append('appo')
return vctr
Soon features with all the heuristics, gold mentions (fixed morphology):
In [197]:
# both 'Endings' and 'Capital letter' heuristics should be uncommented (see the classifier code)
# Train on gold TRAIN mentions and evaluate on gold TEST mentions:
# first with a decision tree, then with a linear SVM.
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(LinearSVC(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
Soon features without the first-capital-letter-as-proper-noun heuristic. With fixed morphology the results are nearly the same:
In [199]:
# 'Endings' heuristic should be uncommented, 'Capital letter' heuristic should be commented
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
Soon features without the endings heuristic. Again, with fixed morphology the results are nearly the same (and in fact better than with only one heuristic enabled):
In [201]:
# both 'Endings' and 'Capital letter' heuristics should be commented
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
At the same time, turning off head match as a semantic similarity heuristic reduces quality drastically:
In [203]:
# 'Head match' heuristic should be commented
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
Soon features with head match as the semantic similarity heuristic; gold, gold-boundary, and predicted mentions:
In [205]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions, pred_groups, False)
Predicted mentions with heads-only evaluation work nearly as well as gold-boundary mentions. The reason is that NP extraction using syntax finds nearly all noun phrases, including the embedded ones. On the other hand, the 10% difference between the mention-detection F-measures means that the parses are far from perfect.
In [208]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions, pred_groups, True)
Optimizing the decision tree a little:
In [13]:
def create_clf(min_samples_leaf=0.005):
    """Build a decision tree with the shared notebook seed.

    min_samples_leaf -- pruning threshold (fraction of samples per leaf).
    """
    tree = DecisionTreeClassifier(
        random_state=random_state,
        min_samples_leaf=min_samples_leaf,
    )
    return tree
In [207]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [92]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
Analyzing the results:
In [209]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)
Gold mentions:
In [210]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('morpho',))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [211]:
clf.clf.feature_importances_
Out[211]:
In [212]:
clf.feat_names
Out[212]:
In [256]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon',))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [257]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
Morphology is making results better at last!
In [258]:
# Print the learned features ranked by importance, most important first.
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
    print clf.feat_names[i], feat_importance
In [259]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)
In [260]:
coref_utils.print_chains_in_text(rucoref_test, 30, test_chains, gs_mentions)
Morphological features improved the quality and the pronouns aren't clustered together at last. Instead, they are sometimes clustered with the right antecedent. Still, there is a lot of room for improvement.
In [262]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [254]:
# only "n_sentences == 1" feature
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [264]:
# "n_sentences == 1", "n_sentences == 0" features
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [266]:
# "n_sentences == 1", "n_sentences > 2" features
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [267]:
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
print clf.feat_names[i], feat_importance
In [272]:
# "n_sentences == 1", "n_sentences > 2", "n_nouns > 2" features
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [273]:
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
print clf.feat_names[i], feat_importance
In [275]:
# "n_sentences == 1", "n_sentences > 2", "n_nouns > 1" features
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
#export_graphviz(clf.clf, out_file='soon_morph_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [277]:
# "n_sentences == 1", "n_sentences > 2", "n_nouns > 3" features
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
Distance features are working now although the quality increase is not that big. Adding more distance features may increase the quality further. For comparison, without distance features (only "dist==1") the results are:
\textsc{MLMentionPairMoreFeatures} & $100.00$ & $79.32$ & $62.42$ & $69.86$ & $79.94$ & $47.89$ & $59.89$ & $53.30$ \\
In [278]:
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
print clf.feat_names[i], feat_importance
Moreover, we can see that the quality for the pronoun resolution has decreased:
In [279]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)
For the comparison:
\textsc{MLMentionPairMoreFeatures} & $100.00$ & $79.29$ & $63.01$ & $70.22$ & $79.42$ & $48.39$ & $60.14$ & $53.65$ \\
Without DT optimisation:
In [91]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist'))
clf.train(create_clf(min_samples_leaf=0.0001), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [89]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical'))
clf.train(create_clf(min_samples_leaf=0.0001), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [90]:
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
print clf.feat_names[i], feat_importance
In [88]:
for i, feat_importance in sorted(enumerate(clf.clf.feature_importances_), key=lambda f: f[1], reverse=True):
print clf.feat_names[i], feat_importance
In [86]:
for pair in clf.modif_pairs:
if pair[1]:
print pair[0], ': ', pair[1], ' — ', pair[2], pair[3], pair[4]
In [52]:
print '\n'.join(clf.modif)
The results are the same. There are not enough cases for those features to be recognized by a Decision Tree.
For the comparison:
\textsc{MLMentionPairMoreFeatures} & $100.00$ & $79.29$ & $63.01$ & $70.22$ & $79.42$ & $48.39$ & $60.14$ & $53.65$ \\
In [96]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_synt_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [97]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(create_clf(min_samples_leaf=0.0003), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_synt_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [25]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_synt_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [14]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
export_graphviz(clf.clf, out_file='soon_morph_dist_lex_synt_dt_optimized.dot', feature_names=clf.feat_names, class_names=('noncoref', 'coref'))
In [95]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [94]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho'))
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [15]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)
For reasons as yet unknown, lexical and syntactic features are not included in the trained decision trees.
Dumping the results to a file:
In [37]:
import codecs
def dump_chains_in_corpus(corpus, test_chains, gold_mentions, out_file_name='coref_chains.txt'):
    """Dump system (SYS) and gold (GS) coreference chains to a TSV file.

    Each line holds: text id, source tag (SYS/GS), chain id, mention id,
    offset, lemma, wordform; chains are separated by blank lines.

    corpus        -- loaded RuCorefCorpus (provides .gs and .doc_ids)
    test_chains   -- predicted chains, indexed by text, then chain id
    gold_mentions -- per-text lists of gold mention objects
    out_file_name -- destination path (written as UTF-8)
    """
    # BUGFIX: derive the gold groups from `corpus` itself (as the old
    # commented-out line intended) instead of silently reading the notebook
    # globals gs_mentions / gs_group_ids, which are valid only for the
    # test corpus. Computed once, outside the per-text loop.
    gs_mentions_cur, gs_group_ids_cur = coref_utils.get_gs_groups(corpus)
    out_file = codecs.open(out_file_name, 'w', encoding='utf-8')
    try:
        for i_text, text in enumerate(test_chains):
            # Dumping SYS
            for chain_id in test_chains[i_text]:
                for elem_id in test_chains[i_text][chain_id]:
                    elem = gold_mentions[i_text][elem_id]
                    out_file.write(u'{text_id}\tSYS\t{chain_id}\t{elem_id}\t{offset}\t{lemma}\t{wordform}\n'.format(
                        text_id=corpus.doc_ids[i_text],
                        chain_id=chain_id,
                        elem_id=elem_id,
                        offset=elem.offset,
                        lemma=u' '.join(elem.lemma),
                        wordform=u' '.join(elem.wordform)))
                out_file.write('\n')
            # Dumping GS: remap gold mention ids to their positions in the
            # per-text gold group list (hoisted out of the chain loop -- the
            # original rebuilt this dict once per chain).
            cur_gs_chains = {key: [gs_group_ids_cur[i_text].index(item) for item in val]
                             for key, val in corpus.gs[i_text]['chains'].items()}
            for chain_id in corpus.gs[i_text]['chains'].keys():
                for elem_id in cur_gs_chains[chain_id]:
                    elem = gs_mentions_cur[i_text][elem_id]
                    out_file.write(u'{text_id}\tGS \t{chain_id}\t{elem_id}\t{offset}\t{lemma}\t{wordform}\n'.format(
                        text_id=corpus.doc_ids[i_text],
                        chain_id=chain_id,
                        elem_id=elem_id,
                        offset=elem.offset,
                        lemma=u' '.join(elem.lemma),
                        wordform=u' '.join(elem.wordform)))
                out_file.write('\n')
    finally:
        # BUGFIX: the handle was never closed; buffered output could be lost.
        out_file.close()
In [38]:
dump_chains_in_corpus(rucoref_test, test_chains, gs_mentions, out_file_name='coref_chains_default.txt')