In [1]:
# NOTE(review): hard-coded, machine-specific project root. The relative paths used
# later in this notebook ('rucoref', 'Corpus-2015/...') are resolved against it,
# so it has to be adjusted to the local checkout before running.
%cd '/Users/max/Projects/Coreference/'
In [2]:
# Step into the `rucoref` directory for the imports below -- presumably so that
# the `anaphoralib` package is importable from the working directory (TODO confirm).
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
# Step back up: the relative paths used below (scorer_path, 'Corpus-2015/...')
# are written relative to the parent directory.
%cd '..'
#%load_ext autoreload
#%autoreload 2
# Relative path to the scorer.pl script; it is handed to the constructor of every
# classifier instantiated in this notebook.
scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'
In [3]:
# Two separate RuCorefCorpus objects built with identical arguments; the cells
# below fill one from the training files and the other from the test files.
rucoref_train, rucoref_test = (
    rueval.RuCorefCorpus(multeast, rueval),
    rueval.RuCorefCorpus(multeast, rueval),
)
In [4]:
# Fill the training corpus object from the parsed token file and the groups file.
exp_utils.load_corpus(
    rucoref_train,
    'Corpus-2015/Tokens.train.fixmorph.txt.parsed',
    'Corpus-2015/Groups.train.txt'
)
In [5]:
# Fill the test corpus object from the parsed token file and the groups file.
exp_utils.load_corpus(
    rucoref_test,
    'Corpus-2015/Tokens.test.fixmorph.txt.parsed',
    'Corpus-2015/Groups.test.txt'
)
In [6]:
class BaselineAllSingletonsClassifier(mentionpair.MentionPairClassifier):
    """Trivial baseline whose pair decision is always negative.

    Judging by the class name, the intent is that no two mentions are ever
    linked, so every mention stays a singleton.
    """

    def pair_coreferent(self, pair, groups, words):
        """Return ``False`` for every pair; all arguments are ignored."""
        return False
In [7]:
class BaselineAllInOneClassifier(mentionpair.MentionPairClassifier):
    """Trivial baseline whose pair decision is always positive.

    Judging by the class name, the intent is that all mentions end up linked
    into one chain.
    """

    def pair_coreferent(self, pair, groups, words):
        """Return ``True`` for every pair; all arguments are ignored."""
        return True
In [8]:
class BaselineStrMatchClassifier(mentionpair.MentionPairClassifier):
    """Baseline that links two mentions whose complete lemma sequences coincide.

    A pair is rejected outright when its first element is a pronoun, unless the
    'person' feature of that element is '1' or '2'.

    Note: the tagset is read from the notebook-level ``rucoref_test`` object, not
    from the classifier instance.
    """

    def pair_coreferent(self, pair, groups, words):
        """Decide whether ``pair[0]`` and ``pair[1]`` are coreferent.

        ``groups`` and ``words`` are accepted but not used by this baseline.
        """
        tagset = rucoref_test.tagset
        first_is_pronoun = tagset.pos_filters['pronoun'](pair[0])
        first_is_deictic = tagset.extract_feature('person', pair[0]) in ('1', '2')

        # Pronouns other than 1st/2nd person are never matched by string.
        if first_is_pronoun and not first_is_deictic:
            return False

        first_text = ' '.join(pair[0].lemma)
        second_text = ' '.join(pair[1].lemma)
        return first_text == second_text
In [9]:
class BaselineHeadMatchClassifier(mentionpair.MentionPairClassifier):
    """Baseline that links two mentions whose head lemmas are equal.

    A pair is rejected outright when its first element is a pronoun, unless the
    'person' feature of that element is '1' or '2'.

    Note: the tagset is read from the notebook-level ``rucoref_test`` object, not
    from the classifier instance.
    """

    def pair_coreferent(self, pair, groups, words):
        """Decide whether ``pair[0]`` and ``pair[1]`` are coreferent.

        ``groups`` and ``words`` are accepted but not used by this baseline.
        """
        tagset = rucoref_test.tagset
        first_is_pronoun = tagset.pos_filters['pronoun'](pair[0])
        first_is_deictic = tagset.extract_feature('person', pair[0]) in ('1', '2')

        # Pronouns other than 1st/2nd person are never matched by head.
        if first_is_pronoun and not first_is_deictic:
            return False

        first_head = pair[0].lemma[pair[0].head]
        second_head = pair[1].lemma[pair[1].head]
        return first_head == second_head
In [10]:
class BaselineHeadMatchProClassifier(mentionpair.MentionPairClassifier):
    """Head-match baseline extended with a simple pronoun-resolution rule.

    A pair is accepted when any of the following holds:

    1. the 'person' feature of the first mention is '1' or '2' and
       ``self.groups_match`` accepts the pair;
    2. the first mention is not a pronoun and ``self.groups_match`` accepts
       the pair;
    3. the first mention is not a pronoun, the second one is, the two agree in
       number and gender, and no noun agreeing with the second mention in
       number and gender lies between the heads of the two mentions in
       ``words``.

    ``groups_match`` is an instance attribute so that a subclass can swap the
    comparison by reassigning it after calling this constructor.

    Note: the tagset is read from the notebook-level ``rucoref_test`` object, not
    from the classifier instance.
    """

    def __init__(self, scorer_path):
        """Forward ``scorer_path`` to the base class and install the default matcher."""
        super(BaselineHeadMatchProClassifier, self).__init__(scorer_path)

        def heads_match(pair):
            # Default comparison: lemma of each mention's head.
            return pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head]

        self.groups_match = heads_match

    @staticmethod
    def _agreeing_noun_between(pair, words, tagset):
        """Tell whether a noun agreeing with ``pair[1]`` lies between the two heads.

        The head of a mention is the mention itself when its ``type`` is 'word',
        otherwise ``mention.words[mention.head]``. Positions are looked up with
        ``words.index``, i.e. the first equal element of ``words`` is used, and
        ``ValueError`` is raised if a head is not found in ``words``.
        """
        heads = [mention if mention.type == 'word' else mention.words[mention.head]
                 for mention in pair]
        head_positions = [words.index(head) for head in heads]
        is_noun = tagset.pos_filters['noun']
        return any(
            is_noun(word)
            and same_grammemmes('number', (word, pair[1]), tagset)
            and same_grammemmes('gender', (word, pair[1]), tagset)
            for word in words[head_positions[0] + 1:head_positions[1]]
        )

    def pair_coreferent(self, pair, groups, words):
        """Decide whether ``pair[0]`` and ``pair[1]`` are coreferent.

        ``groups`` is accepted but not used. ``words`` is only consulted for the
        pronoun rule (rule 3 of the class docstring).
        """
        tagset = rucoref_test.tagset
        is_pronoun = tagset.pos_filters['pronoun']

        first_is_pronoun = is_pronoun(pair[0])
        first_is_deictic = tagset.extract_feature('person', pair[0]) in ('1', '2')

        # Rules 1 and 2. Both go through self.groups_match so that a subclass
        # replacing the matcher also changes the comparison for non-pronouns.
        if (first_is_deictic or not first_is_pronoun) and self.groups_match(pair):
            return True

        # Rule 3 applies only to a non-pronoun followed by a pronoun.
        if first_is_pronoun or not is_pronoun(pair[1]):
            return False
        if not (same_grammemmes('number', pair, tagset)
                and same_grammemmes('gender', pair, tagset)):
            return False

        # The scan over `words` is the costly part, so it runs only when every
        # cheaper condition has already passed.
        return not self._agreeing_noun_between(pair, words, tagset)
In [11]:
class BaselineStrMatchProClassifier(BaselineHeadMatchProClassifier):
    """Variant of ``BaselineHeadMatchProClassifier`` with a different matcher.

    After the parent constructor has run, ``self.groups_match`` is replaced by
    a comparison of the space-joined lemma sequences of the two mentions.
    """

    def __init__(self, scorer_path):
        """Run the parent constructor, then install the full-lemma matcher."""
        super(BaselineStrMatchProClassifier, self).__init__(scorer_path)

        def full_lemmas_match(pair):
            return ' '.join(pair[0].lemma) == ' '.join(pair[1].lemma)

        self.groups_match = full_lemmas_match
In [12]:
# Lemmas of the pronouns that are allowed to form a mention.
# Every entry must be a unicode literal: under Python 2 a plain byte-string
# literal does not compare equal to the corresponding unicode string, so such an
# entry could never be matched by a unicode lemma.
good_pronouns = {
    u'я', u'мы',
    u'ты', u'вы',
    u'он', u'она', u'оно', u'они',
    u'мой', u'наш',
    u'твой', u'ваш',
    u'его', u'ее', u'их',
    u'себя', u'свой',
    u'который',
}


def group_ok(g):
    """Tell whether group ``g`` should be kept as a mention candidate.

    A group is kept when its tag starts with 'N', or when its tag starts with
    'P' and its first lemma is listed in ``good_pronouns``. (In the MULTEXT-East
    tagset used here these prefixes presumably denote nouns and pronouns.)

    :param g: object exposing ``tag`` (string) and ``lemma`` (indexable sequence).
    :return: ``True`` if the group passes the filter, ``False`` otherwise.
    """
    return g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in good_pronouns)
In [13]:
# Mentions of the test corpus as returned by get_gs_groups, plus a second value
# kept as gs_group_ids ("gs" presumably stands for gold standard -- TODO confirm).
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
# Alias only: the very same object is later passed both as mentions and as groups.
gs_groups = gs_mentions
# Mentions obtained with the group_ok filter.
pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups
# Same filter, via get_pred_groups_gold_boundaries.
pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
# NOTE(review): pred_groups and pred_groups_gold_bound are both read from
# rucoref_test.groups. Whether they end up as the same object depends on whether
# the helper calls above rebind or mutate that attribute, which is not visible
# here -- keep the statement order as is and confirm against coref_utils.
pred_groups_gold_bound = rucoref_test.groups
In [14]:
# Compare the sizes of element 1 of each mention collection
# (presumably the mentions of one document -- TODO confirm).
print(len(gs_mentions[1]))
print(len(pred_mentions[1]))
print(len(pred_mentions_gold_bound[1]))
In [15]:
# Eyeball the first 150 items of element 0 of pred_mentions_gold_bound.
pred_mentions_gold_bound[0][:150]
Out[15]:
In [16]:
# Same extraction steps as for the test corpus, applied to the training corpus;
# results are stored under *_train names.
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
# Alias only, mirroring gs_groups for the test corpus.
gs_groups_train = gs_mentions_train
pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups
# NOTE(review): the second target below is `pred_gold_bounds_ids`, without the
# `_train` suffix, so this line overwrites the variable of the same name that was
# assigned from the test corpus earlier. Probably meant to be
# `pred_gold_bounds_ids_train` -- confirm before relying on either value.
pred_mentions_gold_bound_train, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
# NOTE(review): read from rucoref_train.groups a second time; whether this is the
# same object as pred_groups_train depends on the helper above (not visible here).
pred_groups_gold_bound_train = rucoref_train.groups
Testing the baseline classifiers:
In [17]:
# Run every baseline on the test corpus with gs_mentions / gs_groups as input.
# The meaning of the trailing positional False is not visible in this notebook
# (possibly heads_only, cf. the score(..., heads_only=False) call further down --
# TODO confirm against coref_utils.get_score_table).
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
In [18]:
# Same six baselines, this time fed with pred_mentions_gold_bound /
# pred_groups_gold_bound (the output of get_pred_groups_gold_boundaries).
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [30]:
# Same six baselines, this time fed with pred_mentions / pred_groups
# (the output of get_pred_groups).
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
In [19]:
# Score the head-match + pronoun baseline with the MUC metric only and keep the
# three returned values; chains_base is printed in the next cell.
scores, groups, chains_base = BaselineHeadMatchProClassifier(scorer_path).score(
    rucoref_test,
    pred_mentions_gold_bound,
    pred_groups_gold_bound,
    metrics=('muc',),
    heads_only=False
)
In [29]:
# Show the chains produced by the baseline above; the second argument (1)
# presumably selects a text of the test corpus -- TODO confirm.
coref_utils.print_chains_in_text(rucoref_test, 1, chains_base, pred_mentions_gold_bound)