In [1]:
%cd '/Users/max/Projects/Coreference/'
In [93]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
#%load_ext autoreload
#%autoreload 2
scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'
ruthes_path = '/Users/max/Datasets/ruthes-lite2/'
random_state = 42
In [26]:
import codecs
In [3]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import export_graphviz
In [12]:
def create_clf(min_samples_leaf=0.005):
return DecisionTreeClassifier(random_state=random_state, min_samples_leaf=min_samples_leaf)
In [123]:
import codecs
def dump_chains_in_corpus(corpus, test_chains, gold_mentions, out_file_name='coref_chains.txt'):
out_file = codecs.open(out_file_name, 'w', encoding='utf-8')
for i_text, text in enumerate(test_chains):
# Dumping SYS
for chain_id in test_chains[i_text]:
for elem_id in test_chains[i_text][chain_id]:
elem = gold_mentions[i_text][elem_id]
out_file.write(u'{text_id}\tSYS\t{chain_id}\t{elem_id}\t{offset}\t{lemma}\t{wordform}\n'.format(text_id=corpus.doc_ids[i_text],
chain_id=chain_id,
elem_id=elem_id,
offset=elem.offset,
lemma=u' '.join(elem.lemma),
wordform=u' '.join(elem.wordform)
))
out_file.write('\n')
# Dumping GS
for chain_id in corpus.gs[i_text]['chains'].keys():
#gs_mentions, gs_group_ids = coref_utils.get_gs_groups(corpus)
cur_gs_chain = {key: [gs_group_ids[i_text].index(item) for item in val]
for key, val in corpus.gs[i_text]['chains'].items()}
for elem_id in cur_gs_chain[chain_id]:
elem = gs_mentions[i_text][elem_id]
out_file.write(u'{text_id}\tGS \t{chain_id}\t{elem_id}\t{offset}\t{lemma}\t{wordform}\n'.format(text_id=corpus.doc_ids[i_text],
chain_id=chain_id,
elem_id=elem_id,
offset=elem.offset,
lemma=u' '.join(elem.lemma),
wordform=u' '.join(elem.wordform)
))
out_file.write('\n')
In [90]:
from xml.etree import cElementTree as ElementTree
import collections
In [91]:
def load_dict_from_xml(filename, single_item_tag, tags):
items = {}
xml = ElementTree.parse(filename)
for item in xml.findall(single_item_tag):
items[int(item.attrib['id'])] = {tag: item.find(tag).text or '' for tag in tags}
return items
In [94]:
import os
concepts = load_dict_from_xml(os.path.join(ruthes_path, 'concepts.xml'), 'concept', ('name', 'gloss', 'domain'))
entries = load_dict_from_xml(os.path.join(ruthes_path, 'text_entry.xml'), 'entry', ('name', 'lemma', 'synt_type'))
In [108]:
xml = ElementTree.parse(os.path.join(ruthes_path, 'synonyms.xml'))
synonyms = collections.defaultdict(set)
concept_index = {}
for elem in xml.findall('entry_rel'):
synonyms[int(elem.attrib['concept_id'])].add(int(elem.attrib['entry_id']))
concept_index[entries[int(elem.attrib['entry_id'])]['lemma'].lower()] = int(elem.attrib['concept_id'])
In [109]:
import collections
ruthes_aliases = collections.defaultdict(set)
for key in synonyms.keys():
val = synonyms[key]
good_entries = [entries[entry]['lemma'].lower() for entry in val if entry in entries and entries[entry]['synt_type'].startswith('N')]
for entry in good_entries:
ruthes_aliases[entry].update(good_entries)
In [110]:
u'городок' in ruthes_aliases[u'город']
Out[110]:
In [113]:
relations = {}
xml = ElementTree.parse(os.path.join(ruthes_path, 'relations.xml'))
for item in xml.findall('rel'):
if item.attrib['name'] == u'ВЫШЕ':
rel_to = int(item.attrib['to'])
rel_from = int(item.attrib['from'])
#relations[concepts[rel_from]['name'].lower()] = concepts[rel_to]['name'].lower()
relations[rel_from] = rel_to
In [118]:
def find_link(word1, word2):
link = False
if word1 in concept_index and word2 in concept_index:
for word_from, word_to in ((word1, word2), (word2, word1)):
concept = concept_index[word_from]
target_concept = concept_index[word_to]
#print concept, concepts[concept]['name'], target_concept, concepts[target_concept]['name']
while concept in relations:
#print concept, target_concept
linked_concept = relations[concept]
if target_concept == concept:
link = True
break
concept = linked_concept
if link:
break
return link
In [119]:
find_link(u'вода', u'напиток')
Out[119]:
In [4]:
rucoref_train = rueval.RuCorefCorpus(multeast, rueval)
rucoref_test = rueval.RuCorefCorpus(multeast, rueval)
In [5]:
exp_utils.load_corpus(rucoref_train, 'Corpus-2015/Tokens.train.fixmorph.txt.parsed', 'Corpus-2015/Groups.train.txt')
In [6]:
exp_utils.load_corpus(rucoref_test, 'Corpus-2015/Tokens.test.fixmorph.txt.parsed', 'Corpus-2015/Groups.test.txt')
In [7]:
group_ok = lambda g: g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in multeast.coref_pronouns)
In [8]:
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
gs_groups = gs_mentions
pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups
pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
pred_groups_gold_bound = rucoref_test.groups
In [9]:
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
gs_groups_train = gs_mentions_train
pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups
pred_mentions_gold_bound_train, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
pred_groups_gold_bound_train = rucoref_train.groups
In [10]:
class MLMentionPairClassifier(mentionpair.MentionPairClassifier):
NEEDS_TRAINING = True
def __init__(self, scorer_path=None):
self.scorer_path = scorer_path
def train(self, clf, corpus, mentions):
self.data_x = []
self.data_y = []
self.appositives = []
self.tagset = corpus.tagset
for i, text in enumerate(corpus.texts):
all_mentions = utils.find_mentions(corpus.groups[i], corpus.tagset)
gs = corpus.gs[i]
words_index = corpus.words_index[i]
for chain_id in gs['chains']:
chain = gs['chains'][chain_id]
for pair in ((chain[i], chain[i+1]) for i in range(len(chain)-1)):
text_groups = []
for pair_elem in pair:
gs_group = gs['groups'][pair_elem]
words = [text[words_index[shift]] for shift in gs_group['tokens_shifts']]
head = text[words_index[gs_group['head_shift'][0]]]
text_groups.append(coref_utils.create_gs_group(gs_group, words, head))
self.data_x.append(self.get_feature_vector(corpus.texts[i], corpus.parses[i] if corpus.parses else None, *text_groups))
self.data_y.append(True)
neg_first = None
neg_last = None
for i_mention, mention in enumerate(all_mentions):
if mention.offset == text_groups[0].offset:
neg_first = i_mention
if mention.offset == text_groups[1].offset:
neg_last = i_mention
if neg_first and neg_last:
break
if not neg_first or not neg_last:
continue
neg_text_groups = all_mentions[neg_first+1:neg_last]
#for neg_pair in ((neg_text_groups[i], neg_text_groups[i+1]) for i in range(len(neg_text_groups)-1)):
# self.data_x.append(self.get_feature_vector(corpus.texts[i], *neg_pair))
# self.data_y.append(False)
for neg_group in neg_text_groups:
self.data_x.append(self.get_feature_vector(corpus.texts[i], corpus.parses[i] if corpus.parses else None, neg_group, text_groups[1]))
self.data_y.append(False)
self.clf = clf
self.clf.fit(self.data_x, self.data_y)
def pair_coreferent(self, pair, groups, words, parse):
vctr = self.get_feature_vector(words, parse, *pair)
return self.clf.predict([vctr])[0]
def get_feature_vector(self, words, parse, group_1, group_2):
# group_1 — possible antecedent
# group_2 — anaphor
vctr = []
feat_names = []
self.feat_names = feat_names
return vctr
In [20]:
import re
class MLMentionPairMoreFeatures(MLMentionPairClassifier):
NEEDS_TRAINING = True
def __init__(self, scorer_path, feat_zones=None):
create_pro_rx = lambda strings: re.compile(ur'(^|\b){}\b'.format(u'|'.join(strings)))
self.scorer_path = scorer_path
self.feat_zones = feat_zones if feat_zones else tuple()
self.modif = set()
self.relatives = []
self.deictic_pronouns = []
self.str_match = []
self.modif_pairs = []
self.abbrs = []
self.synt_roles = []
self.rx_lat = re.compile('[A-Za-z]')
self.rx_endings = re.compile(u'(?<=[А-ЯЁа-яё])(а|ы|ой|е)(?= |$)')
self.rx_pro_deictic = create_pro_rx((u'я', u'ты', u'мы', u'вы'))
self.rx_pro_personal = create_pro_rx((u'мой', u'твой', u'наш', u'ваш'))
self.rx_pro_reflexive = create_pro_rx((u"свое",
u"своё", u"своего", u"своей", u"своем", u"своём", u"своему", u"своею", u"свой", u"свои",
u"своим", u"своими", u"своих", u"свою", u"своя", u"себе", u"себя", u"собой", u"собою"))
self.rx_pro_possessive = create_pro_rx((u"его", u"ее", u"её", u"ей", u"ему", u"ею", u"им", u"ими", u"их"))
def get_feature_vector(self, words, parse, group_1, group_2):
# group_1 — possible antecedent
# group_2 — anaphor
head_1 = group_1.words[group_1.head] if group_1.type != 'word' else group_1
head_2 = group_2.words[group_2.head] if group_2.type != 'word' else group_2
is_appo = False
is_pronoun = lambda w: len(w.lemma) == 1 and w.lemma[0] in self.tagset.coref_pronouns
is_deictic_pronoun = lambda w: is_pronoun and self.tagset.extract_feature('person', w) in ('1', '2')
is_proper = lambda w: self.tagset.extract_feature('proper', w) == 'p'
group_1_proper = is_proper(group_1)
group_2_proper = is_proper(group_2)
is_pronoun_1 = is_pronoun(group_1)
is_pronoun_2 = is_pronoun(group_2)
number_agrees = lambda p: same_grammemmes('number', p, self.tagset)
gender_agrees = lambda p: same_grammemmes('gender', p, self.tagset)
animacity_agrees = lambda p: same_grammemmes('animate', p, self.tagset)
person_1 = self.tagset.extract_feature('person', group_1)
person_2 = self.tagset.extract_feature('person', group_2)
pronoun_persons = {u'мой': '1', u'наш': '1', u'я': '1', u'мы': '1', u'твой': 2, u'ваш': '2', u'ты': '2', u'вы': '2'}
is_demonstrative = lambda w: [tag.startswith('Pd') or w.lemma[i] == u'этот' for i, tag in enumerate(w.tags)]
demonstr_1 = is_demonstrative(group_1) if len(group_1.lemma) > 1 else [0]
demonstr_2 = is_demonstrative(group_2) if len(group_2.lemma) > 1 else [0]
filtered_lemmas_1 = [lemma for (i, lemma) in enumerate(group_1.lemma) if not demonstr_1[i]]
filtered_lemmas_2 = [lemma for (i, lemma) in enumerate(group_2.lemma) if not demonstr_2[i]]
filtered_lemma_1 = ' '.join(filtered_lemmas_1)
filtered_lemma_2 = ' '.join(filtered_lemmas_2)
filtered_wforms_1 = [wf for (i, wf) in enumerate(group_1.wordform) if not demonstr_1[i]]
filtered_wforms_2 = [wf for (i, wf) in enumerate(group_2.wordform) if not demonstr_2[i]]
filtered_wf_1 = ' '.join(filtered_wforms_1)
filtered_wf_2 = ' '.join(filtered_wforms_2)
modifiers_1 = [group_1.lemma[i] for i in range(group_1.head) if group_1.tags[i][0] == 'N']
modifiers_2 = [group_2.lemma[i] for i in range(group_2.head) if group_2.tags[i][0] == 'N']
if filtered_lemma_1 in pronoun_persons:
person_1 = pronoun_persons[filtered_lemma_1]
if filtered_lemma_2 in pronoun_persons:
person_2 = pronoun_persons[filtered_lemma_2]
person_agr = person_1 == person_2 if person_1 in {'1','2'} or person_2 in {'1', '2'} else -1
n_sentences = -1
self.modif.update(modifiers_1)
self.modif.update(modifiers_2)
j = i = 0
if not head_1 in words or not head_2 in words:
n_sentences = -1
n_nouns = -1
dist_words = -1
print 'no alignment found'
else:
i = words.index(head_1)
j = words.index(head_2)
gr1_end = i + len(group_1.lemma) - group_1.head - 1
gr2_start = j - group_2.head
between_groups = words[gr1_end+1:gr2_start]
n_sentences = sum(1 for gr in between_groups if gr.tag == 'SENT')
n_nouns = sum(1 for gr in between_groups if gr.tag.startswith('N'))
dist_words = j - i
if gr2_start - gr1_end == 2 and words[gr1_end+1].tag.startswith(',') \
and same_grammemmes('case', (group_1, group_2), self.tagset) \
and same_grammemmes('number', (group_1, group_2), self.tagset) \
and same_grammemmes('gender', (group_1, group_2), self.tagset) \
and group_1.tag.startswith('N') and group_2.tag.startswith('N'):
is_appo = True
self.appositives.append((group_1, group_2, i, j))
# Capital letter heuristic
#if i > 0:
# group_1_proper |= any(w[0].isupper() for w in group_1.wordform) and words[i-1].tag != 'SENT'
#if j > 0:
# group_2_proper |= any(w[0].isupper() for w in group_2.wordform) and words[j-1].tag != 'SENT'
# Endings heuristic
#if group_1_proper and group_2_proper:
# str_match = self.rx_endings.sub(u'', filtered_lemma_1) == self.rx_endings.sub(u'', filtered_lemma_2)
#else:
# str_match = filtered_lemma_1 == filtered_lemma_2
# No endings heuristic:
str_match = filtered_lemma_1 == filtered_lemma_2
vctr = []
self.feat_names = []
if 'soon' in self.feat_zones:
if not 'dist' in self.feat_zones:
vctr.append(n_sentences == 1)
self.feat_names.append('dist==1')
vctr.append(not is_pronoun_1 and not is_pronoun_2 and str_match)
self.feat_names.append('str_match')
is_animate_1 = self.tagset.extract_feature('animate', group_1) in ('y', 'a')
is_animate_2 = self.tagset.extract_feature('animate', group_2) in ('y', 'a')
sem_class_agreement = (is_animate_1 and is_animate_2) or (not is_animate_1 and not is_animate_2)
# Semantic similarity heuristic (head match)
if not is_pronoun_1:
sem_class_agreement &= group_1.lemma[group_1.head] == group_2.lemma[group_2.head]
vctr.append(sem_class_agreement)
self.feat_names.append('sem_class_agreement')
if not 'morpho' in self.feat_zones:
vctr.append(is_pronoun_1)
vctr.append(is_pronoun_2)
self.feat_names.extend(('i_pronoun', 'j_pronoun'))
vctr.append(is_pronoun_1 and is_pronoun_2)
self.feat_names.append('both_pronouns')
vctr.append(number_agrees((group_1, group_2)))
vctr.append(gender_agrees((group_1, group_2)))
self.feat_names.extend(('number-agr', 'gender-agr'))
vctr.append(group_1_proper and group_2_proper)
self.feat_names.append('both-proper')
vctr.append(any(demonstr_2[:group_2.head+1]))
self.feat_names.append('anaphor-is-demonstrative')
vctr.append(is_appo)
self.feat_names.append('appositive')
if 'morpho' in self.feat_zones:
#vctr.append(is_pronoun_1)
#vctr.append(is_pronoun_2)
#self.feat_names.extend(('i_pronoun', 'j_pronoun'))
#vctr.append(is_pronoun_1 and is_pronoun_2)
#self.feat_names.append('both_pronouns')
vctr.append(self.rx_pro_deictic.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_deictic.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.extend(('deictic_pronouns2', 'deictic_pronouns1'))
#if vctr[-1] == True and vctr[-2] == True and filtered_lemma_1 == filtered_lemma_2:
# self.deictic_pronouns.append((filtered_lemma_1, filtered_lemma_2))
#vctr.append(vctr[-1] == True and vctr[-2] == True and filtered_lemma_1 == filtered_lemma_2)
#self.feat_names.append('same_deictic_pronouns')
vctr.append(self.rx_pro_personal.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_personal.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('pers_poss_pronouns2')
self.feat_names.append('pers_poss_pronouns1')
#vctr.append(person_agr)
#vctr.append(((vctr[-1] == True and vctr[-3] == True) or (vctr[-2] == True and vctr[-4] == True)) and person_agr)
#self.feat_names.append('person_agr')
vctr.append((filtered_lemma_2 in (u'который',)
and (words[j-1].tag[0] == ',' or words[j-2].tag[0] == ',')
and dist_words < 4) if is_pronoun_1 else -1)
if is_pronoun_2 and filtered_lemma_2 in (u'который',):
self.relatives.append((filtered_lemma_1, filtered_lemma_2, words[j-1], dist_words))
vctr.append(filtered_lemma_1 in (u'который',) if is_pronoun_2 else -1)
self.feat_names.append('rel_pronouns1')
self.feat_names.append('rel_pronouns2')
vctr.append(self.rx_pro_reflexive.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_reflexive.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('refl_pronouns2')
self.feat_names.append('refl_pronouns1')
vctr.append(self.rx_pro_possessive.search(filtered_lemma_2) is not None if is_pronoun_2 else -1)
vctr.append(self.rx_pro_possessive.search(filtered_lemma_1) is not None if is_pronoun_1 else -1)
self.feat_names.append('poss_pronouns2')
self.feat_names.append('poss_pronouns1')
#vctr.append(not is_pronoun_1 and not is_pronoun_2
# and (filtered_lemma_1.startswith(filtered_lemma_2)
# or filtered_lemma_2.startswith(filtered_lemma_1)))
#self.feat_names.append('substring')
if 'dist' in self.feat_zones:
vctr.append(n_sentences == 1)
self.feat_names.append('dist==1')
vctr.append(n_sentences > 2)
self.feat_names.append('dist>2')
vctr.append(n_nouns > 3)
self.feat_names.append('nouns>3')
if 'lexical' in self.feat_zones:
vctr.append(not is_pronoun_1 and not is_pronoun_2 and len(modifiers_1) and filtered_lemma_2 == modifiers_1[0])
#vctr.append(not is_pronoun_1 and not is_pronoun_2 and len(modifiers_1) and filtered_lemma_2 in modifiers_1)
self.feat_names.append('modif-fullNP')
if vctr[-1] == True:
self.modif_pairs.append((' '.join(modifiers_1), filtered_lemma_1, filtered_lemma_2))
is_abbr = (len(filtered_lemmas_1) > 1 or len(filtered_lemmas_2) > 1) and \
(''.join([w[0] for w in filtered_lemmas_1]) == filtered_lemma_2 or \
''.join([w[0] for w in filtered_lemmas_2]) == filtered_lemma_1)
vctr.append(not is_pronoun_1 and not is_pronoun_2 and is_abbr)
if is_abbr:
self.abbrs.append((filtered_lemma_1, filtered_lemma_2))
self.feat_names.append('is_abbr')
if 'synt' in self.feat_zones:
if parse:
synt_roles = []
for group, word_ind in ((group_1, i), (group_2, j)):
#word_offset = group.words[group.head].offset if len(group.wordform) > 1 else group.offset
synt_roles.append(parse[word_ind][1])
is_subj_1 = synt_roles[0] == u'предик'
is_subj_2 = synt_roles[1] == u'предик'
is_obj_1 = synt_roles[0] in (u'1-компл', u'2-компл', u'предл')
is_obj_2 = synt_roles[1] in (u'1-компл', u'2-компл', u'предл')
vctr.append(is_subj_1 and is_obj_2 and n_sentences == 0)
self.feat_names.append('subj_and_obj')
#self.synt_roles.append((group_1, synt_roles[0], group_2, synt_roles[1]))
else:
is_subj_1 = self.tagset.extract_feature('case', group_1) == 'n' and words[i-1].tag == 'SENT'
is_subj_2 = self.tagset.extract_feature('case', group_2) == 'n' and words[j-1].tag == 'SENT'
#vctr.append(is_subj_1)
#vctr.append(is_subj_2)
#self.feat_names.append('subj1')
#self.feat_names.append('subj2')
vctr.append(words[i-1].tag == 'SENT')
vctr.append(words[j-1].tag == 'SENT')
self.feat_names.append('sent_start_1')
self.feat_names.append('sent_start_2')
vctr.append(is_subj_1 == is_subj_2 and not is_pronoun_1 and is_pronoun_2
and self.tagset.extract_feature('person', group_2) == '3')
self.feat_names.append('subj_parallel')
#self.feat_names.append('both_subj')
#vctr.append(is_appo)
#self.feat_names.append('appo')
return vctr
In [213]:
class NE_MLMentionPairClassifier(MLMentionPairMoreFeatures):
exceptions = {u'то', u'это', u'ваш', u'сам'}
def __init__(self, scorer_path, feat_zones=None, ne_list='ne.txt'):
self.aliases = []
self.domains = []
self.ne = []
self.w2v = []
super(NE_MLMentionPairClassifier, self).__init__(scorer_path=scorer_path, feat_zones=feat_zones)
self.read_ne(ne_list)
def read_ne(self, filename):
self.entities = {}
self.entity_synonyms = {}
self.entity_types = {}
with codecs.open(filename, encoding='utf-8') as inp_file:
for i_entity, line in enumerate(inp_file):
if not line or line.startswith(';'):
continue
ne_type, synonyms = line.strip('\r\n').split('|', 1)
synonyms = synonyms.split('|')
ne_type = ne_type.split(' ')[0]
self.entities[i_entity] = (synonyms[0], ne_type, synonyms)
for synonym in synonyms:
if synonym in self.exceptions:
continue
if synonym not in self.entity_synonyms:
self.entity_synonyms[synonym] = set()
self.entity_types[synonym] = set()
self.entity_synonyms[synonym].add(i_entity)
self.entity_types[synonym].add(ne_type)
def get_feature_vector(self, words, parse, group_1, group_2):
vctr = super(NE_MLMentionPairClassifier, self).get_feature_vector(words, parse, group_1, group_2)
lemma_1 = ' '.join(group_1.lemma)
lemma_2 = ' '.join(group_2.lemma)
head_1 = group_1.lemma[group_1.head]
head_2 = group_2.lemma[group_2.head]
sem_class_agreement = vctr[self.feat_names.index('sem_class_agreement')]
is_alias = -1
is_pronoun_1 = lemma_1 in self.tagset.coref_pronouns
is_pronoun_2 = lemma_2 in self.tagset.coref_pronouns
if 'sem' in self.feat_zones:
vctr[self.feat_names.index('sem_class_agreement')] = sem_class_agreement
if lemma_1 in self.entity_synonyms and lemma_2 in self.entity_synonyms and not is_pronoun_1 and not is_pronoun_2:
ent_1 = self.entity_synonyms[lemma_1]
ent_2 = self.entity_synonyms[lemma_2]
ent_types_1 = self.entity_types[lemma_1]
ent_types_2 = self.entity_types[lemma_2]
sem_class_agreement &= len(ent_types_1 & ent_types_2) > 0
is_alias = (len(ent_1 & ent_2) > 0, 'NE')
if lemma_1 != lemma_2:
self.ne.append((lemma_1, lemma_2, is_alias[0]))
if 'w2v' in self.feat_zones:
alias_w2v = -1
lemma_1_pos = head_1 + '_NOUN'
lemma_2_pos = head_2 + '_NOUN'
if head_1 != head_2 and not is_pronoun_1 and not is_pronoun_2 and lemma_1_pos in model and lemma_2_pos in model:
sim = model.similarity(lemma_1_pos, lemma_2_pos)
alias_w2v = sim > 0.85
if alias_w2v > 0:
if is_alias == -1:
is_alias = (alias_w2v, 'w2v')
else:
is_alias = (is_alias[0] | alias_w2v, is_alias[1] + '|w2v')
self.w2v.append((lemma_1, lemma_2, sim))
if 'ruthes' in self.feat_zones:
# checking ontological semantic class
if head_1 in concept_index and head_2 in concept_index:
concept_1 = concepts[concept_index[head_1]]
concept_2 = concepts[concept_index[head_2]]
if 'domain' in concept_1 and 'domain' in concept_2:
vctr[self.feat_names.index('sem_class_agreement')] = concept_1['domain'] == concept_2['domain']
self.domains.append((concept_1['domain'], head_1, head_2))
# checking if NPs are aliases
is_ruthes_alias = -1
if head_1 in ruthes_aliases and head_1 != head_2:
is_ruthes_alias = head_2 in ruthes_aliases[head_1]
if head_2 in ruthes_aliases and head_1 != head_2:
is_ruthes_alias = head_1 in ruthes_aliases[head_2]
#else:
# is_alias = find_link(head_1, head_2)
if is_ruthes_alias > 0:
if is_alias == -1:
is_alias = (is_ruthes_alias, 'ruthes')
else:
is_alias = (is_alias[0] | is_ruthes_alias | find_link(head_1, head_2), is_alias[1] + '|ruthes')
if is_alias == -1:
is_alias = (not is_pronoun_1 and not is_pronoun_2 and group_1.lemma[group_1.head] == group_2.lemma[group_2.head], 'default')
vctr.append(is_alias[0])
self.feat_names.append('alias')
if is_alias[0]:
self.aliases.append((is_alias[1], lemma_1, lemma_2))
return vctr
As a baseline we will use a mention-pair classifier with all the features (see the previous NB for details)
In [182]:
clf = MLMentionPairMoreFeatures(scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'))
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
An updated classifier without the new features shows slightly better results because of the is_alias default heuristic:
In [183]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'), ne_list='dictionaries/ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
First, the small list:
In [186]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem'), ne_list='dictionaries/ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [187]:
for ne in clf.ne:
if ne[2]:
print ne[0], ne[1]
In [192]:
for alias in clf.aliases:
if alias[0] != 'default' and alias[1] != alias[2]:
print alias[0], alias[1], alias[2]
And now the full list:
In [193]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
Dumping the groups:
In [194]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
In [195]:
dump_chains_in_corpus(rucoref_test, test_chains, gs_mentions, out_file_name='coref_chains_ne_full.txt')
In [39]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('news.model.bin', binary=True)
In [176]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [196]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'w2v'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [197]:
for ne in clf.w2v:
print ne[0], ne[1], ne[2]
In [198]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem', 'w2v'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
Dumping the groups:
In [200]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
dump_chains_in_corpus(rucoref_test, test_chains, gs_mentions, out_file_name='coref_chains_ne_w2v.txt')
In [201]:
for alias in clf.aliases:
if alias[0] != 'default' and alias[1] != alias[2]:
print alias[0], alias[1], alias[2]
In [214]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [221]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'w2v', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [218]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [223]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem', 'w2v'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [225]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem', 'w2v', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)
In [226]:
for alias in clf.aliases:
if alias[0] != 'default' and alias[1] != alias[2]:
print alias[0], alias[1], alias[2]
Dumping the groups:
In [227]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
In [228]:
dump_chains_in_corpus(rucoref_test, test_chains, gs_mentions, out_file_name='coref_chains_all_sem.txt')
$79.35$ & $63.44$ & $70.51$ & $79.37$ & $48.60$ & $60.29$ & $53.83$: Without semantics
$79.43$ & $63.72$ & $70.71$ & $79.37$ & $48.86$ & $60.48$ & $54.05$: With NE
$79.29$ & $63.49$ & $70.52$ & $79.25$ & $48.64$ & $60.28$ & $53.85$: With word2vec
$79.19$ & $63.79$ & $70.66$ & $78.92$ & $48.78$ & $60.29$ & $53.87$: With RuThes
$79.36$ & $63.77$ & $70.72$ & $79.25$ & $48.89$ & $60.47$ & $54.07$: With NE & word2vec
$79.24$ & $63.97$ & $70.79$ & $78.92$ & $48.94$ & $60.41$ & $54.01$: With NE & RuThes
$79.14$ & $63.79$ & $70.64$ & $78.85$ & $48.78$ & $60.27$ & $53.87$: With word2vec & RuThes
$79.19$ & $63.97$ & $70.77$ & $78.85$ & $48.94$ & $60.39$ & $54.01$: All semantics
Calculating the values on predicted mentions:
In [236]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon',), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [237]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [238]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [239]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'w2v'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [240]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
In [241]:
clf = NE_MLMentionPairClassifier(scorer_path=scorer_path, feat_zones=('soon', 'morpho', 'dist', 'lexical', 'synt', 'sem', 'w2v', 'ruthes'), ne_list='dictionaries/all-ne.txt')
clf.train(create_clf(), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)