To reproduce this experiment you will need:
- scikit-learn (v. 0.22.1)
- imbalanced-learn (v. 0.6.2)
- matplotlib (v. 3.1.3)
- the anaphoralib Python module

Since anaphoralib is at an early stage of development, it cannot be installed as a package yet; to import it, cd to the folder that contains the module. Paths to the corpus should be updated accordingly.
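For convenience, the pinned versions of the installable dependencies can be set up straight from a notebook cell. This is a suggested command rather than part of the original notebook, and it assumes a standard pip environment (anaphoralib itself is not on PyPI):

%pip install scikit-learn==0.22.1 imbalanced-learn==0.6.2 matplotlib==3.1.3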
In [1]:
%cd '/Users/max/Projects/Coreference/'
In [2]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.experiments.base import BaseClassifier
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import BorderlineSMOTE
import numpy as np
%matplotlib inline
In [3]:
lists_dir = 'CICLing-2016/wordlists'
texts_dir = 'Corpus-2015/Tokens.txt'
gs_dir = 'Corpus-2015/Groups.txt'
tagset = multeast
random_state = 42
In [4]:
rucoref = rueval.RuCorefCorpus(multeast, rueval)
In [5]:
exp_utils.load_corpus(rucoref, texts_dir, gs_dir)
In [6]:
rucoref.groups[0][:30]
In [7]:
rucoref.print_stats()
In [8]:
rucoref.create_indices()
In [9]:
def load_list(filename):
    data = set()
    with open(filename, encoding='utf-8') as inp_file:
        for line in inp_file:
            data.add(line.strip('\r\n'))
    return data
In [10]:
import os

wordlists = {}
for filename in os.listdir(lists_dir):
    wordlists[filename.replace('.txt', '')] = load_list(os.path.join(lists_dir, filename))
In [11]:
print(wordlists.keys())
Building additional indices (of all words and all groups):
In [12]:
import collections

word_index = []
group_index = []

for i, text in enumerate(rucoref.texts):
    word_index.append(collections.defaultdict(set))
    group_index.append(collections.defaultdict(set))
    for word in text:
        word_index[-1]['_'.join(word.lemma)].add(word.offset)
    for group in rucoref.groups[i]:
        for g in group.iter_groups():
            group_index[-1]['_'.join(g.lemma)].add(g.offset)
In [13]:
print('\n'.join(list(group_index[0].keys())[:30]))
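Each index maps a lemma string to the set of offsets at which that word or group occurs, so the earlier-mention counts computed below reduce to cheap set lookups. As a quick sanity check (my addition, not part of the original experiment), any key of the first text's index can be inspected like this:

some_lemma = next(iter(group_index[0]))  # an arbitrary lemma from the index
print(some_lemma, sorted(group_index[0][some_lemma]))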
Building sets of adjectives and pronouns for feature selection:
In [14]:
adjectives = set()
for text in rucoref.texts:
    for word in text:
        if tagset.pos_filters['adj'](word) and (len(word.tag) < 7 or word.tag[6] == 'f'):
            adjectives.add('_'.join(word.lemma))
adjectives = list(adjectives)
In [15]:
adjectives
In [16]:
pronouns = set()
for text in rucoref.texts:
    for word in text:
        if tagset.pos_filters['pronoun'](word):
            pronouns.add('_'.join(word.lemma))
pronouns = list(pronouns)
In [17]:
pronouns
In [18]:
import re
class FirstMentionClassifier(BaseClassifier):
    def __init__(self):
        super(FirstMentionClassifier, self).__init__()
        self.feat_zones_ = ('struct', 'string', 'lists')
        # Names of the statistics collected during feature extraction
        self.stats = {'str_matches_before', 'head_matches_before', 'n_adj', 'len_np'}
        self.rx_lat = re.compile('[A-Za-z]')
        # Surface forms of Russian third-person, relative, and reflexive pronouns
        self.pronouns = {u"его", u"ее", u"её", u"ей", u"ему", u"ею", u"им", u"ими", u"их", u"которая",
                         u"которого", u"которое", u"которой", u"котором", u"которому", u"которую", u"которые",
                         u"который", u"которым", u"которыми", u"которых", u"него", u"нее", u"неё", u"ней", u"нем",
                         u"нём", u"нему", u"нею", u"ним", u"ними", u"них", u"он", u"она", u"они", u"оно", u"свое",
                         u"своё", u"своего", u"своей", u"своем", u"своём", u"своему", u"своею", u"свой", u"свои",
                         u"своим", u"своими", u"своих", u"свою", u"своя", u"себе", u"себя", u"собой", u"собою"}
        self.clear_stats()
    def get_feature_vector(self, corpus, group, i_text, save_feature_names=False):
        if save_feature_names:
            self.feature_names_ = []
        vctr = []

        group_lemma = '_'.join(group.lemma)
        group_occurrences = group_index[i_text][group_lemma] if group_lemma in group_index[i_text] else []

        head_index = group.head
        head_lemma = group.lemma[group.head]
        head_occurrences = word_index[i_text][head_lemma] if head_lemma in word_index[i_text] else []
        head_offset = group.head_offset

        group_words = group.words if group.type != 'word' else [group]

        # How many times the full group string / its head word occurred earlier in the text
        str_matches_before = sum(1 for occ in group_occurrences if occ < group.offset)
        head_matches_before = sum(1 for occ in head_occurrences if occ < group.offset)
        adj_in_group = [word for word in group_words[:head_index + 1] if tagset.pos_filters['adj'](word)]

        self.stats['str_matches_before'].append(str_matches_before)
        self.stats['head_matches_before'].append(head_matches_before)
        self.stats['n_adj'].append("{}: {}".format(len(adj_in_group), group_lemma))
        self.stats['len_np'].append("{}: {}".format(len(group_words), group_lemma))

        if 'string' in self.feat_zones_:
            vctr.append(('str_match_before=0', str_matches_before == 0))
            vctr.append(('str_match_before<2', str_matches_before < 2))
            vctr.append(('str_match_before<3', str_matches_before < 3))
            vctr.append(('str_match_before>2', str_matches_before > 2))
            vctr.append(('head_match_before=0', head_matches_before == 0))
            vctr.append(('head_match_before<2', head_matches_before < 2))
            vctr.append(('head_match_before<3', head_matches_before < 3))
            vctr.append(('head_match_before>2', head_matches_before > 2))
            vctr.append(('uppercase', all(word.isupper() and len(word) > 1 for word in group.wordform)))
            #vctr.append(('capitalized', any(word[0].isupper() and len(group.wordform) > 1 for word in group.wordform[1:])))
            vctr.append(('latin', any(self.rx_lat.search(word) for word in group.wordform)))
            vctr.append(('is_proper_noun', corpus.tagset.pos_filters['properNoun'](group)))
            #vctr.append(('is_pronoun', group.lemma[0] in pronouns))
            vctr.append(('is_pronoun', group.wordform[0] in self.pronouns))
            #vctr.append(('is_pronoun', multeast.pos_filters['pronoun'](group) or group.wordform[0] in pronouns))
            self.n_pronouns += 1

        if 'struct' in self.feat_zones_:
            # Words immediately adjacent to the group (None at text boundaries)
            i_word = corpus.words_index[i_text][group.offset]
            left_word = corpus.texts[i_text][i_word - 1] if i_word > 0 else None
            right_word = corpus.texts[i_text][i_word + len(group.wordform) + 1] \
                if i_word + len(group.wordform) + 1 < len(corpus.texts[i_text]) else None
            vctr.append(('conj', bool((left_word and corpus.tagset.pos_filters['conj'](left_word))
                                      or (right_word and corpus.tagset.pos_filters['conj'](right_word)))))
            vctr.append(('len_np<2', len(group.tags) < 2))
            vctr.append(('len_np>2', len(group.tags) > 2))
            vctr.append(('n_adj=0', len(adj_in_group) == 0))
            vctr.append(('n_adj>1', len(adj_in_group) > 1))
            vctr.append(('n_adj>2', len(adj_in_group) > 2))

        if 'lists' in self.feat_zones_:
            for l in wordlists:
                vctr.append(('in_list_{}'.format(l), any(lemma in wordlists[l] for lemma in group.lemma[:head_index + 1])))

        if save_feature_names:
            self.feature_names_ = [feat[0] for feat in vctr]

        return [int(feat[1]) for feat in vctr]
    def prepare_data(self, corpus, random_state=42, test_size=0.3, feature_zones=None):
        if feature_zones:
            self.feat_zones_ = feature_zones

        self.n_pronouns = 0
        self.stats['class'] = []
        self.groups = []
        self.x_data = []
        self.y_data = []

        self.cur_data_ = 'Binary, filtered singletons'
        self.class_names_ = ('non-first', 'first')

        save_features = True
        for i_text, text in enumerate(corpus.texts):
            for i, mention in enumerate(corpus.mentions[i_text]):
                if i not in rucoref.gs_index[i_text]:
                    continue
                cur_gs_group_id = corpus.gs_index[i_text][i]
                cur_chain = corpus.gs[i_text]['chains'][corpus.chains_index[i_text][cur_gs_group_id]]
                # A mention is 'first' iff it opens its gold-standard coreference chain
                self.y_data.append(self.class_names_.index('first') if cur_gs_group_id == cur_chain[0]
                                   else self.class_names_.index('non-first'))
                group = corpus.heads_index[i_text][mention.offset]
                self.x_data.append(self.get_feature_vector(corpus, group, i_text, save_features))
                self.groups.append(group)
                self.stats['class'].append(self.class_names_[self.y_data[-1]])
                save_features = False

                # Pronominal mentions are filtered out entirely; for the remaining
                # mentions the is_pronoun feature itself is dropped from the vector
                pronoun_index = self.feature_names_.index('is_pronoun')
                if self.x_data[-1][pronoun_index]:
                    self.x_data.pop()
                    self.y_data.pop()
                    self.groups.pop()
                    for key in self.stats:
                        self.stats[key].pop()
                    continue
                del self.x_data[-1][pronoun_index]

        super(FirstMentionClassifier, self).prepare_data(corpus, random_state, test_size)
        del self.feature_names_[pronoun_index]

        class_numbers = [sum(1 for item in self.y_data if item == cur_class)
                         for cur_class in range(len(self.class_names_))]
        self.ratio = float(min(class_numbers)) / float(max(class_numbers))
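Note that prepare_data also stores self.ratio, the minority-to-majority class size ratio; the cells below pass it to BorderlineSMOTE as its sampling_strategy.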
In [19]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, test_size=0.3)
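To see exactly which binary features survive data preparation (is_pronoun is deleted), the saved feature names can be zipped with any feature vector. This check is my addition, not part of the original notebook:

for name, value in zip(first_mention_clf.feature_names_, first_mention_clf.x_data[0]):
    print('{}: {}'.format(name, value))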
In [20]:
first_mention_clf.stats.keys()
In [22]:
def baseline_predict(data):
    y_pred = np.zeros(len(data))
    for i, row in enumerate(data):
        y_pred[i] = row[0] == 1
    return y_pred
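This baseline predicts 'first' exactly when the first feature, str_match_before=0, fires, i.e. when the full group string has not occurred earlier in the text.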
In [23]:
first_mention_clf.test(y_pred=baseline_predict(first_mention_clf.x_data_test), test_name='baseline')
In [24]:
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string',))
In [25]:
clf = RandomForestClassifier(n_estimators=500, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy=first_mention_clf.ratio, kind='borderline-1', random_state=random_state)
first_mention_clf.fit(clf, sampler)
first_mention_clf.test(test_name='string features')
In [26]:
first_mention_clf.print_stats()
In [27]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct'))
clf = RandomForestClassifier(n_estimators=500, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy=first_mention_clf.ratio, kind='borderline-1', random_state=random_state)
first_mention_clf.fit(clf, sampler)
first_mention_clf.test(test_name='string+struct features')
In [28]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))
clf = RandomForestClassifier(n_estimators=500, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy=first_mention_clf.ratio, kind='borderline-1', random_state=random_state)
first_mention_clf.fit(clf, sampler)
first_mention_clf.test(test_name='all features')
In [29]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))
regr = LogisticRegression(random_state=random_state, max_iter=250)
sampler = BorderlineSMOTE(sampling_strategy=first_mention_clf.ratio, kind='borderline-1', random_state=random_state)
first_mention_clf.fit(regr, sampler)
In [30]:
for i, feat_name in enumerate(first_mention_clf.feature_names_):
    print('{}: {:.4f}'.format(feat_name, regr.coef_[0, i]))
In [31]:
import sklearn.feature_extraction.text
adj_vectorizer = sklearn.feature_extraction.text.CountVectorizer(vocabulary=adjectives)
pron_vectorizer = sklearn.feature_extraction.text.CountVectorizer(vocabulary=pronouns)
In [32]:
def additional_features(data, vectorizer):
    # One bag-of-words row per group, over the vectorizer's fixed vocabulary
    features = np.zeros(shape=(len(data), len(vectorizer.vocabulary)))
    for i, row in enumerate(data):
        features[i, :] = vectorizer.transform([u' '.join(row.lemma)]).toarray()
    return features
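additional_features turns each group's lemma sequence into a dense bag-of-words row over a fixed vocabulary, which lets the random forest below rank individual adjectives and pronouns by feature importance.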
In [33]:
from sklearn.preprocessing import MinMaxScaler
def rank_to_dict(ranks, names, order=1):
    # Min-max-normalize the scores to [0, 1] and round for readability
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 4), ranks)
    return dict(zip(names, ranks))
In [34]:
add_data_x = additional_features(first_mention_clf.groups_train, adj_vectorizer)
adj_clf = RandomForestClassifier(random_state=random_state)
adj_clf.fit(add_data_x, first_mention_clf.y_data_train)
In [35]:
ranks = rank_to_dict(adj_clf.feature_importances_, adj_vectorizer.vocabulary)
for feat_name in sorted(ranks, key=lambda f: ranks[f], reverse=True):
    print(feat_name, ranks[feat_name])
In [36]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import anaphoralib.experiments.utils
In [37]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))

feature_distributions = {}
for feat_name in first_mention_clf.stats:
    feature_distributions[feat_name] = {cls: [] for cls in first_mention_clf.class_names_ + ('total',)}
    for i, elem in enumerate(first_mention_clf.stats['class']):
        feature_distributions[feat_name][elem].append(first_mention_clf.stats[feat_name][i])
        feature_distributions[feat_name]['total'].append(first_mention_clf.stats[feat_name][i])
In [38]:
import os

anaphoralib.experiments.utils.latexify(columns=2)

for feat_name in feature_distributions:
    if feat_name == 'class':
        continue
    anaphoralib.experiments.utils.plot_feature_distribution(feature_distributions[feat_name],
                                                            range(7),
                                                            first_mention_clf.class_names_,
                                                            x_label=feat_name.replace('_', '\\_'),
                                                            filename=os.path.join('CICLing-2016', feat_name))
In [39]:
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils import shuffle
In [40]:
first_mention_clf = FirstMentionClassifier()
first_mention_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))
clf = RandomForestClassifier(n_estimators=500, random_state=random_state)
In [41]:
shuffled_x_data, shuffled_y_data = shuffle(first_mention_clf.x_data, first_mention_clf.y_data,
                                           random_state=random_state)
train_sizes_abs, train_scores, test_scores = learning_curve(clf,
                                                            shuffled_x_data,
                                                            shuffled_y_data,
                                                            cv=3,
                                                            scoring=make_scorer(f1_score, pos_label=1))
In [42]:
anaphoralib.experiments.utils.latexify(columns=2)
anaphoralib.experiments.utils.plot_learning_curve(train_sizes_abs,
                                                  train_scores, test_scores,
                                                  score_name='f1',
                                                  filename=os.path.join('CICLing-2016', 'learning_curve_plot'))