To reproduce this experiment you will need:
- scikit-learn (v. 0.22.1)
- imbalanced-learn (v. 0.6.2)
- matplotlib (v. 3.1.3)
- the anaphoralib Python module

Since anaphoralib is at an early stage of development, it cannot be installed as a package yet; to import it, cd to the folder containing the module (or see the sketch below). Paths to the corpus should be updated accordingly.
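The pinned versions can be installed with pip. For anaphoralib, appending its parent folder to sys.path is a minimal alternative to the %cd approach used below; the path here just mirrors the %cd commands in this notebook and should be adjusted to your own checkout:

# Install the pinned dependencies (shell command; prefix with "!" inside Jupyter):
#   pip install scikit-learn==0.22.1 imbalanced-learn==0.6.2 matplotlib==3.1.3

# Make anaphoralib importable without installing it. The path below is a
# placeholder matching the %cd commands in this notebook.
import sys
sys.path.append('/Users/max/Projects/Coreference/rucoref')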
In [1]:
%cd '/Users/max/Projects/Coreference/'
In [2]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.experiments.base import BaseClassifier
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import BorderlineSMOTE
import numpy as np
%matplotlib inline
In [3]:
lists_dir = 'CLLS-2016/wordlists'
texts_dir = 'Corpus-2015/Tokens.txt'
gs_dir = 'Corpus-2015/Groups.txt'
tagset = multeast
random_state = 42
In [4]:
rucoref = rueval.RuCorefCorpus(multeast, rueval)
In [5]:
exp_utils.load_corpus(rucoref, texts_dir, gs_dir)
In [6]:
rucoref.groups[0][:10]
Out[6]:
In [7]:
rucoref.print_stats()
In [8]:
rucoref.create_indices()
In [9]:
import codecs

def load_list(filename):
    """Read a UTF-8 word list, one entry per line."""
    data = set()
    with codecs.open(filename, encoding='utf-8') as inp_file:
        for line in inp_file:
            data.add(line.strip('\r\n'))
    return data
In [10]:
import os

wordlists = {}
for filename in os.listdir(lists_dir):
    wordlists[filename.replace('.txt', '')] = load_list(os.path.join(lists_dir, filename))
In [11]:
print(wordlists.keys())
Next we build two additional indices, mapping each lemma to the offsets of all words and of all groups in each text:
In [12]:
import collections

word_index = []
group_index = []

for i, text in enumerate(rucoref.texts):
    word_index.append(collections.defaultdict(set))
    group_index.append(collections.defaultdict(set))
    for word in text:
        word_index[-1]['_'.join(word.lemma)].add(word.offset)
    for group in rucoref.groups[i]:
        for g in group.iter_groups():
            group_index[-1]['_'.join(g.lemma)].add(g.offset)
In [13]:
print('\n'.join(list(group_index[0].keys())[:15]))
In [14]:
import re

class SingletonClassifier(BaseClassifier):
    def __init__(self):
        super(SingletonClassifier, self).__init__()
        self.feat_zones_ = ('struct', 'string', 'lists', 'synt')
        # Names of the statistics to collect; clear_stats() re-initializes
        # self.stats as a mapping from each name to a list of values.
        self.stats = {'str_matches_before', 'head_matches_before', 'n_adj', 'len_np', 'is_genitive'}
        self.stats.update('in_list_{}'.format(l) for l in wordlists)
        self.rx_lat = re.compile('[A-Za-z]')
        self.pronouns = {u"его", u"ее", u"её", u"ей", u"ему", u"ею", u"им", u"ими", u"их", u"которая",
                         u"которого", u"которое", u"которой", u"котором", u"которому", u"которую", u"которые",
                         u"который", u"которым", u"которыми", u"которых", u"него", u"нее", u"неё", u"ней", u"нем",
                         u"нём", u"нему", u"нею", u"ним", u"ними", u"них", u"он", u"она", u"они", u"оно", u"свое",
                         u"своё", u"своего", u"своей", u"своем", u"своём", u"своему", u"своею", u"свой", u"свои",
                         u"своим", u"своими", u"своих", u"свою", u"своя", u"себе", u"себя", u"собой", u"собою"}
        self.clear_stats()

    def get_feature_vector(self, corpus, group, i_text, save_feature_names=False):
        """Build the binary feature vector for one noun group."""
        if save_feature_names:
            self.feature_names_ = []
        vctr = []

        group_lemma = '_'.join(group.lemma)
        group_occurrences = group_index[i_text][group_lemma] if group_lemma in group_index[i_text] else []
        head_index = group.head
        head_lemma = group.lemma[group.head]
        head_occurrences = word_index[i_text][head_lemma] if head_lemma in word_index[i_text] else []
        head_offset = group.head_offset
        group_words = group.words if group.type != 'word' else [group]

        str_matches_before = sum(1 for occ in group_occurrences if occ < group.offset)
        head_matches_before = sum(1 for occ in head_occurrences if occ < group.offset)
        adj_in_group = [word for word in group_words[:head_index + 1] if tagset.pos_filters['adj'](word)]

        self.stats['str_matches_before'].append(str_matches_before)
        self.stats['head_matches_before'].append(head_matches_before)
        self.stats['n_adj'].append(len(adj_in_group))
        self.stats['len_np'].append(len(group_words))

        if 'string' in self.feat_zones_:
            vctr.append(('str_match_before=0', str_matches_before == 0))
            vctr.append(('head_match_before=0', head_matches_before == 0))
            #vctr.append(('uppercase', all(word.isupper() and len(word) > 1 for word in group.wordform)))
            #vctr.append(('capitalized', any(word[0].isupper() and len(group.wordform) > 1 for word in group.wordform[1:])))
            vctr.append(('latin', any(self.rx_lat.search(word) for word in group.wordform)))
            vctr.append(('is_proper_noun', corpus.tagset.pos_filters['properNoun'](group)))
            vctr.append(('is_animate', corpus.tagset.extract_feature('animate', group) == u'y'))
            vctr.append(('is_pronoun', group.wordform[0] in self.pronouns and len(group_words) == 1))

        i_word = corpus.words_index[i_text][group.offset]
        left_word = corpus.texts[i_text][i_word - 1] if i_word > 0 else None
        right_word = corpus.texts[i_text][i_word + len(group.wordform) + 1] \
            if i_word + len(group.wordform) + 1 < len(corpus.texts[i_text]) else None

        if 'struct' in self.feat_zones_:
            #vctr.append(('conj', bool((left_word and corpus.tagset.pos_filters['conj'](left_word))
            #                          or (right_word and corpus.tagset.pos_filters['conj'](right_word)))))
            vctr.append(('len_np==1', len(group.tags) == 1))
            vctr.append(('1<len_np<4', 1 < len(group.tags) < 4))
            vctr.append(('len_np>=4', len(group.tags) >= 4))
            vctr.append(('n_adj=0', len(adj_in_group) == 0))
            #vctr.append(('n_adj>1', len(adj_in_group) > 1))
            vctr.append(('n_adj>2', len(adj_in_group) > 2))
            vctr.append(('is_genitive', corpus.tagset.extract_feature('case', group) == u'g'))
            self.stats['is_genitive'].append(vctr[-1][1])

        sent_begin = left_word is None or left_word.tag == 'SENT'
        sent_end = right_word is None or right_word.tag == 'SENT'
        nomin = corpus.tagset.extract_feature('case', group) == u'n'
        accus = corpus.tagset.extract_feature('case', group) == u'a'

        if 'synt' in self.feat_zones_:
            vctr.append(('is_subject', nomin or sent_begin))
            #vctr.append(('is_object', accus or sent_end))

        if 'lists' in self.feat_zones_:
            for l in wordlists:
                feat_name = 'in_list_{}'.format(l)
                vctr.append((feat_name, any(lemma in wordlists[l] for lemma in group.lemma[:head_index + 1])))
                self.stats[feat_name].append(vctr[-1][1])

        if save_feature_names:
            self.feature_names_ = [feat[0] for feat in vctr]
        return [int(feat[1]) for feat in vctr]

    def prepare_data(self, corpus, random_state=42, test_size=0.3, feature_zones=None):
        if feature_zones:
            self.feat_zones_ = feature_zones
        self.groups = []
        self.x_data = []
        self.y_data = []
        self.stats['class'] = []
        self.cur_data_ = 'Singletons'
        self.class_names_ = ('non-singleton', 'singleton')
        save_features = True
        exceptions = {u'и', u'в', u'а', u'к', u'у', u'по', u'где', u'ведь', u'с'}

        for i_text, text in enumerate(corpus.texts):
            for i, mention in enumerate(corpus.mentions[i_text]):
                group = corpus.heads_index[i_text][mention.offset]
                # Skip likely tagging errors: function-word lemmas tagged as nouns.
                if group.lemma[0] in exceptions and group.tags[0].startswith('N'):
                    continue
                # A mention absent from the gold-standard coreference index is a singleton.
                if i not in rucoref.gs_index[i_text]:
                    self.y_data.append(self.class_names_.index('singleton'))
                else:
                    self.y_data.append(self.class_names_.index('non-singleton'))
                self.x_data.append(self.get_feature_vector(corpus, group, i_text, save_features))
                self.groups.append(group)
                self.stats['class'].append(self.class_names_[self.y_data[-1]])
                save_features = False
                #pronoun_index = self.feature_names_.index('is_pronoun')
                #if self.x_data[-1][pronoun_index]:
                #    self.x_data.pop()
                #    self.y_data.pop()
                #    continue
                #del self.x_data[-1][pronoun_index]

        super(SingletonClassifier, self).prepare_data(corpus, random_state, test_size)
        #del self.feature_names_[pronoun_index]

        class_numbers = [sum(1 for item in self.y_data if item == cur_class)
                         for cur_class in range(len(self.class_names_))]
        self.ratio = float(min(class_numbers)) / float(max(class_numbers))
In [15]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state)
In [16]:
def baseline_predict(data):
    # Baseline: label a mention as a singleton iff neither the full group nor
    # its head has an exact string match earlier in the text (the first two
    # feature columns, 'str_match_before=0' and 'head_match_before=0').
    y_pred = np.zeros(len(data))
    for i, row in enumerate(data):
        y_pred[i] = (row[0] == 1 and row[1] == 1)
    return y_pred
In [17]:
singleton_clf.test(y_pred=baseline_predict(singleton_clf.x_data_test), test_name='baseline')
In [18]:
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string',))
In [19]:
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)
singleton_clf.fit(clf, sampler)
In [20]:
singleton_clf.print_stats()
In [21]:
len(singleton_clf.x_data_train)
Out[21]:
In [22]:
singleton_clf.test(test_name='string features')
In [23]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)
singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='string+struct features')
In [24]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)
singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='string+struct+lists')
In [25]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)
singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='all features')
In [26]:
for i, feat_val in enumerate(singleton_clf.clf_.feature_importances_):
    print('{}: {:.4f}'.format(singleton_clf.feature_names_[i], feat_val))
In [27]:
with open('singletons.all.txt', 'w', encoding='utf-8') as out_singletons, \
     open('non-singletons.all.txt', 'w', encoding='utf-8') as out_non_singletons:
    for i, item in enumerate(singleton_clf.groups_train):
        if singleton_clf.y_data_train[i] == 1:
            out_singletons.write(str(item) + '\n')
        else:
            out_non_singletons.write(str(item) + '\n')
In [28]:
# Dump test-set errors: false positives (non-singletons predicted as
# singletons) and false negatives (singletons predicted as non-singletons).
y_pred = singleton_clf.clf_.predict(singleton_clf.x_data_test)
with open('singletons.fp.txt', 'w', encoding='utf-8') as out_fp, \
     open('singletons.fn.txt', 'w', encoding='utf-8') as out_fn:
    for i, item in enumerate(singleton_clf.groups_test):
        if singleton_clf.y_data_test[i] == 0 and y_pred[i] != singleton_clf.y_data_test[i]:
            out_fp.write(str(item) + '\n')
        if singleton_clf.y_data_test[i] == 1 and y_pred[i] != singleton_clf.y_data_test[i]:
            out_fn.write(str(item) + '\n')
In [29]:
regr = LogisticRegression(random_state=random_state, max_iter=200)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)
singleton_clf.fit(regr, sampler)
In [30]:
for i, feat_name in enumerate(singleton_clf.feature_names_):
    print('{}: {:.4f}'.format(feat_name, regr.coef_[0, i]))
In [31]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
In [32]:
import anaphoralib.experiments.utils
In [33]:
singleton_clf.stats.keys()
Out[33]:
In [34]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))

feature_distributions = {}
for feat_name in singleton_clf.stats:
    feature_distributions[feat_name] = {cls: [] for cls in singleton_clf.class_names_ + ('total',)}
    for i, elem in enumerate(singleton_clf.stats['class']):
        feature_distributions[feat_name][elem].append(singleton_clf.stats[feat_name][i])
        feature_distributions[feat_name]['total'].append(singleton_clf.stats[feat_name][i])
In [35]:
anaphoralib.experiments.utils.latexify()
In [36]:
def plot_feat_distribution(distribution, bins, class_names, x_label='Feature value', filename='plot'):
    ax = plt.gca()
    ax.set_xlabel(x_label)
    ax.set_ylabel("Density")
    #ax.set_title("Distribution of feature")
    plt.tight_layout()
    format_axes(ax)  # axis styling helper, assumed to accompany latexify() in anaphoralib.experiments.utils

    normed = True
    true_hist = np.histogram(distribution[class_names[1]], bins, density=normed)
    false_hist = np.histogram(distribution[class_names[0]], bins, density=normed)

    # Side-by-side bars for the two classes, one pair per histogram bin.
    w = 0.3
    true_x = [item for item in range(len(true_hist[0]))]
    false_x = [item + w for item in range(len(false_hist[0]))]
    ax.set_xticks([item + float(w) for item in true_x])
    ax.set_xticklabels(true_x)

    rects1 = plt.bar(false_x, false_hist[0], w, color='0.3')
    rects2 = plt.bar(true_x, true_hist[0], w, color='0.7')

    plt.legend((rects1, rects2), class_names, loc='upper right')
    plt.savefig("{}.pdf".format(filename))
    plt.show()
    plt.close()
In [37]:
import os

anaphoralib.experiments.utils.latexify(columns=2)
for feat_name in feature_distributions:
    if feat_name == 'class':
        continue
    anaphoralib.experiments.utils.plot_feature_distribution(feature_distributions[feat_name], range(7),
                                                            singleton_clf.class_names_,
                                                            x_label=feat_name.replace('_', '\\_'),
                                                            filename=os.path.join('CLLS-2016', feat_name))
In [38]:
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils import shuffle
In [39]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
In [40]:
shuffled_x_data, shuffled_y_data = shuffle(singleton_clf.x_data, singleton_clf.y_data, random_state=random_state)
train_sizes_abs, train_scores, test_scores = learning_curve(clf,
                                                            shuffled_x_data,
                                                            shuffled_y_data,
                                                            cv=3,
                                                            scoring=make_scorer(f1_score, pos_label=0))
In [41]:
anaphoralib.experiments.utils.latexify(columns=2)
anaphoralib.experiments.utils.plot_learning_curve(train_sizes_abs,
                                                  train_scores, test_scores,
                                                  score_name='f1',
                                                  filename=os.path.join('CLLS-2016', 'learning_curve_plot'))
Out[41]: