Experiment for the paper "Identification of singleton mentions in Russian"

A replication of the CLLS-2016 paper (Ionov and Toldova 2016)

To reproduce this experiment, you will need:

  1. the RuCor corpus (the 2015-10-29 release)
  2. the Python modules scikit-learn, imbalanced-learn, numpy, matplotlib, and seaborn
  3. the anaphoralib Python module

Since anaphoralib is at an early stage of development, it cannot be installed as a package yet; to import it, cd into the folder that contains the module. Paths to the corpus should be updated accordingly.
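If you prefer not to change the working directory, adding the module's folder to sys.path before the imports works as well. A minimal sketch (the path is an example; point it at your own checkout):

import sys
sys.path.insert(0, '/Users/max/Projects/Coreference/rucoref')  # folder containing anaphoralib
from anaphoralib.corpora import rueval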


In [1]:
%cd '/Users/max/Projects/Coreference/'


/Users/max/Projects/Coreference

In [2]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.experiments.base import BaseClassifier
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import BorderlineSMOTE

import numpy as np

%matplotlib inline


/Users/max/Projects/Coreference/rucoref
/Users/max/Projects/Coreference

In [3]:
lists_dir = 'CLLS-2016/wordlists'
texts_dir = 'Corpus-2015/Tokens.txt'
gs_dir = 'Corpus-2015/Groups.txt'

tagset = multeast

random_state = 42

Reading the texts from the gold standard (GS) and matching them to the actual texts

Loading chains and GS


In [4]:
rucoref = rueval.RuCorefCorpus(multeast, rueval)

In [5]:
exp_utils.load_corpus(rucoref, texts_dir, gs_dir)


['doc_id', 'shift', 'length', 'token', 'lemma', 'gram']
dict_keys(['doc_id', 'shift', 'length', 'token', 'lemma', 'gram'])

In [6]:
rucoref.groups[0][:10]


Out[6]:
[Во:во(Sp-a, 0),
 время своих прогулок:время свой прогулка(Ncnsan, 3),
 в:в(Sp-l, 24),
 окрестностях Симеиза:окрестность симеиза(Ncfpln, 26),
 я:я(P-1-snn, 47),
 обратил:обратить(Vmis-sma-p, 49),
 внимание:внимание(Ncnsan, 57),
 на:на(Sp-a, 66),
 одинокую дачу:одинокий дача(Ncfsan, 69),
 стоявшую:стоять(Vmps-sfa-ea, 84)]

In [7]:
rucoref.print_stats()


Number of texts: 181
Number of GS texts: 181
Number of chains in a corpus: 3383
Number of words in all chains: 15886

In [8]:
rucoref.create_indices()

Loading special lists

Special lists are loaded from the directory stored in lists_dir: each file holds one entry per line, and the file name (without the .txt extension) becomes the name of the list.
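With the lists used below, the directory is expected to look like this:

CLLS-2016/wordlists/
    neg_pronouns.txt
    indef.txt
    non-identity_sim.txt
    possessives.txt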


In [9]:
import codecs

def load_list(filename):
    data = set()
    with codecs.open(filename, encoding='utf-8') as inp_file:
        for line in inp_file:
            data.add(line.strip('\r\n'))
    
    return data

In [10]:
import os

wordlists = {}

for filename in os.listdir(lists_dir):
    wordlists[filename.replace('.txt', '')] = load_list(os.path.join(lists_dir, filename))

In [11]:
print(wordlists.keys())


dict_keys(['neg_pronouns', 'indef', 'non-identity_sim', 'possessives'])

Building indices and dictionaries

Building additional indices (of all words and all groups):


In [12]:
import collections

word_index = []
group_index = []

for i, text in enumerate(rucoref.texts):
    word_index.append(collections.defaultdict(set))
    group_index.append(collections.defaultdict(set))
    
    for word in text:
        word_index[-1]['_'.join(word.lemma)].add(word.offset)
    for group in rucoref.groups[i]:
        for g in group.iter_groups():
            group_index[-1]['_'.join(g.lemma)].add(g.offset)

In [13]:
print('\n'.join(list(group_index[0].keys())[:15]))


во
время_свой_прогулка
время
свой
прогулка
в
окрестность_симеиза
окрестность
симеиза
я
обратить
внимание
на
одинокий_дача
одинокий

Creating a classifier


In [14]:
import re

class SingletonClassifier(BaseClassifier):
    def __init__(self):
        super(SingletonClassifier, self).__init__()
        
        self.feat_zones_ = ('struct', 'string', 'lists', 'synt')
        # names of per-mention statistics; clear_stats() initializes an empty list for each
        self.stats = {'str_matches_before', 'head_matches_before', 'n_adj', 'len_np', 'is_genitive'}
        self.stats.update('in_list_{}'.format(l) for l in wordlists)
        
        self.rx_lat = re.compile('[A-Za-z]')
        self.pronouns = {u"его", u"ее", u"её", u"ей", u"ему", u"ею", u"им", u"ими", u"их", u"которая", 
                u"которого", u"которое", u"которой", u"котором", u"которому", u"которую", u"которые", 
                u"который", u"которым", u"которыми", u"которых", u"него", u"нее", u"неё", u"ней", u"нем", 
                u"нём", u"нему", u"нею", u"ним", u"ними", u"них", u"он", u"она", u"они", u"оно", u"свое", 
                u"своё", u"своего", u"своей", u"своем", u"своём", u"своему", u"своею", u"свой", u"свои", 
                u"своим", u"своими", u"своих", u"свою", u"своя", u"себе", u"себя", u"собой", u"собою"}
        
        self.clear_stats()
        
    def get_feature_vector(self, corpus, group, i_text, save_feature_names=False):
        if save_feature_names:
            self.feature_names_ = []
            
        vctr = []
        
        group_lemma = '_'.join(group.lemma)
        group_occurrences = group_index[i_text][group_lemma] if group_lemma in group_index[i_text] else []

        head_index = group.head
        head_lemma = group.lemma[group.head]
        head_occurrences = word_index[i_text][head_lemma] if head_lemma in word_index[i_text] else []
        head_offset = group.head_offset

        group_words = group.words if group.type != 'word' else [group]

        str_matches_before = sum(1 for occ in group_occurrences if occ < group.offset)
        head_matches_before = sum(1 for occ in head_occurrences if occ < group.offset)
        
        adj_in_group = [word for word in group_words[:head_index+1] if tagset.pos_filters['adj'](word)]
        
        self.stats['str_matches_before'].append(str_matches_before)
        self.stats['head_matches_before'].append(head_matches_before)
        
        self.stats['n_adj'].append(len(adj_in_group))
        self.stats['len_np'].append(len(group_words))
        
        if 'string' in self.feat_zones_:
            vctr.append(('str_match_before=0', str_matches_before == 0))
            vctr.append(('head_match_before=0', head_matches_before == 0))
            
            #vctr.append(('uppercase', all(word.isupper() and len(word) > 1 for word in group.wordform)))
            #vctr.append(('capitalized', any(word[0].isupper() and len(group.wordform) > 1 for word in group.wordform[1:])))
            vctr.append(('latin', any(self.rx_lat.search(word) for word in group.wordform)))
            vctr.append(('is_proper_noun', corpus.tagset.pos_filters['properNoun'](group)))
            
            vctr.append(('is_animate', corpus.tagset.extract_feature('animate', group) == u'y'))
        vctr.append(('is_pronoun', group.wordform[0] in self.pronouns and len(group_words) == 1))
        
        i_word = corpus.words_index[i_text][group.offset]
        # tokens immediately preceding and following the group
        left_word = corpus.texts[i_text][i_word - 1] if i_word > 0 else None
        right_word = corpus.texts[i_text][i_word + len(group.wordform)] \
                            if i_word + len(group.wordform) < len(corpus.texts[i_text]) else None
        
        if 'struct' in self.feat_zones_:
            #vctr.append(('conj', bool((left_word and corpus.tagset.pos_filters['conj'](left_word)) 
            #            or (right_word and corpus.tagset.pos_filters['conj'](right_word)))))
            
            vctr.append(('len_np==1', len(group.tags) == 1))
            vctr.append(('1<len_np<4', 1 < len(group.tags) < 4))
            vctr.append(('len_np>=4', len(group.tags) >= 4))
            
            vctr.append(('n_adj=0', len(adj_in_group) == 0))
            #vctr.append(('n_adj>1', len(adj_in_group) > 1))
            vctr.append(('n_adj>2', len(adj_in_group) > 2))
            
            vctr.append(('is_genitive', corpus.tagset.extract_feature('case', group) == u'g'))
            self.stats['is_genitive'].append(vctr[-1][1])
        
        sent_begin = left_word is None or left_word.tag == 'SENT'
        sent_end = right_word is None or right_word.tag == 'SENT'
        
        nomin = corpus.tagset.extract_feature('case', group) == u'n'
        accus = corpus.tagset.extract_feature('case', group) == u'a'
        
        if 'synt' in self.feat_zones_:
            vctr.append(('is_subject', nomin or sent_begin))
            #vctr.append(('is_object', accus or sent_end))
            
        if 'lists' in self.feat_zones_:
            for l in wordlists:
                feat_name = 'in_list_{}'.format(l)
                vctr.append((feat_name, any(lemma in wordlists[l] for lemma in group.lemma[:head_index+1])))
                self.stats[feat_name].append(vctr[-1][1])
        
        if save_feature_names:
            self.feature_names_ = [feat[0] for feat in vctr]
        
        return [int(feat[1]) for feat in vctr]
    
    def prepare_data(self, corpus, random_state=42, test_size=0.3, feature_zones=None):
        if feature_zones:
            self.feat_zones_ = feature_zones
        
        self.groups = []
        self.x_data = []
        self.y_data = []
        
        self.stats['class'] = []

        self.cur_data_ = 'Singletons'
        self.class_names_ = ('non-singleton', 'singleton')
        
        save_features = True
    
        # function-word lemmas that occasionally receive a noun tag; such groups are skipped as tagging noise
        exceptions = {u'и', u'в', u'а', u'к', u'у', u'по', u'где', u'ведь', u'с'}
        for i_text, text in enumerate(corpus.texts):
            for i, mention in enumerate(corpus.mentions[i_text]):
                group = corpus.heads_index[i_text][mention.offset]
                if group.lemma[0] in exceptions and group.tags[0].startswith('N'):
                    continue
                    
                if i not in rucoref.gs_index[i_text]:
                    self.y_data.append(self.class_names_.index('singleton'))
                else:
                    self.y_data.append(self.class_names_.index('non-singleton'))

                self.x_data.append(self.get_feature_vector(corpus, group, i_text, save_features))
                self.groups.append(group)
                self.stats['class'].append(self.class_names_[self.y_data[-1]])
                save_features = False

                #pronoun_index = self.feature_names_.index('is_pronoun')
                #if self.x_data[-1][pronoun_index]:
                #    self.x_data.pop()
                #    self.y_data.pop()
                #    continue
                
                #del self.x_data[-1][pronoun_index]
            
        super(SingletonClassifier, self).prepare_data(corpus, random_state, test_size)
    
        #del self.feature_names_[pronoun_index]
        class_numbers = [sum(1 for item in self.y_data if item == cur_class)
                         for cur_class in range(len(self.class_names_))]
        self.ratio = min(class_numbers) / max(class_numbers)

Training and testing


In [15]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state)

Baseline

Baseline condition: an NP is a singleton if neither its exact string nor its head occurs earlier in the text


In [16]:
def baseline_predict(data):
    y_pred = np.zeros(len(data))
    for i, row in enumerate(data):
        # row[0] and row[1] are the first two features, 'str_match_before=0' and
        # 'head_match_before=0': predict 'singleton' (1) iff both hold
        y_pred[i] = (row[0] == 1 and row[1] == 1)

    return y_pred

In [17]:
singleton_clf.test(y_pred=baseline_predict(singleton_clf.x_data_test), test_name='baseline')


Report baseline:                precision    recall  f1-score   support

non-singleton      0.452     0.668     0.539      4278
    singleton      0.831     0.668     0.741     10446

     accuracy                          0.668     14724
    macro avg      0.641     0.668     0.640     14724
 weighted avg      0.721     0.668     0.682     14724

String features


In [18]:
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string',))

In [19]:
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)

singleton_clf.fit(clf, sampler)

In [20]:
singleton_clf.print_stats()


Classifier <class 'sklearn.ensemble._forest.RandomForestClassifier'>: fitted
Data: Singletons (non-singleton, singleton)
Total:	49078 samples
	14289 non-singleton
	34789 singleton

In [21]:
len(singleton_clf.x_data_train)


Out[21]:
34354

In [22]:
singleton_clf.test(test_name='string features')


Report string features:                precision    recall  f1-score   support

non-singleton      0.468     0.519     0.492      4278
    singleton      0.794     0.758     0.775     10446

     accuracy                          0.689     14724
    macro avg      0.631     0.639     0.634     14724
 weighted avg      0.699     0.689     0.693     14724

String + Struct features


In [23]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)

singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='string+struct features')


Report string+struct features:                precision    recall  f1-score   support

non-singleton      0.550     0.669     0.604      4278
    singleton      0.851     0.776     0.812     10446

     accuracy                          0.745     14724
    macro avg      0.701     0.723     0.708     14724
 weighted avg      0.764     0.745     0.752     14724

String + Struct + List features


In [24]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)

singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='string+struct+lists')


Report string+struct+lists:                precision    recall  f1-score   support

non-singleton      0.527     0.694     0.599      4278
    singleton      0.856     0.745     0.797     10446

     accuracy                          0.730     14724
    macro avg      0.691     0.719     0.698     14724
 weighted avg      0.760     0.730     0.739     14724

All features


In [25]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)

singleton_clf.fit(clf, sampler)
singleton_clf.test(test_name='all features')


Report all features:                precision    recall  f1-score   support

non-singleton      0.551     0.678     0.608      4278
    singleton      0.854     0.774     0.812     10446

     accuracy                          0.746     14724
    macro avg      0.703     0.726     0.710     14724
 weighted avg      0.766     0.746     0.753     14724


In [26]:
for i, feat_val in enumerate(singleton_clf.clf_.feature_importances_):
    print('{}: {:.4f}'.format(singleton_clf.feature_names_[i], feat_val))


str_match_before=0: 0.1417
head_match_before=0: 0.1265
latin: 0.0058
is_proper_noun: 0.0363
is_animate: 0.1425
is_pronoun: 0.1410
len_np==1: 0.0654
1<len_np<4: 0.0422
len_np>=4: 0.0107
n_adj=0: 0.0429
n_adj>2: 0.0005
is_genitive: 0.0977
is_subject: 0.1055
in_list_neg_pronouns: 0.0005
in_list_indef: 0.0231
in_list_non-identity_sim: 0.0018
in_list_possessives: 0.0159

In [27]:
# Write training groups to separate files by gold class ('with' keeps the files closed).
with open('singletons.all.txt', 'w', encoding='utf-8') as out_singletons, \
     open('non-singletons.all.txt', 'w', encoding='utf-8') as out_non_singletons:
    for i, group in enumerate(singleton_clf.groups_train):
        if singleton_clf.y_data_train[i] == 1:   # class 1 is 'singleton'
            out_singletons.write(str(group) + '\n')
        else:
            out_non_singletons.write(str(group) + '\n')

In [28]:
# Write misclassified test groups for error analysis.
with open('singletons.fp.txt', 'w', encoding='utf-8') as out_fp, \
     open('singletons.fn.txt', 'w', encoding='utf-8') as out_fn:
    y_pred = singleton_clf.clf_.predict(singleton_clf.x_data_test)
    for i, group in enumerate(singleton_clf.groups_test):
        # false positive: gold non-singleton (0) predicted as singleton
        if singleton_clf.y_data_test[i] == 0 and y_pred[i] != singleton_clf.y_data_test[i]:
            out_fp.write(str(group) + '\n')
        # false negative: gold singleton (1) predicted as non-singleton
        if singleton_clf.y_data_test[i] == 1 and y_pred[i] != singleton_clf.y_data_test[i]:
            out_fn.write(str(group) + '\n')

Calculating feature weights with logistic regression


In [29]:
regr = LogisticRegression(random_state=random_state, max_iter=200)
sampler = BorderlineSMOTE(sampling_strategy='auto', kind='borderline-1', random_state=random_state)

singleton_clf.fit(regr, sampler)

In [30]:
for i, feat_name in enumerate(singleton_clf.feature_names_):
    print('{}: {:.4f}'.format(feat_name, regr.coef_[0,i]))


str_match_before=0: 0.0055
head_match_before=0: 0.9502
latin: -0.1537
is_proper_noun: -0.0272
is_animate: -0.7836
is_pronoun: -2.8461
len_np==1: -0.1373
1<len_np<4: 0.7212
len_np>=4: 1.1991
n_adj=0: 0.2601
n_adj>2: -0.1498
is_genitive: 0.4003
is_subject: -0.5809
in_list_neg_pronouns: 1.3862
in_list_indef: 0.8627
in_list_non-identity_sim: 1.3475
in_list_possessives: -0.0682

Additional actions

Getting feature distributions


In [31]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [32]:
import anaphoralib.experiments.utils

In [33]:
singleton_clf.stats.keys()


Out[33]:
dict_keys(['len_np', 'str_matches_before', 'in_list_neg_pronouns', 'head_matches_before', 'in_list_indef', 'in_list_non-identity_sim', 'n_adj', 'is_genitive', 'in_list_possessives', 'class'])

In [34]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))

feature_distributions = {}
for feat_name in singleton_clf.stats:
    feature_distributions[feat_name] = {cls: [] for cls in singleton_clf.class_names_ + ('total',)}
    
    for i, elem in enumerate(singleton_clf.stats['class']):
        feature_distributions[feat_name][elem].append(singleton_clf.stats[feat_name][i])
        feature_distributions[feat_name]['total'].append(singleton_clf.stats[feat_name][i])

In [35]:
anaphoralib.experiments.utils.latexify()

In [36]:
def plot_feat_distribution(distribution, bins, class_names, x_label='Feature value', filename='plot'):
    
    ax = plt.gca()
    ax.set_xlabel(x_label)
    ax.set_ylabel("Density")
    #ax.set_title("Distribution of feature")
    plt.tight_layout()
    format_axes(ax)  # assumed to be provided by anaphoralib.experiments.utils
    
    normed = True
    
    true_hist = np.histogram(distribution[class_names[1]], bins, density=normed)
    false_hist = np.histogram(distribution[class_names[0]], bins, density=normed)
    
    w = 0.3
    
    true_x = [item for item in range(len(true_hist[0]))]
    false_x = [item+w for item in range(len(false_hist[0]))]
    
    ax.set_xticks([item + float(w) for item in true_x])
    ax.set_xticklabels(true_x)
    
    rects1 = plt.bar(false_x, false_hist[0], w, color='0.3')
    rects2 = plt.bar(true_x, true_hist[0], w, color='0.7')
    plt.legend((rects1, rects2), class_names, loc='upper right')
    
    plt.savefig("{}.pdf".format(filename))
    plt.show()
    plt.close()

In [37]:
import os

anaphoralib.experiments.utils.latexify(columns=2)
for feat_name in feature_distributions:
    if feat_name == 'class':
        continue
    anaphoralib.experiments.utils.plot_feature_distribution(feature_distributions[feat_name], range(7),
                                                            singleton_clf.class_names_, 
                           x_label=feat_name.replace('_', '\\_'), filename=os.path.join('CLLS-2016', feat_name))


/Users/max/anaconda3/lib/python3.7/site-packages/numpy/lib/histograms.py:778: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility.
  a, weights = _ravel_and_check_weights(a, weights)

In [38]:
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils import shuffle

In [39]:
singleton_clf = SingletonClassifier()
singleton_clf.prepare_data(rucoref, random_state=random_state, feature_zones=('string', 'struct', 'lists', 'synt'))
clf = RandomForestClassifier(n_estimators=200, random_state=random_state)

In [40]:
shuffled_x_data, shuffled_y_data = shuffle(singleton_clf.x_data, singleton_clf.y_data, random_state=random_state)

train_sizes_abs, train_scores, test_scores = learning_curve(clf,
                                                            shuffled_x_data, 
                                                            shuffled_y_data,
                                                            cv=3,
                                                            scoring=make_scorer(f1_score, pos_label=0))

In [41]:
anaphoralib.experiments.utils.latexify(columns=2)
anaphoralib.experiments.utils.plot_learning_curve(train_sizes_abs, 
                                                  train_scores, test_scores, 
                                                  score_name='f1',
                                                  filename=os.path.join('CLLS-2016', 'learning_curve_plot'))


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a31c35b70>