Comparison of anaphoricity annotations (Dittrich vs. Tosik)



In [1]:

    
import os
from glob import glob
from collections import defaultdict
from discoursegraphs.readwrite import AnaphoraDocumentGraph
from discoursegraphs import print_dot, info, select_nodes_by_layer



In [2]:

    
TOSIK_DIR = os.path.expanduser('~/repos/pcc-annis-merged/maz176/anaphora/tosik/')
DITTRICH_DIR = os.path.expanduser('~/repos/pcc-annis-merged/maz176/anaphora/dittrich/')
KOBOLD_DIR = os.path.expanduser('~/repos/pcc-annis-merged/maz176/anaphora/kobold/')



In [3]:

    
def have_same_annotation(graph1, graph2, node_id):
    return graph1.node[node_id][graph1.ns+':annotation'] == \
        graph2.node[node_id][graph2.ns+':annotation']



In [4]:

    
def get_anaphoricity(graph, token_id):
    return graph.node[token_id][graph.ns+':annotation'], graph.node[token_id][graph.ns+':certainty']



In [10]:

    
def anaphoricity_agreement(annotator1_dir, annotator2_dir, anaphora):
    """
    calculates the inter-annotator agreement between two annotators for
    'das'/'es' anaphoricity annotation.
    
    Parameters
    ----------
    annotator1_dir : str
        root directory of the annotation files of the first annotator
    annotator2_dir : str
        root directory of the annotation files of the second annotator
    anaphora : str
        'das' or 'es'
    """
    annotator1_files = glob(os.path.join(annotator1_dir, anaphora, '*.txt'))
    annotator2_files = glob(os.path.join(annotator2_dir, anaphora, '*.txt'))

    num_of_annotated_tokens = 0
    num_of_identical_annotations = 0
    
    annotations = defaultdict(lambda: defaultdict(dict))

    for annotator1_file in annotator1_files:
        annotator1_graph = AnaphoraDocumentGraph(annotator1_file, namespace='annotator1')
        fname = os.path.basename(annotator1_file)
        
        try:
            annotator2_graph = AnaphoraDocumentGraph(os.path.join(annotator2_dir, anaphora, fname), namespace='annotator2')
        except:
            continue

        annotator1_anaphora = set(select_nodes_by_layer(annotator1_graph, 'annotator1:annotated'))
        annotator2_anaphora = set(select_nodes_by_layer(annotator2_graph, 'annotator2:annotated'))
        assert annotator1_anaphora == annotator2_anaphora

        num_of_annotated_tokens += len(annotator1_anaphora)
        for node_id in annotator1_anaphora:
            if have_same_annotation(annotator1_graph, annotator2_graph, node_id):
                num_of_identical_annotations += 1
            
            ns1, ns2 = annotator1_graph.ns, annotator2_graph.ns
            annotations[(fname, node_id)][ns1] = get_anaphoricity(annotator1_graph, node_id)
            annotations[(fname, node_id)][ns2] = get_anaphoricity(annotator2_graph, node_id)
                                            
    title = "'{}' ANNOTATION".format(anaphora.upper())
    print "{0}\n{1}\n".format(title, len(title)*'=')
    print "annotated tokens: ", num_of_annotated_tokens
    print "identically annotatated tokens: ", num_of_identical_annotations
    print "agreement: ", num_of_identical_annotations / float(num_of_annotated_tokens) * 100
    return annotations

Comparison of 'es' annotations

agreement is much worse than for 'das'



In [11]:

    
es_annotations = anaphoricity_agreement(TOSIK_DIR, DITTRICH_DIR, 'es')









    



'ES' ANNOTATION
===============

annotated tokens:  284
identically annotatated tokens:  219
agreement:  77.1126760563



In [12]:

    
es_annotations = anaphoricity_agreement(KOBOLD_DIR, TOSIK_DIR, 'das')









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-12-5fb2468e735b> in <module>()
----> 1 es_annotations = anaphoricity_agreement(KOBOLD_DIR, TOSIK_DIR, 'das')

<ipython-input-10-1e881227443b> in anaphoricity_agreement(annotator1_dir, annotator2_dir, anaphora)
     32         annotator1_anaphora = set(select_nodes_by_layer(annotator1_graph, 'annotator1:annotated'))
     33         annotator2_anaphora = set(select_nodes_by_layer(annotator2_graph, 'annotator2:annotated'))
---> 34         assert annotator1_anaphora == annotator2_anaphora
     35 
     36         num_of_annotated_tokens += len(annotator1_anaphora)

AssertionError:

distinguishing between pleonastic and abstract es seems hard



In [ ]:

    
diverging_annotations = defaultdict(lambda: defaultdict(dict))

for anno in es_annotations:
    # if annotations are different
    anno1 = es_annotations[anno]['annotator1'][0]
    anno2 = es_annotations[anno]['annotator2'][0]
    if anno1 != anno2:
        if anno2 in diverging_annotations[anno1]:
            diverging_annotations[anno1][anno2] += 1
        else:
            diverging_annotations[anno1][anno2] = 1



In [ ]:

    
for anno1 in diverging_annotations:
    print anno1, diverging_annotations[anno1]



In [ ]:

    
diverging_annotations = defaultdict(lambda: defaultdict(dict))

for anno in es_annotations:
    # if annotations are different
    anno1 = es_annotations[anno]['annotator1'][0]
    anno2 = es_annotations[anno]['annotator2'][0]
    if anno1 != anno2:
        if anno1 in diverging_annotations[anno2]:
            diverging_annotations[anno2][anno1] += 1
        else:
            diverging_annotations[anno2][anno1] = 1
            
for anno2 in diverging_annotations:
    print anno2, diverging_annotations[anno2]

Comparison of 'das' annotations



In [ ]:

    
das_annotations = anaphoricity_agreement(TOSIK_DIR, DITTRICH_DIR, 'das')



In [ ]:

    
diverging_annotations = defaultdict(lambda: defaultdict(dict))

for anno in das_annotations:
    # if annotations are different
    anno1 = das_annotations[anno]['annotator1'][0]
    anno2 = das_annotations[anno]['annotator2'][0]
    if anno1 != anno2:
        if anno2 in diverging_annotations[anno1]:
            diverging_annotations[anno1][anno2] += 1
        else:
            diverging_annotations[anno1][anno2] = 1

for anno1 in diverging_annotations:
    print anno1, '-->', diverging_annotations[anno1]



In [ ]:

    
diverging_annotations = defaultdict(lambda: defaultdict(dict))

for anno in das_annotations:
    # if annotations are different
    anno1 = das_annotations[anno]['annotator1'][0]
    anno2 = das_annotations[anno]['annotator2'][0]
    if anno1 != anno2:
        if anno1 in diverging_annotations[anno2]:
            diverging_annotations[anno2][anno1] += 1
        else:
            diverging_annotations[anno2][anno1] = 1

for anno2 in diverging_annotations:
    print anno2, '-->', diverging_annotations[anno2]



In [ ]:

    
%matplotlib inline

import numpy as np
import pandas as pd



In [ ]:

    
def generate_disagreement_matrices(annotations):
    """
    Parameters
    ----------
    annotations : defaultdict(dict) or dict of dicts
        maps from a (filename str, token_id int) tuple to a
        dict with the keys 'annotator1' and 'annotator2' and
        (annotation str, certainty float) tuples as values.
    """

    one2two = defaultdict(lambda: defaultdict(dict))
    two2one = defaultdict(lambda: defaultdict(dict))

    for anno in annotations:
        anno1 = annotations[anno]['annotator1'][0]
        anno2 = annotations[anno]['annotator2'][0]
        if anno1 != anno2:
            if anno2 in one2two[anno1]:
                one2two[anno1][anno2] += 1
            else:
                one2two[anno1][anno2] = 1
            
            if anno1 in two2one[anno2]:
                two2one[anno2][anno1] += 1
            else:
                two2one[anno2][anno1] = 1

    return one2two, two2one



In [ ]:

    
das_annotations = anaphoricity_agreement(TOSIK_DIR, DITTRICH_DIR, 'das')
one2two, two2one = generate_disagreement_matrices(das_annotations)



In [ ]:

    
one2two

rows = one2two.keys()
print rows

cols = set(k for v in one2two.values() for k in v.keys())
print cols



In [ ]:

    
one2two_df = pd.DataFrame(np.zeros((len(rows), len(cols)), dtype=np.int), columns=cols, index=rows)
one2two_df



In [ ]:

    
for anno in one2two:
    for disagree_anno, disagree_count in one2two[anno].items():
        one2two_df[disagree_anno][anno] = disagree_count



In [ ]:

    
one2two_plot = one2two_df.plot(kind='bar', stacked=True)
one2two_plot.set_xlabel("Annotator 1 annotated ...")
one2two_plot.set_ylabel("but Annotator 2 disagreed")



In [ ]:

    
two2one
rows = two2one.keys()
cols = set(k for v in two2one.values() for k in v.keys())
two2one_df = pd.DataFrame(np.zeros((len(rows), len(cols)), dtype=np.int), columns=cols, index=rows)
for anno in two2one:
    for disagree_anno, disagree_count in two2one[anno].items():
        two2one_df[disagree_anno][anno] = disagree_count

two2one_plot = two2one_df.plot(kind='bar', stacked=True)
two2one_plot.set_xlabel("Annotator 2 annotated ...")
two2one_plot.set_ylabel("but Annotator 1 disagreed")

Possible improvements

include statistics for agreement grouped by n/a/p/r annotation
add titles to plots to differenciate betw. 'das' and 'es'
allow comparisons for more than 2 annotators
add annotator names early in the process and add them to all plots
try subplots?
try out different visualizations
- chord diagrams (d3)
  - http://bl.ocks.org/mbostock/4062006
  - http://bl.ocks.org/mbostock/1308257
- parallel sets (d3 or java app)
  - http://www.jasondavies.com/parallel-sets/
  - https://eagereyes.org/parallel-sets



In [ ]: