In [1]:
import os
import glob
import tempfile
import sh
import discoursegraphs as dg
from discoursegraphs.readwrite import MMAXDocumentGraph, write_conll
from discoursekernels.util import draw_multiple_graphs
In [2]:
%load_ext gvmagic
from discoursegraphs import print_dot
In [3]:
# wget http://www.ling.uni-potsdam.de/acl-lab/Forsch/pcc/potsdam-commentary-corpus-2.0.0.zip
# unzip potsdam-commentary-corpus-2.0.0.zip -d ~/corpora
In [4]:
MMAX_DIR = os.path.expanduser('~/repos/pcc-annis-merged/maz176/coreference')
In [5]:
# grab a copy of the official CoNLL scorer using git-svn instead of svn
# git svn clone http://reference-coreference-scorers.googlecode.com/svn/trunk/ ~/repos/reference-coreference-scorers
In [6]:
SCORER_PATH = '/home/arne/repos/reference-coreference-scorers/scorer.pl'
In [7]:
scorer = sh.Command(SCORER_PATH)
scorer()
Out[7]:
We're now using the official CoNLL scorer to compare each
MAZ176 coreference annotated document against itself.
All comparisons should result in an F1 of 100%
In [8]:
import sys
def has_valid_annotation(mmax_file, scorer_path, metric, verbose=False):
    """
    Check whether an MMAX2 coreference file scores 100% when evaluated
    against itself with the official CoNLL scorer (scorer.pl).

    Parameters
    ----------
    mmax_file : str
        path to an MMAX2 ``.mmax`` coreference file
    scorer_path : str
        path to the CoNLL ``scorer.pl`` script
    metric : str
        muc, bcub, ceafm, ceafe, blanc
    verbose : bool or str
        True, False or 'very'. If truthy, diagnostics are written to
        stderr for files that don't score 100%; 'very' dumps the full
        scorer output instead of just the score line.

    Returns
    -------
    bool
        True iff the scorer ran successfully and reported 100%.
    """
    scorer = sh.Command(scorer_path)
    mdg = MMAXDocumentGraph(mmax_file)
    # write the CoNLL export to the platform's temp directory instead of
    # a hardcoded '/tmp' (tempfile is already imported at the top)
    conll_fname = os.path.join(
        tempfile.gettempdir(),
        '{}.conll'.format(os.path.basename(mmax_file)))
    write_conll(mdg, conll_fname)
    try:
        # score the file against itself; a self-comparison of a valid
        # annotation must yield 100%
        results = scorer(metric, conll_fname, conll_fname)
        # the second-to-last stdout line carries the overall score
        scores_str = results.stdout.splitlines()[-2]
        if not scores_str.endswith('100%'):
            if verbose == 'very':
                sys.stderr.write("{}\n{}\n".format(conll_fname, results))
            elif verbose:
                sys.stderr.write("{}\n{}\n".format(conll_fname, scores_str))
            return False
    except sh.ErrorReturnCode as e:
        # scorer.pl exited with a non-zero status on this file
        if verbose:
            sys.stderr.write("Error in '{}'\n{}".format(conll_fname, e))
        return False
    return True
def get_bad_scoring_files(mmax_dir, scorer_path, metric, verbose=False):
    """
    Return the paths of all MMAX2 coreference files in ``mmax_dir`` that
    fail to reach a perfect score when evaluated against themselves with
    scorer.pl using the given metric.
    """
    mmax_paths = glob.glob(os.path.join(mmax_dir, '*.mmax'))
    return [path for path in mmax_paths
            if not has_valid_annotation(path, scorer_path, metric,
                                        verbose=verbose)]
In [9]:
blanc_errors = get_bad_scoring_files(MMAX_DIR, SCORER_PATH, 'blanc', verbose=True)
In [10]:
# compute, for each scoring metric, the list of files that don't score
# 100% against themselves (dict: metric name -> list of file paths)
bad_scoring_files = {}
for metric in ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc'):
    bad_scoring_files[metric] = get_bad_scoring_files(MMAX_DIR, SCORER_PATH, metric, verbose=False)
In [11]:
# per-metric error counts, plus the union of all files flagged as
# erroneous by at least one metric
for metric in bad_scoring_files:
    print "number of erroneous files found by '{}': {}".format(metric, len(bad_scoring_files[metric]))
all_bad_files = set()
for metric in bad_scoring_files:
    all_bad_files.update(bad_scoring_files[metric])
print "total number of erroneous files:", len(all_bad_files)
In [12]:
from discoursegraphs import get_pointing_chains
In [13]:
from discoursegraphs.readwrite.mmax2 import spanstring2text
def print_all_chains(docgraph):
    """
    print a list of all pointing chains (i.e. coreference chains)
    contained in a document graph
    """
    # each chain is a list of markable node IDs; print every markable's
    # ID together with the token text its span covers, with a blank
    # line between chains
    for chain in get_pointing_chains(docgraph):
        for node_id in chain:
            print node_id, spanstring2text(docgraph, docgraph.node[node_id][docgraph.ns+':span'])
        print '\n'
In [14]:
mdg = MMAXDocumentGraph(os.path.join(MMAX_DIR, 'maz-3377.mmax'))
print_all_chains(mdg)
In [15]:
from itertools import combinations
def get_ambiguous_markables(mmax_docgraph):
    """Return the markables that occur in two or more coreference chains
    of the given document graph (may contain repeats if a markable is
    shared by more than two chains)."""
    shared = []
    chains_as_sets = [set(chain)
                      for chain in get_pointing_chains(mmax_docgraph)]
    for first, second in combinations(chains_as_sets, 2):
        overlap = first & second
        if overlap:
            shared.extend(overlap)
    return shared
In [16]:
# Hypothesis 1: the files that scorer.pl rejects are those in which a
# markable occurs in more than one coreference chain
files_with_ambigious_chains = []
for mmax_file in glob.glob(os.path.join(MMAX_DIR, '*.mmax')):
    mdg = MMAXDocumentGraph(mmax_file)
    if get_ambiguous_markables(mdg):
        files_with_ambigious_chains.append(mmax_file)
In [17]:
print "# of files with ambiguous coreference chains: ", len(files_with_ambigious_chains)
print "# of files scorer.pl doesn't like: ", len(bad_scoring_files)
if len(files_with_ambigious_chains) > 0:
print "percent of files w/ ambiguous chains that scorer.pl doesn't like:", \
len( set(files_with_ambigious_chains).intersection(set(bad_scoring_files)) ) / len(files_with_ambigious_chains) * 100
Hypothesis 1: the files that scorer.pl rejects are exactly those containing
ambiguous coreference chains. Initially, this was true. After Markus fixed
a bunch of annotations, Hypothesis 1 could not be validated any longer.
In [18]:
# test this with
# markable_32 auf beiden Seiten
# markable_56 Arafat Scharon
mdg = MMAXDocumentGraph(os.path.join(MMAX_DIR, 'maz-19074.mmax'))
In [19]:
mdg.node['markable_56']
Out[19]:
In [20]:
from discoursegraphs import get_span, select_nodes_by_layer
def get_noncontiguous_markables(docgraph):
"""return all markables that don't represent adjacent tokens"""
noncontiguous_markables = []
id2index = {tok_id:i for i, tok_id in enumerate(docgraph.tokens)}
for markable in select_nodes_by_layer(docgraph, docgraph.ns+':markable'):
span_token_ids = get_span(docgraph, markable)
for span_index, tok_id in enumerate(span_token_ids[:-1]):
tok_index = id2index[tok_id]
next_tok_id = span_token_ids[span_index+1]
next_tok_index = id2index[next_tok_id]
if next_tok_index - tok_index != 1:
noncontiguous_markables.append(markable)
return noncontiguous_markables
In [21]:
# Hypothesis 2: the files that scorer.pl rejects are those containing
# markables with non-contiguous token spans
files_with_noncontiguous_markables = []
for mmax_file in glob.glob(os.path.join(MMAX_DIR, '*.mmax')):
    mdg = MMAXDocumentGraph(mmax_file)
    if get_noncontiguous_markables(mdg):
        files_with_noncontiguous_markables.append(mmax_file)
In [22]:
print "# of files with non-continuous markables: ", len(files_with_noncontiguous_markables)
print "# of files scorer.pl doesn't like: ", len(bad_scoring_files)
print "percent of files w/ non-continuous markables that scorer.pl doesn't like:", \
len( set(files_with_noncontiguous_markables).intersection(set(bad_scoring_files)) ) / len(files_with_noncontiguous_markables) * 100
Hypothesis 2 (scorer.pl rejects the files that contain non-contiguous markables) doesn't hold.
In [23]:
# bad-scoring files that are *not* explained by ambiguous chains
mysterious_files = [os.path.basename(fname)
                    for fname in set(all_bad_files).difference(set(files_with_ambigious_chains))]
In [24]:
len(mysterious_files)
Out[24]:
In [25]:
# for fname in mysterious_files:
# mdg = MMAXDocumentGraph(os.path.join(MMAX_DIR, fname))
# print fname, '\n==============\n\n'
# try:
# print_all_chains(mdg)
# except Exception as e:
# print "\n{} FAILED: {}".format(fname, e)
In [26]:
from collections import defaultdict
import networkx as nx
from discoursegraphs import get_text
def get_ambiguous_chains(mmax_docgraph, token_labels=False):
"""
Returns a list of networkx graphs that represent ambiguous
coreference chains. An ambiguous chain represents two or more
coreference chains that share at least one markable.
There should be no ambiguous coreference chains, but the
current version of our annotation guidelines allow them. // SRSLY?
"""
ambiguous_markables = get_ambiguous_markables(mmax_docgraph)
coreference_chains = get_pointing_chains(mmax_docgraph)
markable2chain = defaultdict(list)
for i, chain in enumerate(coreference_chains):
for markable in chain:
if markable in ambiguous_markables:
markable2chain[markable].append(i)
chain_graphs = []
for markable in markable2chain:
ambig_chain_ids = markable2chain[markable]
chain_graph = nx.MultiDiGraph()
chain_graph.name = mmax_docgraph.name
for chain_id in ambig_chain_ids:
ambig_chain = coreference_chains[chain_id]
for i, markable in enumerate(ambig_chain[:-1]):
chain_graph.add_edge(markable, ambig_chain[i+1])
if token_labels:
for markable in chain_graph.nodes_iter():
markable_text = get_text(mmax_docgraph, markable)
chain_graph.node[markable]['label'] = markable_text
chain_graphs.append(chain_graph)
return chain_graphs
In [27]:
def merge_ambiguous_chains(ambiguous_chains):
    """
    Merge several ambiguous-chain graphs into one DiGraph for display.

    Parameters
    ----------
    ambiguous_chains : list of MultiDiGraph
        a list of graphs, each representing an ambiguous coreference chain
    """
    composed = nx.compose_all(ambiguous_chains)
    merged_chain = nx.DiGraph(composed)
    # attach a blue 'tab'-shaped caption node carrying the document name
    merged_chain.add_node('name', shape='tab',
                          color='blue',
                          label=ambiguous_chains[0].name)
    # colour every branching markable (member of >1 chain) red
    for node in merged_chain:
        branches = (merged_chain.in_degree(node) > 1
                    or merged_chain.out_degree(node) > 1)
        if branches:
            merged_chain.node[node]['color'] = 'red'
    return merged_chain
In [28]:
len(files_with_ambigious_chains) # nothing to see here, move on!
Out[28]:
In [29]:
from discoursegraphs import info
# find documents that contain no coreference chains at all
files_without_chains = []
for mmax_file in glob.glob(os.path.join(MMAX_DIR, '*.mmax')):
    mdg = MMAXDocumentGraph(mmax_file)
    if not get_pointing_chains(mdg):
        files_without_chains.append(os.path.basename(mmax_file))
        # info(mdg)
        # print '\n\n'
print files_without_chains
In [30]:
# re-run the scorer on the still-unexplained files with maximum
# verbosity to inspect the full scorer output
for fname in mysterious_files:
    has_valid_annotation(os.path.join(MMAX_DIR, fname), SCORER_PATH, 'muc', verbose='very')
In [31]:
def get_all_good_scoring_files(mmax_dir, scorer_path, verbose=False):
    """
    returns filepaths of MMAX2 coreference files which produce perfect
    results on *every* metric when testing them against themselves with
    scorer.pl

    (NOTE: the previous docstring was copy-pasted from
    get_bad_scoring_files and described the opposite behaviour.)
    """
    all_mmax_files = glob.glob(os.path.join(mmax_dir, '*.mmax'))
    all_bad_files = set()
    metrics = ['muc', 'bcub', 'ceafm', 'ceafe', 'blanc']
    for mmax_file in all_mmax_files:
        for metric in metrics:
            if not has_valid_annotation(mmax_file, scorer_path, metric, verbose=verbose):
                all_bad_files.add(mmax_file)
                break # continue with next mmax file
    return set(all_mmax_files).difference(all_bad_files)
In [32]:
all_good_scoring_files = get_all_good_scoring_files(MMAX_DIR, SCORER_PATH)
In [33]:
len(all_good_scoring_files)
Out[33]:
In [34]:
# for fname in all_good_scoring_files:
# mdg = dg.read_mmax2(fname)
# bname = os.path.basename(fname)
# print bname, '\n==============\n\n'
# try:
# # [the dog]_{markable_23} means that [the dog] is part of a
# # coreference chain whose first element is markable_23
# print dg.readwrite.brackets.gen_bracketed_output(mdg), '\n\n'
# except KeyError as e:
# print "Error in {}: {}".format(bname, e)
# print dg.get_text(mdg)
# try:
# print_all_chains(mdg)
# except Exception as e:
# print "\n{} FAILED: {}".format(fname, e)
In [40]:
MMAX_DIR
Out[40]:
In [36]:
mmax_14172 = os.path.join(MMAX_DIR, 'maz-14172.mmax')
In [37]:
mdg = dg.read_mmax2(mmax_14172)
In [39]:
print_all_chains(mdg)
In [ ]: