Is ProCon11 a subset of ProCon44?


In [1]:
import os
import re

# Root of the merged PCC checkout. Defaults to the original location under the
# user's home directory; set the PCC_ROOTDIR environment variable to point the
# notebook at a checkout somewhere else without editing this cell.
PCC_ROOTDIR = os.path.expanduser(
    os.environ.get('PCC_ROOTDIR', '~/repos/pcc-annis-merged/'))
# The two corpus directories that are compared below.
PROCON44_ROOTDIR = os.path.join(PCC_ROOTDIR, 'procon44')
PROCON11_ROOTDIR = os.path.join(PCC_ROOTDIR, 'procon11')

In [5]:
# Captures a document ID that follows a literal dot in a file name: a lazy run
# of word characters ending in six digits (e.g. 'baeumecon_040516').
# Raw string: '\.', '\w' and '\d' are regex escapes, not Python string escapes.
PROCON_ID_REGEX = re.compile(r'\.(\w+?\d{6})')

def unique_sorted_ids(absdirpath, compiled_regex):
    """Return the sorted, de-duplicated IDs found in a directory's file names.

    Every entry name returned by ``os.listdir(absdirpath)`` is searched with
    ``compiled_regex``; for each name that matches, the text of the first
    capture group is kept. Names without a match are ignored.

    Parameters
    ----------
    absdirpath : str
        Path of the directory whose entries are scanned.
    compiled_regex : compiled regular expression
        Pattern with at least one capture group holding the ID.

    Returns
    -------
    list
        The distinct captured IDs in ascending order.
    """
    matches = (compiled_regex.search(entry) for entry in os.listdir(absdirpath))
    return sorted({match.group(1) for match in matches if match})

In [8]:
def is_subset(subset, superset):
    """Check whether every document ID in ``subset`` also occurs in ``superset``.

    For each ID of ``subset`` that is absent from ``superset`` one line of the
    form ``Document ID '<id>' not in superset!`` is printed (in the order the
    IDs appear in ``subset``).

    Parameters
    ----------
    subset : iterable of hashable
        The IDs expected to be contained in ``superset``.
    superset : iterable of hashable
        The IDs to check against.

    Returns
    -------
    bool
        True if no ID of ``subset`` is missing from ``superset``, else False.
    """
    # Materialise the superset exactly once: membership tests become O(1), and
    # a one-shot iterable is not already exhausted when the missing IDs are
    # reported.
    superset_ids = set(superset)
    # Single pass over ``subset`` so that it, too, may be a one-shot iterable.
    missing_ids = [doc_id for doc_id in subset if doc_id not in superset_ids]
    for doc_id in missing_ids:
        # Parenthesised single argument: same output on Python 2 and Python 3.
        print("Document ID '{}' not in superset!".format(doc_id))
    return not missing_ids

In [2]:
# Entries of the ProCon11 root directory, listed in pure Python instead of
# shelling out to `ls` (no dependence on a Unix shell, and paths containing
# spaces are handled correctly).
procon11_subdirs = sorted(
    entry for entry in os.listdir(PROCON11_ROOTDIR)
    if not entry.startswith('.')  # `ls` hides dot-files by default
)
print(procon11_subdirs)


['conano', 'coreference', 'illocution', 'primary-data', 'rst', 'syntax', 'tokenized']

In [3]:
# Entries of the ProCon44 root directory, listed in pure Python instead of
# shelling out to `ls` (no dependence on a Unix shell, and paths containing
# spaces are handled correctly).
procon44_subdirs = sorted(
    entry for entry in os.listdir(PROCON44_ROOTDIR)
    if not entry.startswith('.')  # `ls` hides dot-files by default
)
print(procon44_subdirs)


['anaphora', 'conano', 'coreference', 'random_subset', 'syntax', 'tokenized']

In [4]:
# Directory names that occur in both corpus roots.
common_subdirs = set(procon44_subdirs) & set(procon11_subdirs)
print(common_subdirs)


set(['tokenized', 'conano', 'coreference', 'syntax'])

Conano

ProCon11 is a subset of ProCon44, but its 11 files were annotated by Chiarcos (vs. Raithel/Tabbert in ProCon44). Both sets use the same document IDs, but the file names aren't identical.


In [6]:
# Document IDs found in the file names of ProCon11's 'conano' directory.
procon11_conano_ids = unique_sorted_ids(
    absdirpath=os.path.join(PROCON11_ROOTDIR, 'conano'),
    compiled_regex=PROCON_ID_REGEX,
)
print(len(procon11_conano_ids))
print(procon11_conano_ids)


11
['baeumecon_040516', 'baeumepro_040516', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'olympiacon_040523', 'olympiapro_040523', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530']

In [14]:
# Document IDs found in the file names of ProCon44's 'conano/raithel' directory.
procon44_conano_ids_raithel = unique_sorted_ids(
    absdirpath=os.path.join(PROCON44_ROOTDIR, 'conano/raithel'),
    compiled_regex=PROCON_ID_REGEX,
)
print(len(procon44_conano_ids_raithel))
print(procon44_conano_ids_raithel)


44
['autoscon_040718', 'autospro_040718', 'baeumecon_040516', 'baeumepro_040516', 'betriebsausflugcon_040926', 'betriebsausflugpro_040926', 'diktatecon_041003', 'diktatepro_041003', 'gedenktafelncon_040620', 'gedenktafelnpro_040620', 'gottesdienstcon_041031', 'gottesdienstpro_041031', 'guetesiegelcon_040725', 'guetesiegelpro_040725', 'hallenbaedercon_040704', 'hallenbaederpro_040704', 'handelcon_040801', 'handelpro_040801', 'impfencon041107', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'kurzparkercon_041024', 'kurzparkerpro_041024', 'mahnmalcon_040627', 'mahnmalpro_040627', 'matzcon_040912', 'matzpro_040912', 'mauercon_041010', 'mauerpro_041010', 'oeffentlichkeitcon_040711', 'oeffentlichkeitpro_040711', 'olympiacon_040523', 'olympiapro_040523', 'radarfallencon_040815', 'radarfallenpro_040815', 'rauchencon_040613', 'raucherpro_040613', 'riesenradcon_040822', 'riesenradpro_040822', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530']

In [15]:
# Are all ProCon11 'conano' IDs also present in the ProCon44 'conano/raithel' IDs?
is_subset(procon11_conano_ids, procon44_conano_ids_raithel)


Out[15]:
True

In [23]:
# Document IDs found in the file names of ProCon44's 'conano/tabbert' directory.
procon44_conano_ids_tabbert = unique_sorted_ids(
    absdirpath=os.path.join(PROCON44_ROOTDIR, 'conano/tabbert'),
    compiled_regex=PROCON_ID_REGEX,
)
# Show the IDs followed by their count, separated by a single space.
print('{} {}'.format(procon44_conano_ids_tabbert, len(procon44_conano_ids_tabbert)))


['autoscon_040718', 'autospro_040718', 'baeumecon_040516', 'baeumepro_040516', 'betriebsausflugcon_040926', 'betriebsausflugpro_040926', 'diktatecon_041003', 'diktatepro_041003', 'gedenktafelncon_040620', 'gedenktafelnpro_040620', 'gottesdienstcon_041031', 'gottesdienstpro_041031', 'guetesiegelcon_040725', 'guetesiegelpro_040725', 'hallenbaedercon_040704', 'hallenbaederpro_040704', 'handelcon_040801', 'handelpro_040801', 'impfencon041107', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'kurzparkercon_041024', 'kurzparkerpro_041024', 'mahnmalcon_040627', 'mahnmalpro_040627', 'matzcon_040912', 'matzpro_040912', 'mauercon_041010', 'mauerpro_041010', 'oeffentlichkeitcon_040711', 'oeffentlichkeitpro_040711', 'olympiacon_040523', 'olympiapro_040523', 'radarfallencon_040815', 'radarfallenpro_040815', 'rauchencon_040613', 'raucherpro_040613', 'riesenradcon_040822', 'riesenradpro_040822', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530'] 44

In [19]:
# Are all ProCon11 'conano' IDs also present in the ProCon44 'conano/tabbert' IDs?
is_subset(procon11_conano_ids, procon44_conano_ids_tabbert)


Out[19]:
True

Coreference

ProCon11 is a subset of ProCon44. Both sets use the same document IDs, but the file names aren't identical.


In [22]:
# Document IDs found in the file names of ProCon11's 'coreference' directory.
procon11_coref_ids = unique_sorted_ids(
    absdirpath=os.path.join(PROCON11_ROOTDIR, 'coreference'),
    compiled_regex=PROCON_ID_REGEX,
)
# Show the IDs followed by their count, separated by a single space.
print('{} {}'.format(procon11_coref_ids, len(procon11_coref_ids)))


['baeumecon_040516', 'baeumepro_040516', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'olympiacon_040523', 'olympiapro_040523', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530'] 11

In [24]:
# Document IDs found in the file names of ProCon44's 'coreference' directory.
procon44_coref_ids = unique_sorted_ids(
    absdirpath=os.path.join(PROCON44_ROOTDIR, 'coreference'),
    compiled_regex=PROCON_ID_REGEX,
)
# Show the IDs followed by their count, separated by a single space.
print('{} {}'.format(procon44_coref_ids, len(procon44_coref_ids)))


['autoscon_040718', 'autospro_040718', 'baeumecon_040516', 'baeumepro_040516', 'betriebsausflugcon_040926', 'betriebsausflugpro_040926', 'diktatecon_041003', 'diktatepro_041003', 'gedenktafelncon_040620', 'gedenktafelnpro_040620', 'gottesdienstcon_041031', 'gottesdienstpro_041031', 'guetesiegelcon_040725', 'guetesiegelpro_040725', 'hallenbaedercon_040704', 'hallenbaederpro_040704', 'handelcon_040801', 'handelpro_040801', 'impfencon_041107', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'kurzparkercon_041024', 'kurzparkerpro_041024', 'mahnmalcon_040627', 'mahnmalpro_040627', 'matzcon_040912', 'matzpro_040912', 'mauercon_041010', 'mauerpro_041010', 'oeffentlichkeitcon_040711', 'oeffentlichkeitpro_040711', 'olympiacon_040523', 'olympiapro_040523', 'radarfallencon_040815', 'radarfallenpro_040815', 'rauchencon_040613', 'rauchenpro_040613', 'riesenradcon_040822', 'riesenradpro_040822', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530'] 44

In [25]:
# Are all ProCon11 'coreference' IDs also present in the ProCon44 'coreference' IDs?
is_subset(procon11_coref_ids, procon44_coref_ids)


Out[25]:
True

Syntax

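ProCon11 is a subset of ProCon44. Both sets use the same document IDs, but the file names aren't identical: in the ProCon11 syntax file names the ID is matched after an underscore, whereas the ProCon44 syntax file names are matched with the usual dot-prefixed pattern, hence the separate regular expression below.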


In [39]:
# Captures a document ID that follows an underscore in a file name: a lazy run
# of word characters ending in six digits (e.g. 'baeumecon_040516').
# Raw string: '\w' and '\d' are regex escapes, not Python string escapes.
PROCON_SYNTAX_ID_REGEX = re.compile(r'_(\w+?\d{6})')

# Document IDs found in the file names of ProCon11's 'syntax' directory,
# extracted with the underscore-prefixed PROCON_SYNTAX_ID_REGEX.
procon11_syntax_ids = unique_sorted_ids(
    absdirpath=os.path.join(PROCON11_ROOTDIR, 'syntax'),
    compiled_regex=PROCON_SYNTAX_ID_REGEX,
)
# Show the IDs followed by their count, separated by a single space.
print('{} {}'.format(procon11_syntax_ids, len(procon11_syntax_ids)))


['baeumecon_040516', 'baeumepro_040516', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'olympiacon_040523', 'olympiapro_040523', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530'] 11

In [41]:
# Document IDs found in the file names of ProCon44's 'syntax' directory,
# extracted with the dot-prefixed PROCON_ID_REGEX.
procon44_syntax_ids = unique_sorted_ids(
    absdirpath=os.path.join(PROCON44_ROOTDIR, 'syntax'),
    compiled_regex=PROCON_ID_REGEX,
)
# Show the IDs followed by their count, separated by a single space.
print('{} {}'.format(procon44_syntax_ids, len(procon44_syntax_ids)))


['autoscon_040718', 'autospro_040718', 'baeumecon_040516', 'baeumepro_040516', 'betriebsausflugcon_040926', 'betriebsausflugpro_040926', 'diktatecon_041003', 'diktatepro_041003', 'gedenktafelncon_040620', 'gedenktafelnpro_040620', 'gottesdienstcon_041031', 'gottesdienstpro_041031', 'guetesiegelcon_040725', 'guetesiegelpro_040725', 'hallenbaedercon_040704', 'hallenbaederpro_040704', 'handelcon_040801', 'handelpro_040801', 'impfencon041107', 'impfenpro_041107', 'kreiselcon_041114', 'kreiselpro_041114', 'kurzparkercon_041024', 'kurzparkerpro_041024', 'mahnmalcon_040627', 'mahnmalpro_040627', 'matzcon_040912', 'matzpro_040912', 'mauercon_041010', 'mauerpro_041010', 'oeffentlichkeitcon_040711', 'oeffentlichkeitpro_040711', 'olympiacon_040523', 'olympiapro_040523', 'radarfallencon_040815', 'radarfallenpro_040815', 'rauchencon_040613', 'raucherpro_040613', 'riesenradcon_040822', 'riesenradpro_040822', 'senatcon_040606', 'senatpro_040606', 'topographiecon_040530', 'topographiepro_040530'] 44

In [42]:
# Are all ProCon11 'syntax' IDs also present in the ProCon44 'syntax' IDs?
is_subset(procon11_syntax_ids, procon44_syntax_ids)


Out[42]:
True