In [1]:
import os
import re
# Base directory; the leading '~' is expanded to the current user's home directory.
PCC_ROOTDIR = os.path.expanduser('~/repos/pcc-annis-merged/')
# The 'procon44' and 'procon11' sub-directories of that base directory.
PROCON44_ROOTDIR = os.path.join(PCC_ROOTDIR, 'procon44')
PROCON11_ROOTDIR = os.path.join(PCC_ROOTDIR, 'procon11')
In [5]:
# Captures an ID that follows a literal '.': a lazy run of word characters
# followed by six digits. Written as a raw string so the backslashes reach
# `re` untouched; in a plain string '\.', '\w' and '\d' are invalid string
# escapes that newer Python versions warn about.
PROCON_ID_REGEX = re.compile(r'\.(\w+?\d{6})')
def unique_sorted_ids(absdirpath, compiled_regex):
    """Return the sorted, de-duplicated IDs found in a directory's entry names.

    Every name returned by ``os.listdir(absdirpath)`` is searched with
    ``compiled_regex``; for each name that matches, the first capture group
    is collected. Names that do not match are ignored.

    absdirpath     -- path of the directory whose entries are scanned
    compiled_regex -- compiled pattern with at least one capture group
    """
    hits = (compiled_regex.search(entry) for entry in os.listdir(absdirpath))
    # A set drops IDs that occur in more than one entry name.
    return sorted({hit.groups()[0] for hit in hits if hit})
In [8]:
def is_subset(subset, superset):
    """Tell whether every element of ``subset`` also occurs in ``superset``.

    Each element of ``subset`` that is absent from ``superset`` is reported
    on stdout, one line per occurrence, in the order of ``subset``.

    subset   -- iterable of hashable items (e.g. document IDs)
    superset -- iterable of hashable items

    Returns True if nothing is missing, False otherwise.
    """
    # Materialise both inputs exactly once: one-shot iterators are then
    # handled correctly, and each membership test is a set lookup instead
    # of a linear scan of the superset.
    candidates = list(subset)
    known = set(superset)
    missing = [doc_id for doc_id in candidates if doc_id not in known]
    for doc_id in missing:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Document ID '{}' not in superset!".format(doc_id))
    return not missing
In [2]:
procon11_subdirs = !ls $PROCON11_ROOTDIR
print procon11_subdirs
In [3]:
procon44_subdirs = !ls $PROCON44_ROOTDIR
print procon44_subdirs
In [4]:
common_subdirs = set(procon44_subdirs).intersection(set(procon11_subdirs))
print common_subdirs
In [6]:
procon11_conano_ids = unique_sorted_ids(os.path.join(PROCON11_ROOTDIR, 'conano'), PROCON_ID_REGEX)
print len(procon11_conano_ids)
print procon11_conano_ids
In [14]:
procon44_conano_ids_raithel = unique_sorted_ids(os.path.join(PROCON44_ROOTDIR, 'conano/raithel'), PROCON_ID_REGEX)
print len(procon44_conano_ids_raithel)
print procon44_conano_ids_raithel
In [15]:
# Report every procon11 'conano' ID that is missing from the procon44
# 'conano/raithel' IDs; the cell's value is the overall True/False verdict.
is_subset(
    subset=procon11_conano_ids,
    superset=procon44_conano_ids_raithel
)
Out[15]:
In [23]:
procon44_conano_ids_tabbert = unique_sorted_ids(os.path.join(PROCON44_ROOTDIR, 'conano/tabbert'), PROCON_ID_REGEX)
print procon44_conano_ids_tabbert, len(procon44_conano_ids_tabbert)
In [19]:
# Report every procon11 'conano' ID that is missing from the procon44
# 'conano/tabbert' IDs; the cell's value is the overall True/False verdict.
is_subset(
    subset=procon11_conano_ids,
    superset=procon44_conano_ids_tabbert
)
Out[19]:
In [22]:
procon11_coref_ids = unique_sorted_ids(os.path.join(PROCON11_ROOTDIR, 'coreference'), PROCON_ID_REGEX)
print procon11_coref_ids, len(procon11_coref_ids)
In [24]:
procon44_coref_ids = unique_sorted_ids(os.path.join(PROCON44_ROOTDIR, 'coreference'), PROCON_ID_REGEX)
print procon44_coref_ids, len(procon44_coref_ids)
In [25]:
# Report every procon11 'coreference' ID that is missing from the procon44
# 'coreference' IDs; the cell's value is the overall True/False verdict.
is_subset(
    subset=procon11_coref_ids,
    superset=procon44_coref_ids
)
Out[25]:
In [39]:
# Captures an ID that follows a literal '_': a lazy run of word characters
# followed by six digits. Written as a raw string so the backslashes reach
# `re` untouched; in a plain string '\w' and '\d' are invalid string escapes
# that newer Python versions warn about.
PROCON_SYNTAX_ID_REGEX = re.compile(r'_(\w+?\d{6})')
procon11_syntax_ids = unique_sorted_ids(os.path.join(PROCON11_ROOTDIR, 'syntax'), PROCON_SYNTAX_ID_REGEX)
print procon11_syntax_ids, len(procon11_syntax_ids)
In [41]:
procon44_syntax_ids = unique_sorted_ids(os.path.join(PROCON44_ROOTDIR, 'syntax'), PROCON_ID_REGEX)
print procon44_syntax_ids, len(procon44_syntax_ids)
In [42]:
# Report every procon11 'syntax' ID that is missing from the procon44
# 'syntax' IDs; the cell's value is the overall True/False verdict.
is_subset(
    subset=procon11_syntax_ids,
    superset=procon44_syntax_ids
)
Out[42]: