Importing InKind from FileMaker

We use an XML export of the various tables in the FileMaker Inkind database.

We read the XML, extract the field definitions from it, and then read the data. Along the way we:

  • adapt the table and field organization;
  • adjust the field types and the values, especially for datetime and currency;
  • generate value tables and cross tables;
  • add extra information for countries, so that they can be visualized on a map;
  • link values to existing tables;
  • write SQL create statements and insert statements;
  • import a moderately denormalized version of the data into MongoDB.

In [31]:
import os,sys,re,collections,json,yaml
from os.path import splitext, basename
from functools import reduce
from glob import glob
from lxml import etree
from datetime import datetime
from pymongo import MongoClient
from bson.objectid import ObjectId

Locations


In [32]:
HOME_DIR = os.path.expanduser('~').replace('\\', '/')
BASE_DIR = '{}/Documents/DANS/projects/has/dacs'.format(HOME_DIR)
FM_DIR = '{}/fm'.format(BASE_DIR)
FMNS = '{http://www.filemaker.com/fmpxmlresult}'
CONFIG_DIR = '.'

Config

All configuration in a big yaml file


In [ ]:
with open('{}/config.yaml'.format(CONFIG_DIR)) as ch:
    CONFIG = yaml.safe_load(ch)

Data description

Main source tables and fields to skip


In [ ]:
CONFIG = yaml.safe_load('''
mainTables:
- contrib
- country
''')

In [33]:
mainTables = ('contrib', 'country')

SKIP_FIELDS = dict(
    contrib=set('''
dateandtime_ciozero
ikid
ikid_base
find_country_id
find_type
gnewpassword
gnewpassword2
goldpassword
help_description
help_text
message
message_allert
teller
total_costs_total
whois
'''.strip().split()),
    country=set('''
'''.strip().split()),
)

Fields to merge


In [34]:
MERGE_FIELDS = dict(
    contrib=dict(
        academic_entity_url=['academic_entity_url_2'],
        contribution_url=['contribution_url_2'],
        contact_person_mail=['contact_person_mail_2'],
        type_of_inkind=['other_type_of_inkind'],
        vcc11_name=[
            'vcc12_name',
            'vcc21_name',
            'vcc22_name',
            'vcc31_name',
            'vcc32_name',
            'vcc41_name',
            'vcc42_name',
        ],
        vcc_head_decision_vcc11=[
            'vcc_head_decision_vcc12',
            'vcc_head_decision_vcc21',
            'vcc_head_decision_vcc22',
            'vcc_head_decision_vcc31',
            'vcc_head_decision_vcc32',
            'vcc_head_decision_vcc41',
            'vcc_head_decision_vcc42',
        ],
    ),
    country=dict(),
)

Fields to rename


In [35]:
MAP_FIELDS = dict(
    contrib=dict(
        approved='approved',
        academic_entity_url='urlAcademic',
        contribution_url='urlContribution',
        contact_person_mail='contactPersonEmail',
        contact_person_name='contactPersonName',
        costs_description='costDescription',
        costs_total='costTotal',
        country='country',
        creation_date_time='dateCreated',
        creator='creator',
        dateandtime_approval='dateApproved',
        dateandtime_cioapproval='dateApprovedCIO',
        description_of_contribution='description',
        disciplines_associated='discipline',
        last_modifier='modifiedBy',
        modification_date_time='dateModified',
        other_keywords='keyword',
        submit='submitted',
        tadirah_research_activities='tadirahActivity',
        tadirah_research_objects='tadirahObject',
        tadirah_research_techniques='tadirahTechnique',
        title='title',
        total_costs_total='costTotalTotal',
        type_of_inkind='typeContribution',
        vcc='vcc',
        vcc11_name='reviewerName',
        vcc_head_decision='vccDecision',
        vcc_head_decision_vcc11='reviewerDecision',
        vcchead_approval='vccApproval',
        vcchead_disapproval='vccDisApproval',
        year='year',
    ),
    country=dict(
        countrycode='iso',
        countryname='name',
        member_dariah='isMember',
    ),
)

Fields to split into multiple values


In [36]:
generic = re.compile(r'[ \t]*\n+[ \t\n]*')            # split on one or more newlines (with surrounding white space)
genericComma = re.compile(r'[ \t]*[\n,;]+[ \t\n]*')   # split on newlines, commas or semicolons (with surrounding white space)

SPLIT_FIELDS=dict(
    contrib=dict(
        discipline=generic,
        keyword=genericComma,
        typeContribution=generic,
        tadirahActivity=generic,
        tadirahObject=generic,
        tadirahTechnique=generic,
        vcc=generic,
    ),
    country=dict(),
)
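
For illustration, here is how these split patterns behave on a typical multi-valued field (a quick sketch; the sample string is invented):


In [ ]:
sample = 'Scanning\n3D modeling, Photography'
print(generic.split(sample))       # ['Scanning', '3D modeling, Photography']
print(genericComma.split(sample))  # ['Scanning', '3D modeling', 'Photography']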

Fields to hack


In [37]:
STRIP_NUM = re.compile(r'^[0-9]\s*\.?\s+')

def stripNum(v): return STRIP_NUM.sub('', v)
    
HACK_FIELDS=dict(
    contrib=dict(
        tadirahActivity=stripNum,
    ),
    country=dict(),
)
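
The hack for tadirahActivity strips a leading number from values such as '1. Capture' (a small check with an invented value):


In [ ]:
stripNum('1. Capture')   # -> 'Capture'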

Fields to decompose into several fields


In [38]:
DECOMPOSE_FIELDS=dict(
    contrib=dict(
        typeContribution='typeContributionOther',
    ),
    country=dict(),
)
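
The decompose step keeps the first value in the original field and moves any remaining values to the new field. Here is a minimal sketch of what happens per row in readFmData below (the field values are invented):


In [ ]:
row = dict(typeContribution=['Tools and Software', 'consultancy on 3D scanning'])
for (f, fo) in DECOMPOSE_FIELDS['contrib'].items():
    row[fo] = row[f][1:]
    row[f] = [row[f][0]] if len(row[f]) else []
row
# {'typeContribution': ['Tools and Software'],
#  'typeContributionOther': ['consultancy on 3D scanning']}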

Custom field types


In [39]:
FIELD_TYPE = dict(
    contrib=dict(
        costTotal='valuta',
        dateCreated='datetime',
        dateModified='datetime',
        dateApproved='datetime',
        dateApprovedCIO='datetime',
        contactPersonEmail='email',
        submitted='bool',
        approved='bool',
        reviewerDecision='bool',
        vccApproval='bool',
        vccDecision='bool',
        vccDisApproval='bool',
    ),
    country=dict(
        isMember='bool',
    ),
)

Default values


In [40]:
DEFAULT_VALUES=dict(
    contrib=dict(
        dateCreated=datetime(2000,1,1,0,0,0),
        creator="admin",
        type_of_inkind="General",
    ),
    country=dict(),
)

Fields to move to other tables


In [41]:
MOVE_FIELDS=dict(
    contrib=dict(
        assessment=set('''
approved
dateApproved
dateApprovedCIO
submitted
reviewerName
reviewerDecision
vccDecision
vccApproval
vccDisApproval
        '''.strip().split()),
    ),
    country=dict(),
)

Fields to value lists


In [42]:
MAKE_VALUE_LISTS = dict(
    contrib=set('''
keyword
year
'''.strip().split()),
)
VALUE_LISTS = dict(
    contrib=set('''
discipline
keyword
tadirahActivity
tadirahObject
tadirahTechnique
typeContribution
typeContributionOther:typeContribution
vcc
year
'''.strip().split()),
)

MOVE_MISSING = dict(
    contrib='description',
)

Field values

Patterns for value types


In [43]:
# Source field types, including types assigned by type overriding (see FIELD_TYPE above).
# These will be translated into appropriate SQL field types

TYPES = {'bool', 'number', 'decimal', 'text', 'valuta', 'email', 'date', 'datetime'}

# dates are already in ISO format (DATE2_PATTERN).
# If we encounter other dates, we can use DATE_PATTERN instead.
# datetimes are not in ISO format; they will be transformed to ISO.

DECIMAL_PATTERN = re.compile(
    r'^-?[0-9]+\.?[0-9]*'
)
DATE_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})$'
)
DATE2_PATTERN = re.compile(
    r'^\s*([0-9]{4})-([0-9]{2})-([0-9]{2})$'
)
DATETIME_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})\s+([0-9]{2}):([0-9]{2})(?::([0-9]{2}))?$'
)

# meaningless values will be translated into None
NULL_VALUES = {
    'http://',
    'https://',
    '@',
}

BOOL_VALUES = {
    True: {'Yes', 'YES', 'yes', 1, '1', True},
    False: {'No', 'NO', 'no', 0, '0', 'NULL', False},
}

Date and Time values


In [44]:
def date_repl(match):
    [d,m,y] = list(match.groups())
    return '{}-{}-{}'.format(y,m,d)
    
def date2_repl(match):
    [y,m,d] = list(match.groups())
    return '{}-{}-{}'.format(y,m,d)
    
def datetime_repl(match):
    [d,m,y,hr,mn,sc] = list(match.groups())
    return '{}-{}-{}T{}:{}:{}'.format(y,m,d,hr,mn,sc or '00')

def dt(v_raw, i, t, fname):
    if not DATE2_PATTERN.match(v_raw):
        warning(
            'table `{}` field `{}` record {}: not a valid date: "{}"'.format(
                t, fname, i, v_raw
        ))
        return v_raw
    return datetime(*map(int, re.split('[:T-]', DATE2_PATTERN.sub(date2_repl, v_raw))))

def dtm(v_raw, i, t, fname):
    if not DATETIME_PATTERN.match(v_raw):
        warning(
            'table `{}` field `{}` record {}: not a valid date time: "{}"'.format(
                t, fname, i, v_raw
        ))
        return v_raw
    return datetime(*map(int, re.split('[:T-]', DATETIME_PATTERN.sub(datetime_repl, v_raw))))
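
A quick check of the datetime conversion (the input string is invented, using the day/month/year order assumed by datetime_repl):


In [ ]:
dtm('26/04/2016 12:01:05', 0, 'contrib', 'dateCreated')
# -> datetime.datetime(2016, 4, 26, 12, 1, 5)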

Boolean, numeric and string values


In [45]:
def bools(v_raw, i, t, fname):
    if v_raw in BOOL_VALUES[True]: return True
    if v_raw in BOOL_VALUES[False]: return False
    warning(
        'table `{}` field `{}` record {}: not a boolean value: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

def num(v_raw, i, t, fname):
    if type(v_raw) is int: return v_raw
    if v_raw.isdigit(): return int(v_raw)
    warning(
        'table `{}` field `{}` record {}: not an integer: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

def decimal(v_raw, i, t, fname):
    if type(v_raw) is float: return v_raw
    if v_raw.isdigit(): return float(v_raw)
    if DECIMAL_PATTERN.match(v_raw): return float(v_raw)
    warning(
        'table `{}` field `{}` record {}: not a decimal number: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

def email(v_raw, i, t, fname):
    return v_raw.replace('mailto:', '', 1) if v_raw.startswith('mailto:') else v_raw

Money values


In [46]:
def money(v_raw, i, t, fname):
    note = ',' in v_raw or '.' in v_raw
    v = v_raw.strip().lower().replace(' ','').replace('€', '').replace('euro', '').replace('\u00a0', '')
    for p in range(2,4): # interpret . or , as decimal point if less than 3 digits follow it
        if len(v) >= p and v[-p] in '.,': 
            v_i = v[::-1]
            if v_i[p-1] == ',': v_i = v_i.replace(',', 'D', 1)
            elif v_i[p-1] == '.': v_i = v_i.replace('.', 'D', 1)
            v = v_i[::-1]
    v = v.replace('.','').replace(',','')
    v = v.replace('D', '.')
    if not v.replace('.','').isdigit():
        if len(set(v) & set('0123456789')):
            warning(
                'table `{}` field `{}` record {}: not a decimal number: "{}" <= "{}"'.format(
                    t, fname, i, v, v_raw,
            ))
            money_warnings.setdefault('{}:{}'.format(t, fname), {}).setdefault(v, set()).add(v_raw)
            v = None
        else:
            v = None
            money_notes.setdefault('{}:{}'.format(t, fname), {}).setdefault('NULL', set()).add(v_raw)
    elif note:
        money_notes.setdefault('{}:{}'.format(t, fname), {}).setdefault(v, set()).add(v_raw)
    return None if v == None else float(v)
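
A few sanity checks of the money parser (hedged examples; the raw values are invented). The money_warnings and money_notes dicts are (re)initialized in the pipeline cell below, so we set them up here first:


In [ ]:
money_warnings = {}
money_notes = {}
print(money('€ 1.250,75', 0, 'contrib', 'costTotal'))   # 1250.75
print(money('34 280 euro', 0, 'contrib', 'costTotal'))  # 34280.0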

Clean up field values


In [47]:
def sanitize(t, i, fname, value):
    if fname == '_id': return value
    (ftype, fmult) = allFields[t][fname]
    newValue = []
    for v_raw in value:
        if v_raw == None or v_raw in NULL_VALUES: continue
        elif ftype == 'text': v = v_raw
        elif ftype == 'bool': v = bools(v_raw, i, t, fname)
        elif ftype == 'number': v = num(v_raw, i, t, fname)
        elif ftype == 'decimal': v = decimal(v_raw, i, t, fname)
        elif ftype == 'email': v = email(v_raw, i, t, fname)
        elif ftype == 'valuta': v = money(v_raw, i, t, fname)
        elif ftype == 'date': v = dt(v_raw, i, t, fname)
        elif ftype == 'datetime': v = dtm(v_raw, i, t, fname)
        else: v = v_raw
        if v != None and (fmult <= 1 or v != ''): newValue.append(v)
    if len(newValue) == 0:
        defValue = DEFAULT_VALUES.get(t, {}).get(fname, None)
        if defValue != None:
            newValue = [defValue]
    return newValue

Show information


In [48]:
def info(x): sys.stdout.write('{}\n'.format(x))
def warning(x): sys.stderr.write('{}\n'.format(x))

def showFields():
    for (mt, defs) in sorted(allFields.items()):
        info(mt)
        for (fname, fdef) in sorted(defs.items()):
            info('{:>25}: {:<10} ({})'.format(fname, *fdef))

def showdata(rows):
    for row in rows:
        for f in sorted(row.items()):
            info('{:>20} = {}'.format(*f))
        info('o-o-o-o-o-o-o-o-o-o-o-o')

def showData():
    for (mt, rows) in sorted(allData.items()):
        info('o-o-o-o-o-o-o TABLE {} with {} rows o-o-o-o-o-o-o-o '.format(mt, len(rows)))
        showdata(rows[0:2])

def showMoney():
    for tf in sorted(money_notes):
        for v in sorted(money_notes[tf]):
            info('{} "{}" <= {}'.format(
                tf, v,
                ' | '.join(money_notes[tf][v]),
        ))

Read FM fields


In [49]:
def readFmFields():
    for mt in mainTables:
        infile = '{}/{}.xml'.format(FM_DIR, mt)
        root = etree.parse(infile, parser).getroot()
        fieldroots = [x for x in root.iter(FMNS+'METADATA')]
        fieldroot = fieldroots[0]
        fields = []
        fieldDefs = {}
        for x in fieldroot.iter(FMNS+'FIELD'):
            fname = x.get('NAME').lower().replace(' ','_').replace(':', '_')
            ftype = x.get('TYPE').lower()
            fmult = int(x.get('MAXREPEAT'))
            fields.append(fname)
            fieldDefs[fname] = [ftype, fmult]
        rawFields[mt] = fields
        allFields[mt] = fieldDefs

        for f in SKIP_FIELDS[mt]:
            del allFields[mt][f]

        for (f, mfs) in MERGE_FIELDS[mt].items():
            allFields[mt][f][1] += 1
            for mf in mfs:
                del allFields[mt][mf]
        allFields[mt] = dict((MAP_FIELDS[mt][f], v) for (f,v) in allFields[mt].items())
        for f in SPLIT_FIELDS[mt]:
            allFields[mt][f][1] += 1
        for (f, fo) in DECOMPOSE_FIELDS[mt].items():
            allFields[mt][fo] = allFields[mt][f]
            allFields[mt][f] = [allFields[mt][f][0], 1]
        for (f, t) in FIELD_TYPE[mt].items():
            allFields[mt][f][0] = t

Read FM data


In [50]:
def readFmData():
    for mt in mainTables:
        infile = '{}/{}.xml'.format(FM_DIR, mt)
        root = etree.parse(infile, parser).getroot()
        dataroots = [x for x in root.iter(FMNS+'RESULTSET')]
        dataroot = dataroots[0]
        rows = []
        rowsRaw = []
        fields = rawFields[mt]
        for (i, r) in enumerate(dataroot.iter(FMNS+'ROW')):
            rowRaw = []
            for c in r.iter(FMNS+'COL'):
                data = [x.text.strip() for x in c.iter(FMNS+'DATA') if x.text != None]
                rowRaw.append(data)
            if len(rowRaw) != len(fields):
                warning('row {}: fields encountered = {}, should be {}'.format(i, len(rowRaw), len(fields)))
            rowsRaw.append(dict((f,v) for (f, v) in zip(fields, rowRaw)))
            row = dict((f,v) for (f, v) in zip(fields, rowRaw) if f not in SKIP_FIELDS[mt])
            for (f, mfs) in MERGE_FIELDS[mt].items():
                for mf in mfs:
                    row[f].extend(row[mf])
                    del row[mf]
            row = dict((MAP_FIELDS[mt][f], v) for (f,v) in row.items())
            for (f, spl) in SPLIT_FIELDS[mt].items():
                row[f] = reduce(lambda x,y: x+y, [spl.split(v) for v in row[f]], [])
            for (f, hack) in HACK_FIELDS[mt].items():
                row[f] = [hack(v) for v in row[f]]
            for (f, fo) in DECOMPOSE_FIELDS[mt].items():
                row[fo] = row[f][1:]
                row[f] = [row[f][0]] if len(row[f]) else []
            row['_id'] = ObjectId()
            #info('\n'.join('{}={}'.format(*x) for x in sorted(row.items())))
            for (f, v) in row.items(): row[f] = sanitize(mt, i, f, v)
            rows.append(row)
        allData[mt] = rows
        rawData[mt] = rowsRaw

    if money_warnings:
        for tf in sorted(money_warnings):
            for v in sorted(money_warnings[tf]):
                warning('{} "{}" <= {}'.format(
                    tf, v,
                    ' | '.join(money_warnings[tf][v]),
            ))

Split tables into several tables by column groups


In [51]:
def moveFields():
    for mt in mainTables:
        for (omt, mfs) in MOVE_FIELDS[mt].items():
            for mf in mfs:
                allFields.setdefault(omt, dict())[mf] = allFields[mt][mf]
                del allFields[mt][mf]
            allFields.setdefault(omt, dict())['{}_id'.format(mt)] = ('id', 1)

        for row in allData[mt]:
            for (omt, mfs) in MOVE_FIELDS[mt].items():
                orow = dict((mf, row[mf]) for mf in mfs)
                orow['_id'] = ObjectId()
                orow['{}_id'.format(mt)] = row['_id']
                allData.setdefault(omt, []).append(orow)
                for mf in mfs: del row[mf]

Value Lists


In [52]:
def readLists():
    valueLists = dict()
    for path in glob('{}/*.txt'.format(FM_DIR)):
        tname = basename(splitext(path)[0])
        data = []
        with open(path) as fh:
            for line in fh:
                data.append(line.rstrip().split('\t'))
        valueLists[tname] = data

    for (vList, data) in valueLists.items():
        if vList == 'countryExtra':
            mapping = dict((x[0], x[1:]) for x in data)
        else:
            mapping = dict((i+1, x[0]) for (i, x) in enumerate(data))
        valueDict[vList] = mapping
        allFields[vList] = dict(
            _id=('id', 1),
            value=('string', 1),
        )
    
    for mt in allData:
        fs = MAKE_VALUE_LISTS.get(mt, set())
        for f in fs:
            valSet = set()
            for row in allData[mt]:
                values = row.get(f, [])
                if type(values) is not list:
                    values = [values]
                valSet |= set(values)
            valueDict[f] = dict((i+1, x) for (i, x) in enumerate(sorted(valSet)))
            allFields[f] = dict(
                _id=('id', 1),
                value=('string', 1),
            )

Country table


In [53]:
def countryTable():
    extraInfo = valueDict['countryExtra']
    idMapping = dict()

    for row in allData['country']:
        for f in row:
            if type(row[f]) is list: row[f] = row[f][0]
        iso = row['iso']
        row['_id'] = ObjectId()
        idMapping[iso] = row['_id']
        (name, lat, long) = extraInfo[iso]
        row['latitude'] = lat
        row['longitude'] = long

    for row in allData['contrib']:
        newValue = []
        for iso in row['country']:
            newValue.append(dict(_id=idMapping[iso], iso=iso, value=extraInfo[iso][0]))
        row['country'] = newValue
    
    allFields['country']['_id'] = ('id', 1)
    allFields['country']['iso'] = ('string', 1)
    allFields['country']['latitude'] = ('float', 1)
    allFields['country']['longitude'] = ('float', 1)

User table


In [54]:
def userTable():
    idMapping = dict()
    existingUsers = []
    testUsers = [
        dict(eppn='suzan', email='suzan1@test.eu', mayLogin=True, authority='local', 
             firstName='Suzan', lastName='Karelse'),
        dict(eppn='marie', email='suzan2@test.eu', mayLogin=True, authority='local',
            firstName='Marie', lastName='Pieterse'),
        dict(eppn='gertjan', email='gertjan@test.eu', mayLogin=False, authority='local',
            firstName='Gert Jan', lastName='Klein-Holgerink'),
        dict(eppn='lisa', email='lisa@test.eu', mayLogin=True, authority='local',
            firstName='Lisa', lastName='de Leeuw'),
        dict(eppn='dirk', email='dirk@test.eu', mayLogin=True, authority='local',
            firstName='Dirk', lastName='Roorda'),
    ]    

    users = collections.defaultdict(set)
    eppnSet = set()
    for c in allData['contrib']:
        crs = c.get('creator', []) + c.get('modifiedBy', [])
        for cr in crs:
            eppnSet.add(cr)
    idMapping = dict((eppn, ObjectId()) for eppn in sorted(eppnSet))
    for c in allData['contrib']:
        c['creator'] = [dict(_id=idMapping[cr]) for cr in c['creator']]

        if 'modifiedBy' not in c:
            c['modifiedBy'] = []
        else:
            c['modifiedBy'] = [dict(_id=idMapping[cr]) for cr in c['modifiedBy']]

    users = dict((i, eppn) for (eppn, i) in idMapping.items())
    for (i, eppn) in sorted(users.items()):
        existingUsers.append(dict(_id=i, eppn=eppn, mayLogin=False, authority='legacy'))

    for u in testUsers:
        u['_id'] = ObjectId()
        idMapping[u['eppn']] = u['_id']
        existingUsers.append(u)
    inGroups = [
        dict(eppn='DirkRoorda@dariah.eu', authority='DARIAH', group='system'),
        dict(eppn='LisaDeLeeuw@dariah.eu', authority='DARIAH', group='office'),
        dict(eppn='suzan', authority='local', group='auth'),
        dict(eppn='marie', authority='local', group='auth'),
        dict(eppn='gertjan', authority='local', group='auth'),
        dict(eppn='lisa', authority='local', group='office'),
        dict(eppn='dirk', authority='local', group='system'),
    ]
    inGroups = [dict(tuple(ig.items())+(('_id', ObjectId()),)) for ig in inGroups]
    allData['user'] = existingUsers
    allData['group'] = inGroups
    
    allFields['user'] = dict(
        _id=('id', 1),
        eppn=('string', 1),
        email=('email', 1),
        mayLogin=('bool', 1),
        authority=('string', 1),
        firstName=('string', 1),
        lastName=('string', 1),
    )
    allFields['group'] = dict(
        _id=('id', 1),
        eppn=('string', 1),
        authority=('string', 1),
        group=('string', 1),
    )
    uidMapping.update(idMapping)

Related tables


In [55]:
def relTables():
    def norm(x): return x.strip().lower()
    
    relIndex = dict()
    for mt in sorted(VALUE_LISTS):
        rows = allData[mt]
        for f in sorted(VALUE_LISTS[mt]):
            comps = f.split(':')
            if len(comps) == 2:
                (f, fAs) = comps
            else:
                fAs = f
            relInfo = valueDict[fAs]
            if not fAs in relIndex:
                idMapping = dict((i, ObjectId()) for i in relInfo)
                allData[fAs] = [dict(_id=idMapping[i], value=v) for (i, v) in relInfo.items()]
                relIndex[fAs] = dict((norm(v), (idMapping[i], v)) for (i, v) in relInfo.items())
            for row in rows:
                newValue = []
                for v in row[f]:
                    rnv = norm(v)
                    (i, nv) = relIndex[fAs].get(rnv, ("-1", None))
                    if nv == None:
                        target = MOVE_MISSING[mt]
                        if target not in row: row[target] = ['']
                        row[target][0] += '\nMOVED FROM {}: {}'.format(f, v)
                    else: newValue.append(dict(_id=i, value=nv))
                row[f] = newValue

Test tweaks

Tweaks for testing purposes.


In [56]:
def testTweaks():
    mt = 'contrib'
    myContribs = {'3DHOP', 'AAI'}
    my = uidMapping['dirk']
    for row in allData[mt]:
        title = row.get('title', [None])
        if len(title) == 0: title = [None]
        if title[0] in myContribs:
            row['creator'] = [dict(_id=my)]

Insert into MongoDB


In [57]:
def importMongo():
    client = MongoClient()
    client.drop_database('dariah')
    db = client.dariah
    for (mt, rows) in allData.items():
        info(mt)
        db[mt].insert_many(rows)

The whole pipeline


In [58]:
money_warnings = {}
money_notes = {}
valueDict = dict()
rawFields = dict()
allFields = dict()
rawData = dict()
allData = dict()
uidMapping = dict()

parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
readFmFields()
readFmData()
readLists()
moveFields()
countryTable()
userTable()
relTables()
testTweaks()
importMongo()
#showData()
#showMoney()


contrib
country
assessment
user
group
discipline
keyword
tadirahActivity
tadirahObject
tadirahTechnique
typeContribution
vcc
year

To import the BSON dump into another MongoDB installation, use the command line to dump the dariah database here

mongodump -d dariah -o dariah

and to restore it elsewhere:

mongorestore --drop -d dariah dariah

In [33]:
valueDict.keys()


Out[33]:
dict_keys(['countryExtra', 'disciplines', 'tadirahActivities', 'tadirahObjects', 'tadirahTechniques', 'typeContribution', 'vcc', 'keywords', 'year'])

In [54]:
valueDict['keywords']


Out[54]:
{1: '(socio-)linguistic analyses',
 2: '1795-2015',
 3: '3D modeling',
 4: '3D scanning',
 5: 'Analyse quantitative',
 6: 'Analysis-Stylistic Analysis',
 7: 'Architecture',
 8: 'Archives',
 9: 'Arts',
 10: 'Arts and Humanities',
 11: 'Augmented reality',
 12: 'Belgian justice',
 13: 'Belgium',
 14: 'Browsing',
 15: 'Brussels',
 16: 'Cairo',
 17: 'Certification',
 18: 'Commenting',
 19: 'Community Involvement',
 20: 'Community building',
 21: 'Competences',
 22: 'Corpus linguistics',
 23: 'Critical edition',
 24: 'Cœur du Hainaut',
 25: 'DBMS',
 26: 'Database design',
 27: 'Digital Heritage',
 28: 'Digital Humanities',
 29: 'Digitisation',
 30: 'Distance intertextuelle',
 31: 'Document Understanding',
 32: 'Early Modern History',
 33: 'Editing',
 34: 'Egyptian Language',
 35: 'Electronic publishing',
 36: 'Encoding',
 37: 'Encoding of complex writing systems',
 38: 'Etiquetage des textes',
 39: 'Evaluations',
 40: 'Expertise',
 41: 'File formats',
 42: 'French SMS',
 43: 'Georeferencing',
 44: 'Hagiographie',
 45: 'Handwritten Text Recognition of historical documents',
 46: 'Heritage',
 47: 'History and memory',
 48: 'Humanities',
 49: 'Humanities and Arts',
 50: 'Information_retrieval',
 51: 'Intellectual property rights',
 52: 'Latin',
 53: 'Latin language',
 54: 'Legal Deposit',
 55: 'Lemmatisation et analyse morpho-syntaxique',
 56: 'Long Nineteenth Century',
 57: 'Manuscripts',
 58: 'Mapping',
 59: 'Mediaeval Studies',
 60: 'Mediaeval Studies / Working group',
 61: 'Medieval Authors',
 62: 'Medieval History',
 63: 'Medieval Textual Tradition',
 64: 'Mediterranean',
 65: 'Meta-Search',
 66: 'Modern History',
 67: 'Mons',
 68: 'Moyen Âge',
 69: 'Natural Language Processing',
 70: 'Natural Language Processing.',
 71: 'Network analysis',
 72: 'OAI',
 73: 'OAI-PMH',
 74: 'Observatory',
 75: 'Open-access',
 76: 'Optical Character Recognition',
 77: 'Ortolang',
 78: 'POS Tagging > Analysis-Structural Analysis',
 79: 'POS-Tagging > Analysis-Structural Analysis',
 80: 'POT tagging > Analysis-Structural Analysis',
 81: 'POT-Tagging > Analysis-Structural Analysis',
 82: 'Pattern Recognition > Analysis-Relational Analysis',
 83: 'Philological reconstruction',
 84: 'Photography',
 85: 'Preservation - Metadata > Storage - Preservation',
 86: 'Preservation Metadata',
 87: 'Preservation Metadata > Storage - Preservation',
 88: 'Preservation Metadata > Storage-Preservation',
 89: 'Preservation_metadata',
 90: 'Principal Component Analysis',
 91: 'Promotional material',
 92: 'Proofreading',
 93: 'Prosopography',
 94: 'RDF',
 95: 'Regional History',
 96: 'Renaissance',
 97: 'Replication',
 98: 'Replication > Storage - Preservation',
 99: 'Replication > Storage Preservations',
 100: 'Replication > Storage-Preservation',
 101: 'SPARQL',
 102: 'SPARQL end point',
 103: 'Scanning',
 104: 'Searching',
 105: 'Sentiment Analysis > Analysis-Content Analysis',
 106: 'Sentiment Analysis >Analysis-Content Analysis',
 107: 'Seraching',
 108: 'Social Network Analysis (SNA)',
 109: 'Social reform',
 110: 'Social regulation',
 111: 'Social research methods',
 112: 'Sociology of Knowledge',
 113: 'Software localisation',
 114: 'Statistics',
 115: 'Storage > Preservation',
 116: 'TEI XML',
 117: 'Technology - Preservation > Storage - Preservation',
 118: 'Technology > Storage - Preservation',
 119: 'Technology Preservation > Storage - Preservation',
 120: 'Technology Preservation > Storage-Preservation',
 121: 'Technology-Preservation > Storage-Preservation',
 122: 'Technology_preservation',
 123: 'Text edition',
 124: 'Texts',
 125: 'Textual Traditions',
 126: 'Textual analysis',
 127: 'Tools',
 128: 'Topic modeling > Analysis-Content Analysis',
 129: 'Town planning',
 130: 'Transnational History',
 131: 'Transnational Networks',
 132: 'Urbain History',
 133: 'Urban Archaeology',
 134: 'User guides',
 135: 'VRE-SI',
 136: 'Versioning > Storage-Preservation',
 137: 'Virtual Research Environment Service Infrastructure',
 138: 'Web Cra',
 139: 'Web Crawling > Capture - Gathering',
 140: 'Web Crawling > Capture-Gathering',
 141: 'Web-Crawling > Capture-Gathering',
 142: 'Working group',
 143: 'XML-MEI',
 144: 'XML-TEI',
 145: 'advice',
 146: 'aggregation',
 147: 'alignment',
 148: 'annotation',
 149: 'anonymisation',
 150: 'art and art history',
 151: 'benefits',
 152: 'best practices',
 153: 'bibliography',
 154: 'bilingualism',
 155: 'citability',
 156: 'codicology',
 157: 'collection',
 158: 'collections',
 159: 'colonial history',
 160: 'commenting',
 161: 'community',
 162: 'community engagement',
 163: 'contemporary architecture',
 164: 'coordination',
 165: 'course',
 166: 'crawling',
 167: 'cultural innovation',
 168: 'database design',
 169: 'database management system (DBMS)',
 170: 'diasporas',
 171: 'dictionary building',
 172: 'digital curation',
 173: 'digital edition',
 174: 'digital humanities',
 175: 'digital publishing',
 176: 'digital repository',
 177: 'diplomatics',
 178: 'dissemination',
 179: 'documentary portal',
 180: 'drama',
 181: 'embedded metadata',
 182: 'ethical issues',
 183: 'expertise',
 184: 'federation of identity',
 185: 'french literature',
 186: 'geographical name',
 187: 'handle',
 188: 'heraldry',
 189: 'historical cartography',
 190: 'history',
 191: 'iconography',
 192: 'indexing',
 193: 'information',
 194: 'institution',
 195: 'institutions',
 196: 'integration',
 197: 'interactive visualization',
 198: 'interoperability',
 199: 'knowledge management',
 200: 'knowledge sharing',
 201: 'law and justice',
 202: 'law and justice history',
 203: 'library',
 204: 'linguistic',
 205: 'linguistic resources',
 206: 'magistrates',
 207: 'mediated electronic discourse',
 208: 'migration',
 209: 'migration.',
 210: 'modern architecture',
 211: 'monuments',
 212: 'multilingualism',
 213: 'multimedia',
 214: 'museum documentation',
 215: 'network analysis',
 216: 'networking',
 217: 'numerical catalogues',
 218: 'old english language and literature',
 219: 'ontologies',
 220: 'ontology',
 221: 'open access',
 222: 'open archive',
 223: 'palaeography',
 224: 'persistent identifier',
 225: 'platform',
 226: 'presentation',
 227: 'preservation',
 228: 'prosopography',
 229: 'publications',
 230: 'publishing software',
 231: 'reference data registries',
 232: 'registries',
 233: 'repression',
 234: 'research',
 235: 'research archives',
 236: 'resources',
 237: 'rhetoric',
 238: 'romanization of Arabic',
 239: 'scholarly content',
 240: 'scientific sources',
 241: 'semantic linking',
 242: 'semantic searc',
 243: 'semantic search',
 244: 'semantic structuring',
 245: 'semantic web',
 246: 'social actors',
 247: 'social media',
 248: 'social network',
 249: 'sound archives',
 250: 'statistics',
 251: 'studies',
 252: 'survey',
 253: 'taxonomy',
 254: 'terminology systems',
 255: 'text encoding',
 256: 'the tool doesn\'t show some of the "Disciplines Associated" "TaDiRAH Research techniques" that should be available',
 257: 'thesauri',
 258: 'thesaurus',
 259: 'toponymy',
 260: 'training',
 261: 'transcoding',
 262: 'transcription',
 263: 'translation',
 264: 'triple store',
 265: 'urbanism',
 266: 'virtual environment',
 267: 'visualisation',
 268: 'web',
 269: 'web of data',
 270: 'workshop'}

Exploration

The process has finished, but here is space to explore the data, in order to find patterns, regularities, and, more importantly, irregularities.

First step: export the data to Excel workbooks, one worksheet per table: one workbook with the raw FileMaker data and one with the transformed MongoDB data.


In [106]:
import xlsxwriter

EXPORT_DIR = os.path.expanduser('~/Downloads')
EXPORT_ORIG = '{}/contribFromFileMaker.xlsx'.format(EXPORT_DIR)
EXPORT_MONGO = '{}/contribInMongoDB.xlsx'.format(EXPORT_DIR)

In [107]:
workbook = xlsxwriter.Workbook(EXPORT_ORIG, {'strings_to_urls': False})
for mt in rawData:
    worksheet = workbook.add_worksheet(mt)
    for (f, field) in enumerate(rawFields[mt]):
            worksheet.write(0, f, field)
    for (r, row) in enumerate(rawData[mt]):
        for (f, field) in enumerate(rawFields[mt]):
            val = row[field]
            val = [] if val == None else val if type(val) is list else [val]
            val = '|'.join(val)
            worksheet.write(r+1, f, val)
workbook.close()

In [108]:
workbook = xlsxwriter.Workbook(EXPORT_MONGO, {'strings_to_urls': False})
for mt in allData:
    worksheet = workbook.add_worksheet(mt)
    fields = sorted(allFields[mt])
    for (f, field) in enumerate(fields):
            worksheet.write(0, f, field)
    for (r, row) in enumerate(allData[mt]):
        for (f, field) in enumerate(fields):
            fmt = None
            val = row.get(field, [])
            (ftype, fmult) = allFields[mt][field]
            val = [] if val == None else [val] if type(val) is not list else val
            exportVal = []
            for v in val:
                if type(v) is dict:
                    exportVal.append(','.join(str(vv) for vv in v.values()))
                elif ftype == 'date' or ftype == 'datetime':
                    exportVal.append(v if type(v) is str else v.isoformat())
                else:
                    exportVal.append(str(v))
            worksheet.write(r+1, f, ' | '.join(exportVal))
workbook.close()

In [109]:
showFields()


assessment
                 approved: bool       (1)
               contrib_id: id         (1)
             dateApproved: datetime   (8)
          dateApprovedCIO: datetime   (1)
         reviewerDecision: bool       (2)
            reviewerNames: text       (2)
                submitted: bool       (1)
              vccApproval: bool       (1)
              vccDecision: bool       (8)
           vccDisApproval: bool       (1)
contrib
       contactPersonEmail: email      (4)
        contactPersonName: text       (1)
          costDescription: text       (1)
                costTotal: valuta     (1)
                  country: text       (1)
                  creator: text       (1)
              dateCreated: datetime   (1)
             dateModified: datetime   (1)
              description: text       (1)
              disciplines: text       (2)
                 keywords: text       (2)
               modifiedBy: text       (1)
        tadirahActivities: text       (2)
           tadirahObjects: text       (2)
        tadirahTechniques: text       (2)
                    title: text       (1)
         typeContribution: text       (1)
    typeContributionOther: text       (3)
              urlAcademic: text       (3)
          urlContribution: text       (3)
                      vcc: text       (2)
                     year: text       (1)
country
                      _id: id         (1)
                 isMember: bool       (1)
                      iso: string     (1)
                 latitude: float      (1)
                longitude: float      (1)
                     name: text       (1)
countryExtra
                      _id: id         (1)
                    value: string     (1)
disciplines
                      _id: id         (1)
                    value: string     (1)
groups
                      _id: id         (1)
                authority: string     (1)
                     eppn: string     (1)
                    group: string     (1)
keywords
                      _id: id         (1)
                    value: string     (1)
tadirahActivities
                      _id: id         (1)
                    value: string     (1)
tadirahObjects
                      _id: id         (1)
                    value: string     (1)
tadirahTechniques
                      _id: id         (1)
                    value: string     (1)
typeContribution
                      _id: id         (1)
                    value: string     (1)
users
                      _id: id         (1)
                authority: string     (1)
                    email: email      (1)
                     eppn: string     (1)
                firstName: string     (1)
                 lastName: string     (1)
                 mayLogin: bool       (1)
vcc
                      _id: id         (1)
                    value: string     (1)
year
                      _id: id         (1)
                    value: string     (1)

In [30]:
client = MongoClient()
dbm = client.dariah
for d in dbm.contrib.find({'title': '3DHOP'}).limit(2):
    print('=' * 50)
    for f in sorted(d):
        print('{}={}'.format(f, d[f]))


==================================================
_id=58a6dcad2179c01d8f0dd64d
contactPersonEmail=['roberto.scopigno@isti.cnr.it']
contactPersonName=['Roberto Scopigno']
costDescription=['The labour effort is related to maintenance of all software tools and resources shared with the community (see also the others in the following excel shett lines); it includes one full-time researcher (the responsible person for maintenance and management of the MeshLab tool, that is the most complex among our tools and the one with the widest distribution, with more than 350,000 downloads in 2015) and the (part-time) contribution of several other VClab staff (contributing to the other tools).']
costTotal=[34280.0]
country=[{'_id': ObjectId('58a6dcad2179c01d8f0dd9f5'), 'iso': 'IT', 'value': 'Italy'}]
creator=[{'_id': ObjectId('58a6dcad2179c01d8f0dda1f')}]
dateCreated=[datetime.datetime(2016, 4, 26, 12, 1, 5)]
dateModified=[datetime.datetime(2016, 7, 6, 13, 59, 52)]
description=['3DHOP (3D Heritage Online Presenter) is an open-source tool for the creation of multimedia interactive Web presentations of digital cultural artifacts. 3DHOP target audience range from the museum curators with some IT experience to the experienced Web designers who wants to embed 3D contents in their creations. Based on WebGL, works on almost all platforms, without plugin or a dedicated server, directly inside HTML pages and is capable of streaming multiresolution 3D meshes over HTTP, supporting the exploration of huge 3D models on commodity computers and standard internet connections. IT has been downloaded and used by more than 1700 colleagues.']
disciplines=[{'_id': ObjectId('58a6dcad2179c01d8f0dda27'), 'value': 'Archaeology and Prehistory'}, {'_id': ObjectId('58a6dcad2179c01d8f0dda29'), 'value': 'Art and art history'}, {'_id': ObjectId('58a6dcad2179c01d8f0dda2a'), 'value': 'Biological anthropology'}, {'_id': ObjectId('58a6dcad2179c01d8f0dda2d'), 'value': 'Cultural heritage and museology'}, {'_id': ObjectId('58a6dcad2179c01d8f0dda30'), 'value': 'Education'}]
keywords=[{'_id': ObjectId('58a6dcad2179c01d8f0dda43'), 'value': ''}, {'_id': ObjectId('58a6dcad2179c01d8f0ddaaa'), 'value': 'Scanning'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb43'), 'value': 'the tool doesn\'t show some of the "Disciplines Associated" "TaDiRAH Research techniques" that should be available'}]
modifiedBy=[{'_id': ObjectId('58a6dcad2179c01d8f0dda10')}]
tadirahActivities=[{'_id': ObjectId('58a6dcad2179c01d8f0ddb52'), 'value': 'Capture'}]
tadirahObjects=[{'_id': ObjectId('58a6dcad2179c01d8f0ddb5a'), 'value': 'Artifacts'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb5d'), 'value': 'Computers'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb5f'), 'value': 'Digital Humanities'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb62'), 'value': 'Images'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb63'), 'value': 'Images (3D)'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb6d'), 'value': 'Multimedia'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb72'), 'value': 'Research'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb73'), 'value': 'Research Process'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb74'), 'value': 'Research Results'}, {'_id': ObjectId('58a6dcad2179c01d8f0ddb7b'), 'value': 'Tools'}]
tadirahTechniques=[]
title=['3DHOP']
typeContribution=[{'_id': ObjectId('58a6dcad2179c01d8f0ddba4'), 'value': 'Tools and Software'}]
typeContributionOther=[]
urlAcademic=['www.isti.cnr.it/']
urlContribution=['http://3dhop.net/']
vcc=[{'_id': ObjectId('58a6dcad2179c01d8f0ddbad'), 'value': 'VCC1'}]
year=[{'_id': ObjectId('58a6dcad2179c01d8f0ddbb4'), 'value': '2016'}]

Here is a query to get all typeContribution (formerly type_of_inkind) values occurring in contributions.


In [32]:
for c in dbm.contrib.distinct('typeContribution', {}):
    print(c)


{'_id': 1, 'value': 'Access'}
{'_id': 6, 'value': 'Event'}
{'_id': 12, 'value': 'DARIAH Coordination'}
{'_id': 9, 'value': 'Cooperation'}
{'_id': 2, 'value': 'Expertise'}
{'_id': 5, 'value': 'Tools and Software'}
{'_id': 8, 'value': 'Summer School'}
{'_id': 7, 'value': 'Training'}
{'_id': 11, 'value': 'Data'}
{'_id': 10, 'value': 'Educational Resources'}
{'_id': 4, 'value': 'Content Hosting'}
{'_id': 3, 'value': 'Interoperability'}

Here are the users:


In [33]:
for c in dbm.users.find({}):
    print(c)


{'_id': 1, 'eppn': 'ATNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 2, 'eppn': 'BENC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 3, 'eppn': 'CIO01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 4, 'eppn': 'CIO02', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 5, 'eppn': 'CIOHenk', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 6, 'eppn': 'CIOLisa', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 7, 'eppn': 'DENC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 8, 'eppn': 'DGA', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 9, 'eppn': 'FRNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 10, 'eppn': 'FRNC02', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 11, 'eppn': 'GRNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 12, 'eppn': 'HRNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 13, 'eppn': 'Henk Harmsen', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 14, 'eppn': 'IENC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 15, 'eppn': 'ITNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 16, 'eppn': 'LUNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 17, 'eppn': 'NLNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 18, 'eppn': 'PLNC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 19, 'eppn': 'SINC01', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 20, 'eppn': 'VCC11', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 21, 'eppn': 'VCC12', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 22, 'eppn': 'VCC22', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 23, 'eppn': 'VCC42', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 24, 'eppn': 'admin', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 25, 'eppn': 'consult', 'mayLogin': False, 'authority': 'legacy'}
{'_id': 26, 'eppn': 'suzan', 'email': 'suzan1@test.eu', 'mayLogin': True, 'authority': 'local', 'firstName': 'Suzan', 'lastName': 'Karelse'}
{'_id': 27, 'eppn': 'marie', 'email': 'suzan2@test.eu', 'mayLogin': True, 'authority': 'local', 'firstName': 'Marie', 'lastName': 'Pieterse'}
{'_id': 28, 'eppn': 'gertjan', 'email': 'gertjan@test.eu', 'mayLogin': False, 'authority': 'local', 'firstName': 'Gert Jan', 'lastName': 'Klein-Holgerink'}
{'_id': 29, 'eppn': 'lisa', 'email': 'lisa@test.eu', 'mayLogin': True, 'authority': 'local', 'firstName': 'Lisa', 'lastName': 'de Leeuw'}
{'_id': 30, 'eppn': 'dirk', 'email': 'dirk@test.eu', 'mayLogin': True, 'authority': 'local', 'firstName': 'Dirk', 'lastName': 'Roorda'}

Here are the countries:


In [34]:
for c in dbm.country.find({'isMember': True}):
    print(c)


{'_id': 'AT', 'name': 'Austria', 'isMember': True, 'latitude': '47.7', 'longitude': '15.11'}
{'_id': 'BE', 'name': 'Belgium', 'isMember': True, 'latitude': '51.3', 'longitude': '3.1'}
{'_id': 'HR', 'name': 'Croatia', 'isMember': True, 'latitude': '44.7', 'longitude': '15.6'}
{'_id': 'CY', 'name': 'Cyprus', 'isMember': True, 'latitude': '35.0', 'longitude': '32.8'}
{'_id': 'DK', 'name': 'Denmark', 'isMember': True, 'latitude': '55.6', 'longitude': '11.0'}
{'_id': 'FR', 'name': 'France', 'isMember': True, 'latitude': '46.5', 'longitude': '1.9'}
{'_id': 'DE', 'name': 'Germany', 'isMember': True, 'latitude': '51.0', 'longitude': '10.4'}
{'_id': 'GR', 'name': 'Greece', 'isMember': True, 'latitude': '38.0', 'longitude': '23.8'}
{'_id': 'IE', 'name': 'Ireland', 'isMember': True, 'latitude': '53.1', 'longitude': '-8.4'}
{'_id': 'IT', 'name': 'Italy', 'isMember': True, 'latitude': '41.6', 'longitude': '13.0'}
{'_id': 'LU', 'name': 'Luxembourg', 'isMember': True, 'latitude': '49.6', 'longitude': '6.1'}
{'_id': 'MT', 'name': 'Malta', 'isMember': True, 'latitude': '35.9', 'longitude': '14.4'}
{'_id': 'NL', 'name': 'Netherlands', 'isMember': True, 'latitude': '52.8', 'longitude': '5.8'}
{'_id': 'PT', 'name': 'Portugal', 'isMember': True, 'latitude': '38.7', 'longitude': '-9.0'}
{'_id': 'RS', 'name': 'Serbia', 'isMember': True, 'latitude': '44.0', 'longitude': '20.8'}
{'_id': 'SI', 'name': 'Slovenia', 'isMember': True, 'latitude': '46.2', 'longitude': '14.4'}
{'_id': 'PL', 'name': 'Poland', 'isMember': True, 'latitude': '52.3', 'longitude': '19.8'}

In [35]:
for c in dbm.contrib.distinct('country', {}):
    print(c)


{'_id': 'HR', 'value': 'Croatia'}
{'_id': 'LU', 'value': 'Luxembourg'}
{'_id': 'SI', 'value': 'Slovenia'}
{'_id': 'BE', 'value': 'Belgium'}
{'_id': 'GR', 'value': 'Greece'}
{'_id': 'RS', 'value': 'Serbia'}
{'_id': 'AT', 'value': 'Austria'}
{'_id': 'IT', 'value': 'Italy'}
{'_id': 'FR', 'value': 'France'}
{'_id': 'IE', 'value': 'Ireland'}
{'_id': 'DE', 'value': 'Germany'}
{'_id': 'NL', 'value': 'Netherlands'}
{'_id': 'DK', 'value': 'Denmark'}
{'_id': 'PL', 'value': 'Poland'}

Let us get related data: the typeContribution (formerly type_of_inkind) of all contributions. For each contribution we only need the related typeContribution records (their ids and values).


In [39]:
for d in dbm.contrib.find({}, {'typeContribution': True}).limit(10):
    print(d)


{'_id': 1, 'typeContribution': [{'_id': 1, 'value': 'Access'}]}
{'_id': 2, 'typeContribution': [{'_id': 1, 'value': 'Access'}]}
{'_id': 3, 'typeContribution': [{'_id': 6, 'value': 'Event'}]}
{'_id': 4, 'typeContribution': [{'_id': 12, 'value': 'DARIAH Coordination'}]}
{'_id': 5, 'typeContribution': [{'_id': 9, 'value': 'Cooperation'}]}
{'_id': 6, 'typeContribution': [{'_id': 1, 'value': 'Access'}]}
{'_id': 7, 'typeContribution': [{'_id': 6, 'value': 'Event'}]}
{'_id': 8, 'typeContribution': [{'_id': 9, 'value': 'Cooperation'}]}
{'_id': 9, 'typeContribution': [{'_id': 1, 'value': 'Access'}]}
{'_id': 10, 'typeContribution': [{'_id': 12, 'value': 'DARIAH Coordination'}]}

In [40]:
for d in dbm.contrib.find({}, {'country': True}).limit(10):
    print(d)


{'_id': 1, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 2, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 3, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 4, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 5, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 6, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 7, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 8, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 9, 'country': [{'_id': 'HR', 'value': 'Croatia'}]}
{'_id': 10, 'country': [{'_id': 'LU', 'value': 'Luxembourg'}]}

In [29]:
x = dict(_id=5, value='66')
y = dict(_id=5, value='66')
x == y


Out[29]:
True

In [ ]: