In [ ]:
import os, json, tarfile, sys
from time import sleep, time
from mpcontribs.client import Client
from pymatgen import Structure, MPRester
from urllib.request import urlretrieve
from monty.json import MontyDecoder
from itertools import groupby
from operator import itemgetter
from emmet.vasp.materials import group_structures
from collections import defaultdict

In [ ]:
# Name of the MPContribs project that every cell below reads and writes.
project = 'jarvis_dft'
# Prefer an API key supplied through the MPCONTRIBS_API_KEY environment
# variable so that a real key never has to be saved inside the notebook; the
# original placeholder string is kept as the fallback value.
client = Client(os.environ.get('MPCONTRIBS_API_KEY', 'your-api-key-here'))
# MPRester is constructed without an explicit key here.
mpr = MPRester()

Create project (once)


In [ ]:
# Visibility flag shared by the project entry and by every contribution and
# structure created further down.
is_public = True

# Metadata describing the project entry.
info = {
    'project': project,
    'is_public': is_public,
    'title': 'JARVIS-DFT',
    'owner': 'kamal.choudhary@nist.gov',
    'authors': 'K. Choudhary, F. Tavazza',
    # Collapse the indented multi-line text into one single-spaced paragraph.
    # str.split() without arguments already treats newlines as whitespace, so
    # no separate newline removal is needed (removing them first would glue
    # two words together on any line that lacked leading indentation).
    'description': ' '.join('''
        The DFT section of JARVIS (JARVIS-DFT) consists of thousands
        of VASP based calculations for 3D-bulk, single layer (2D), nanowire (1D) and
        molecular (0D) systems. Most of the calculations are carried out with optB88vDW functional.
        JARVIS-DFT includes materials data such as: energetics, diffraction pattern, radial distribution
        function, band-structure, density of states, carrier effective mass, temperature and carrier
        concentration dependent thermoelectric properties, elastic constants and gamma-point phonons.
    '''.split()),
    'urls': {
        'JARVIS': 'http://www.ctcms.nist.gov/~knc6/JVASP.html',
        'SciRep': 'https://doi.org/10.1038/s41598-017-05402-0',
        'GitHub': 'https://github.com/usnistgov/jarvis'
    }
}
# One-time project creation; intentionally left disabled.
# client.projects.create_entry(project=info).result()

Retrieve and update project info


In [ ]:
# Fetch the stored entry for `project`, requesting the '_all' field selector.
# Being the last expression of the cell, the result is shown as cell output.
client.projects.get_entry(pk=project, _fields=['_all']).result()

In [ ]:
# Push a long title and the description text from `info` to the existing
# project entry; the result of the request is shown as cell output.
client.projects.update_entry(
    pk=project,
    project=dict(
        long_title='2D & 3D DFT Calculations by NIST',
        description=info['description'],
    ),
).result()

Source data


In [ ]:
# Source-data settings:
#   files   - archive URLs, one per dataset flavour ('3d' first, then '2d')
#   details - URL template that is filled with a record's 'jid'
#   columns - raw record key -> display name and optional unit
config = dict(
    files=list(map(
        "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz".format,
        ['3d', '2d'],
    )),
    details="https://www.ctcms.nist.gov/~knc6/jsmol/{}.html",
    columns={  # 'mpid'
        'jid': dict(name='details'),
        'exfoliation_en': dict(name='Eₓ', unit='eV'),
        'form_enp': dict(name='ΔH', unit='eV'),
        'fin_en': dict(name='E', unit='meV'),
        'op_gap': dict(name='ΔE|vdW', unit='meV'),
        'mbj_gap': dict(name='ΔE|mbj', unit='meV'),
        'kv': dict(name='Kᵥ', unit='GPa'),
        'gv': dict(name='Gᵥ', unit='GPa'),
        #'magmom': {'name': 'μ', 'unit': 'μᵇ'}
    },
)

In [ ]:
# Download each archive listed in config['files'] (unless a local copy already
# exists) and load the JSON document stored in its first member.
raw_data = {}  # as read from raw files, keyed by archive file name
for url in config['files']:
    dbfile = url.rsplit('/', 1)[-1]
    if not os.path.exists(dbfile):
        print('downloading', dbfile, '...')
        # Download under a temporary name and rename only after success, so an
        # interrupted transfer cannot leave a truncated archive behind that a
        # later run would mistake for a complete download.
        partial = dbfile + '.part'
        urlretrieve(url, partial)
        os.replace(partial, dbfile)

    print(dbfile, 'loading ...')
    with tarfile.open(dbfile, "r:gz") as tar:
        # Only the first archive member is read.
        member = tar.getmembers()[0]
        extracted = tar.extractfile(member)
        if extracted is None:
            # extractfile() yields None for members that are not regular files.
            raise ValueError(f'{dbfile}: first archive member {member.name!r} is not a regular file')
        with extracted:
            raw_data[dbfile] = json.load(extracted, cls=MontyDecoder)

    print(dbfile, len(raw_data[dbfile]))

Create contributions


In [ ]:
def get_contrib(rd, identifier):
    """Build the contribution dict for one identifier.

    Args:
        rd: mapping of a label (the keys are used verbatim as section names
            under 'data') to a raw record dict.
        identifier: value stored under the contribution's 'identifier' key.

    Returns:
        dict with 'project', 'is_public', 'identifier' and 'data'. For every
        label, 'data' holds the columns configured in ``config['columns']``:
        the 'jid' column becomes a URL built from the matching template in
        ``config``, columns with a unit become ``'<value> <unit>'`` strings,
        and columns without a unit keep the raw value. Columns missing from a
        record are omitted.

    Raises:
        KeyError: if a record has no 'jid' entry.
    """
    contrib = {'project': project, 'is_public': is_public, 'identifier': identifier, 'data': {}}
    for t, record in rd.items():
        section = {}
        contrib['data'][t] = section
        for k, col in config['columns'].items():
            hdr, unit = col['name'], col.get('unit')
            if k == 'jid':
                # the column's display name doubles as the key of the URL template in config
                section[hdr] = config[hdr].format(record[k])
            elif k in record:
                value = record[k]
                if unit:
                    try:
                        float(value)
                    except (TypeError, ValueError):
                        # Skip values that are not numeric: float() raises
                        # ValueError for non-numeric strings and TypeError for
                        # None and other non-numeric types.
                        continue
                    section[hdr] = f'{value} {unit}'
                else:
                    section[hdr] = value
    return contrib

In [ ]:
# Collect the 2d and 3d records under their shared 'mpid'. Only ids starting
# with 'mp-' or 'mvc-' are kept. Ids present in both datasets are listed first.
raw_data_grouped = defaultdict(dict)
by_mpid = itemgetter('mpid')
for t in ('2d', '3d'):
    # groupby only merges adjacent items, hence the sort on the same key
    entries = sorted(raw_data[f'jdft_{t}.json.tgz'], key=by_mpid)
    for task_id, objects in groupby(entries, key=by_mpid):
        if task_id.startswith(('mp-', 'mvc-')):
            raw_data_grouped[task_id][t] = list(objects)
    print(t, len(entries), len(raw_data_grouped))


def _n_datasets(task_id):
    """Number of datasets ('2d'/'3d') that contain the given task id."""
    return len(raw_data_grouped[task_id])


task_ids = sorted(raw_data_grouped, key=_n_datasets, reverse=True)

In [ ]:
# Upload one contribution (plus its structures) per task id.
max_contribs = 500  # only the first 500 task ids are processed
max_tries = 3       # attempts per contribution before the whole run is aborted

# Initialised before the loop so the timing report below always has a start
# time, even when the first entry is skipped as invalid.
start = time()

for idx, task_id in enumerate(task_ids):
    if idx >= max_contribs:
        break

    try:
        identifier = mpr.get_materials_id_from_task_id(task_id)
    except Exception as ex:
        # skip ids that cannot be resolved, but show why
        print(idx, task_id, 'invalid', repr(ex))
        continue

    # every 25th index: print the seconds elapsed since the previous report
    if not idx%25:
        if idx > 0:
            stop = time()
            duration = stop-start
            print(duration)
        start = time()
        print(idx, task_id, identifier)

    # rd: first record per dataset label; structures: one entry per group
    # returned by group_structures for that dataset
    rd, structures = {}, []
    for t, objects in raw_data_grouped[task_id].items():
        rd[t.upper()] = objects[0]
        for i, g in enumerate(group_structures([d['final_str'] for d in objects])):
            # the first structure of each group is the one that gets uploaded
            comp = g[0].composition.reduced_formula
            structures.append({'label': t.upper(), 'name': f'{comp}-{t}-{i}', 'is_public': is_public})
            structures[-1].update(g[0].as_dict())

    contrib = get_contrib(rd, identifier)

    ntries = 0
    while ntries < max_tries:
        try:
            # remove existing contributions for this identifier first, so that
            # re-runs and retries do not leave duplicates behind
            for d in client.contributions.get_entries(
                project=project, identifier=identifier, _fields=['id'],
            ).result()['data']:
                client.contributions.delete_entry(pk=d['id']).result()

            cid = client.contributions.create_entry(contribution=contrib).result()['id']

            for sdct in structures:
                sdct['contribution'] = cid
                client.structures.create_entry(structure=sdct).result()

            break
        except Exception as ex:
            ntries += 1
            print(idx, identifier, 'failed:', repr(ex))
            # back off with an increasing delay; no point in waiting once the
            # last attempt has failed
            if ntries < max_tries:
                print('waiting', ntries*30, 'seconds...')
                sleep(ntries*30)
                print(idx, identifier, 'continue ...')
    else:
        # all attempts failed: stop processing further task ids
        print('I give up.')
        break