In [ ]:
import os, json, tarfile, sys
from time import sleep, time
from mpcontribs.client import Client
from pymatgen import Structure, MPRester
from urllib.request import urlretrieve
from monty.json import MontyDecoder
from itertools import groupby
from operator import itemgetter
from emmet.vasp.materials import group_structures
from collections import defaultdict

In [ ]:
project = 'jarvis_dft'
# Never hardcode credentials in a notebook — read the API key from the
# environment instead (set MPCONTRIBS_API_KEY before launching Jupyter).
client = Client(os.environ['MPCONTRIBS_API_KEY'])
# MPRester() with no argument reads the MP API key from ~/.pmgrc.yaml or
# the PMG_MAPI_KEY environment variable.
mpr = MPRester()

Create project (once)

In [ ]:
is_public = True
# Project metadata for the one-time create call below.
# NOTE(review): the closing braces and several field values of this dict were
# lost when the notebook was flattened to text — reconstructed here; confirm
# owner/urls against the original notebook before running create_entry.
info = {
    'project': project,
    'is_public': is_public,
    'title': 'JARVIS-DFT',
    'owner': '',  # TODO(review): owner email lost in extraction — fill in
    'authors': 'K. Choudhary, F. Tavazza',
    # collapse the triple-quoted text into a single space-separated line
    'description': ' '.join('''
        The DFT section of JARVIS (JARVIS-DFT) consists of thousands
        of VASP based calculations for 3D-bulk, single layer (2D), nanowire (1D) and
        molecular (0D) systems. Most of the calculations are carried out with optB88vdW functional.
        JARVIS-DFT includes materials data such as: energetics, diffraction pattern, radial distribution
        function, band-structure, density of states, carrier effective mass, temperature and carrier
        concentration dependent thermoelectric properties, elastic constants and gamma-point phonons.
    '''.replace('\n', '').split()),
    'urls': {
        # TODO(review): URL values lost in extraction — restore before create
        'JARVIS': '',
        'SciRep': '',
        'GitHub': '',
    },
}
# client.projects.create_entry(project=info).result()

Retrieve and update project info

In [ ]:
# Fetch the full current project document (all fields) for inspection.
client.projects.get_entry(pk=project, _fields=['_all']).result()

In [ ]:
# Patch selected project fields; the closing `}).result()` was lost in the
# flattened source — restored here so the call is syntactically valid.
client.projects.update_entry(pk=project, project={
    'long_title': '2D & 3D DFT Calculations by NIST',
    'description': info['description'],
}).result()

Source data

In [ ]:
# Source-file locations and column definitions for the contributions.
# NOTE(review): the list expression of the "files" comprehension and the
# closing brackets were lost in extraction; the URL template below is
# reconstructed from the download filenames used later (jdft_{t}.json.tgz)
# — confirm against the original notebook.
config = {
    "files": [
        f"https://www.ctcms.nist.gov/~knc6/jdft_{t}.json.tgz"  # TODO confirm URL
        for t in ['3d', '2d']
    ],
    # template for per-material details pages; formatted with the JARVIS id
    "details": "{}.html",
    'columns': {  # 'mpid'
        'jid': {'name': 'details'},
        'exfoliation_en': {'name': 'Eₓ', 'unit': 'eV'},
        'form_enp': {'name': 'ΔH', 'unit': 'eV'},
        'fin_en': {'name': 'E', 'unit': 'meV'},
        'op_gap': {'name': 'ΔE|vdW', 'unit': 'meV'},
        'mbj_gap': {'name': 'ΔE|mbj', 'unit': 'meV'},
        'kv': {'name': 'Kᵥ', 'unit': 'GPa'},
        'gv': {'name': 'Gᵥ', 'unit': 'GPa'},
        #'magmom': {'name': 'μ', 'unit': 'μᵇ'}
    },
}

In [ ]:
raw_data = {}  # raw JSON payloads keyed by local archive filename
for url in config['files']:
    dbfile = url.rsplit('/')[-1]
    # download once; re-runs reuse the cached archive on disk
    if not os.path.exists(dbfile):
        print('downloading', dbfile, '...')
        urlretrieve(url, dbfile)

    print(dbfile, 'loading ...')
    # BUG FIX: the original line read `with, "r:gz") as tar:` — the
    # `tarfile.open(dbfile, ...)` call had been lost in extraction.
    with tarfile.open(dbfile, "r:gz") as tar:
        # each archive contains a single JSON member
        member = tar.getmembers()[0]
        raw_data[dbfile] = json.load(tar.extractfile(member), cls=MontyDecoder)
    print(dbfile, len(raw_data[dbfile]))

Create contributions

In [ ]:
def get_contrib(rd, identifier):
    """Build one MPContribs contribution dict for a material.

    rd maps dimensionality labels (e.g. '2D'/'3D') to a raw JARVIS record;
    identifier is the MP materials id. Columns come from the module-level
    `config['columns']`; `project` and `is_public` are module-level globals.
    """
    contrib = {'project': project, 'is_public': is_public, 'identifier': identifier, 'data': {}}
    for t in rd:
        contrib['data'][t] = {}
        for k, col in config['columns'].items():
            hdr, unit = col['name'], col.get('unit')
            if k == 'jid':
                # 'jid' maps to a details-page URL built from config['details']
                contrib['data'][t][hdr] = config[hdr].format(rd[t][k])
            elif k in rd[t]:
                if unit:
                    # NOTE(review): the try/float guard was truncated in the
                    # source — reconstructed: skip non-numeric values (e.g.
                    # 'na'), since only numbers can carry a unit.
                    try:
                        float(rd[t][k])
                    except ValueError:
                        continue
                contrib['data'][t][hdr] = f'{rd[t][k]} {unit}' if unit else rd[t][k]
    return contrib

In [ ]:
# merge mp-ids from 2d and 3d datasets, prioritize by availability of both
# Merge the 2d and 3d datasets by MP id; materials present in both datasets
# sort first (prioritized below by number of dimensionalities available).
raw_data_grouped = defaultdict(dict)
for dim in ['2d', '3d']:
    fn = f'jdft_{dim}.json.tgz'
    # groupby requires its input sorted on the grouping key
    records_by_mpid = sorted(raw_data[fn], key=itemgetter('mpid'))
    for mpid, entries in groupby(records_by_mpid, key=itemgetter('mpid')):
        # keep only valid Materials Project identifiers
        if mpid.startswith(('mp-', 'mvc-')):
            raw_data_grouped[mpid][dim] = list(entries)
    print(dim, len(records_by_mpid), len(raw_data_grouped))
# ids covered by both 2d and 3d come first
task_ids = sorted(raw_data_grouped, key=lambda k: len(raw_data_grouped[k]), reverse=True)

In [ ]:
# NOTE(review): several control-flow lines of this cell (break/continue/try/
# sleep/retry handling) were lost when the notebook was flattened — they are
# reconstructed below and marked; confirm against the original notebook.
for idx, task_id in enumerate(task_ids):
    if idx >= 500:  # cap contributions per run
        break  # reconstructed
    try:  # reconstructed
        identifier = mpr.get_materials_id_from_task_id(task_id)
    except Exception as ex:
        print(idx, task_id, 'invalid')
        continue  # reconstructed: skip ids that don't resolve

    if not idx % 25:  # progress/timing checkpoint every 25 materials
        if idx > 0:
            stop = time()
            duration = stop - start
            print(f'{duration:.1f}s for last 25')  # TODO confirm original output
        start = time()
        print(idx, task_id, identifier)

    rd, structures = {}, []
    for t, objects in raw_data_grouped[task_id].items():
        rd[t.upper()] = objects[0]
        # deduplicate final structures within this dimensionality
        for i, g in enumerate(group_structures([d['final_str'] for d in objects])):
            comp = g[0].composition.reduced_formula
            structures.append({'label': t.upper(), 'name': f'{comp}-{t}-{i}', 'is_public': is_public})
    contrib = get_contrib(rd, identifier)

    ntries = 0
    while ntries < 3:
        try:  # reconstructed
            # remove pre-existing contributions for this identifier so re-runs
            # are idempotent
            for d in client.contributions.get_entries(
                project=project, identifier=identifier, _fields=['id'],
            ).result()['data']:
                client.contributions.delete_entry(pk=d['id']).result()
                #print(idx, d['id'], 'deleted')

            cid = client.contributions.create_entry(contribution=contrib).result()['id']
            #print(client.contributions.get_entry(pk=cid, _fields=['_all']).result())
            #print(idx, cid, 'created')

            for sdct in structures:
                sdct['contribution'] = cid
                sid = client.structures.create_entry(structure=sdct).result()['id']
                #print(idx, 'structure', sid, 'created')
            break  # reconstructed: success — stop retrying
        except Exception as ex:
            ntries += 1
            print('waiting', ntries*30, 'seconds...')
            sleep(ntries*30)  # reconstructed: back off before retrying
            print(idx, identifier, 'continue ...')
    else:
        # reconstructed: all retries exhausted for this identifier
        print('I give up.')