In [ ]:
import os, json, tarfile, sys
from time import sleep, time
from mpcontribs.client import Client
from pymatgen import Structure, MPRester
from urllib.request import urlretrieve
from monty.json import MontyDecoder
from itertools import groupby
from operator import itemgetter
from emmet.vasp.materials import group_structures
from collections import defaultdict
In [ ]:
# MPContribs project slug, plus API clients for MPContribs and the Materials Project.
# NOTE(review): Client() is constructed with a placeholder key — replace before running.
project = 'jarvis_dft'
client = Client('your-api-key-here')
mpr = MPRester()
Create project (once)
In [ ]:
is_public = True
# Project metadata; registered once with MPContribs via the commented-out call below.
info = {
    'project': project,
    'is_public': is_public,
    'title': 'JARVIS-DFT',
    'owner': 'kamal.choudhary@nist.gov',
    'authors': 'K. Choudhary, F. Tavazza',
    # split() with no argument collapses ALL whitespace runs (spaces and newlines).
    # The previous `.replace('\n', '')` fused words across line breaks
    # (e.g. "thousands\nof" -> "thousandsof") before splitting; fixed here.
    'description': ' '.join('''
        The DFT section of JARVIS (JARVIS-DFT) consists of thousands
        of VASP based calculations for 3D-bulk, single layer (2D), nanowire (1D) and
        molecular (0D) systems. Most of the calculations are carried out with optB88vDW functional.
        JARVIS-DFT includes materials data such as: energetics, diffraction pattern, radial distribution
        function, band-structure, density of states, carrier effective mass, temperature and carrier
        concentration dependent thermoelectric properties, elastic constants and gamma-point phonons.
    '''.split()),
    'urls': {
        'JARVIS': 'http://www.ctcms.nist.gov/~knc6/JVASP.html',
        'SciRep': 'https://doi.org/10.1038/s41598-017-05402-0',
        'GitHub': 'https://github.com/usnistgov/jarvis'
    }
}
# client.projects.create_entry(project=info).result()
Retrieve and update project info
In [ ]:
# Fetch every field of the project entry to confirm it exists and inspect its metadata.
client.projects.get_entry(pk=project, _fields=['_all']).result()
In [ ]:
# Update the displayed long title and refresh the description from the `info` dict above.
client.projects.update_entry(pk=project, project={
    'long_title': '2D & 3D DFT Calculations by NIST',
    'description': info['description']
}).result()
Source data
In [ ]:
# Source archives, details-page URL template, and the mapping from raw JARVIS
# keys to MPContribs column names/units.
config = {
    'files': [
        f'https://www.ctcms.nist.gov/~knc6/jdft_{dim}.json.tgz'
        for dim in ('3d', '2d')
    ],
    'details': 'https://www.ctcms.nist.gov/~knc6/jsmol/{}.html',
    'columns': {  # 'mpid'
        'jid': {'name': 'details'},
        'exfoliation_en': {'name': 'Eₓ', 'unit': 'eV'},
        'form_enp': {'name': 'ΔH', 'unit': 'eV'},
        'fin_en': {'name': 'E', 'unit': 'meV'},
        'op_gap': {'name': 'ΔE|vdW', 'unit': 'meV'},
        'mbj_gap': {'name': 'ΔE|mbj', 'unit': 'meV'},
        'kv': {'name': 'Kᵥ', 'unit': 'GPa'},
        'gv': {'name': 'Gᵥ', 'unit': 'GPa'},
        # 'magmom': {'name': 'μ', 'unit': 'μᵇ'}
    }
}
In [ ]:
# Download (once) and load each raw JARVIS archive into memory.
raw_data = {} # as read from raw files
for url in config['files']:
    dbfile = url.rsplit('/')[-1]
    # skip the download when a cached copy already exists locally
    if not os.path.exists(dbfile):
        print('downloading', dbfile, '...')
        urlretrieve(url, dbfile)
    print(dbfile, 'loading ...')
    with tarfile.open(dbfile, "r:gz") as tar:
        # each archive is assumed to contain a single JSON member — TODO confirm
        member = tar.getmembers()[0]
        # MontyDecoder rebuilds pymatgen objects (e.g. structures) from the JSON
        raw_data[dbfile] = json.load(tar.extractfile(member), cls=MontyDecoder)
    print(dbfile, len(raw_data[dbfile]))
Create contributions
In [ ]:
def get_contrib(rd, identifier, cfg=None, proj=None, public=None):
    """Build an MPContribs contribution dict for one MP identifier.

    Args:
        rd: mapping of dataset label (e.g. '2D', '3D') to the raw JARVIS record.
        identifier: Materials Project identifier for the contribution.
        cfg: column configuration; defaults to the module-level ``config``.
        proj: project name; defaults to the module-level ``project``.
        public: visibility flag; defaults to the module-level ``is_public``.

    Returns:
        dict shaped for ``client.contributions.create_entry``.
    """
    # backward-compatible defaults: fall back to the notebook-level globals
    if cfg is None:
        cfg = config
    if proj is None:
        proj = project
    if public is None:
        public = is_public
    contrib = {'project': proj, 'is_public': public, 'identifier': identifier, 'data': {}}
    for t in rd:
        contrib['data'][t] = {}
        for k, col in cfg['columns'].items():
            hdr, unit = col['name'], col.get('unit')
            if k == 'jid':
                # 'jid' becomes a link built from the details-page URL template
                contrib['data'][t][hdr] = cfg[hdr].format(rd[t][k])
            elif k in rd[t]:
                if unit:
                    # skip non-numeric placeholders; also guard against None,
                    # which raises TypeError (not ValueError) in float()
                    try:
                        float(rd[t][k])
                    except (TypeError, ValueError):
                        continue
                contrib['data'][t][hdr] = f'{rd[t][k]} {unit}' if unit else rd[t][k]
    return contrib
In [ ]:
# Merge records from the 2d and 3d datasets under their shared MP identifier;
# identifiers present in both datasets are processed first.
raw_data_grouped = defaultdict(dict)
for dim in ('2d', '3d'):
    archive = f'jdft_{dim}.json.tgz'
    records_by_mpid = sorted(raw_data[archive], key=itemgetter('mpid'))
    # groupby needs its input pre-sorted by the same key
    for mpid, records in groupby(records_by_mpid, key=itemgetter('mpid')):
        # keep only valid Materials Project / MVC identifiers
        if mpid.startswith('mp-') or mpid.startswith('mvc-'):
            raw_data_grouped[mpid][dim] = list(records)
    print(dim, len(records_by_mpid), len(raw_data_grouped))
# sort so identifiers with both dimensionalities (len 2) come before singletons
task_ids = sorted(raw_data_grouped, key=lambda mpid: len(raw_data_grouped[mpid]), reverse=True)
In [ ]:
# Upload loop: resolve each task id to its canonical materials id, build the
# contribution, then delete any stale entries and recreate contribution + structures.
for idx, task_id in enumerate(task_ids):
    # cap this run at the first 500 grouped entries
    if idx >= 500:
        break
    try:
        # map a (possibly deprecated) task id onto its current materials id
        identifier = mpr.get_materials_id_from_task_id(task_id)
    except Exception as ex:
        print(idx, task_id, 'invalid')
        continue
    # crude progress/timing report every 25 entries
    if not idx%25:
        if idx > 0:
            stop = time()
            duration = stop-start
            print(duration)
        start = time()
        print(idx, task_id, identifier)
    rd, structures = {}, []
    for t, objects in raw_data_grouped[task_id].items():
        # scalar data comes from the first record of each dimensionality
        rd[t.upper()] = objects[0]
        # deduplicate the final structures before attaching them
        for i, g in enumerate(group_structures([d['final_str'] for d in objects])):
            comp = g[0].composition.reduced_formula
            structures.append({'label': t.upper(), 'name': f'{comp}-{t}-{i}', 'is_public': is_public})
            structures[-1].update(g[0].as_dict())
    contrib = get_contrib(rd, identifier)
    # retry up to 3 times with linearly increasing back-off on API errors
    ntries = 0
    while ntries < 3:
        try:
            # remove any pre-existing contributions for this identifier first
            for d in client.contributions.get_entries(
                project=project, identifier=identifier, _fields=['id'],
            ).result()['data']:
                client.contributions.delete_entry(pk=d['id']).result()
                #print(idx, d['id'], 'deleted')
            cid = client.contributions.create_entry(contribution=contrib).result()['id']
            #print(client.contributions.get_entry(pk=cid, _fields=['_all']).result())
            #print(idx, cid, 'created')
            for sdct in structures:
                sdct['contribution'] = cid
                sid = client.structures.create_entry(structure=sdct).result()['id']
                #print(idx, 'structure', sid, 'created')
            break
        except Exception as ex:
            ntries += 1
            print('waiting', ntries*30, 'seconds...')
            sleep(ntries*30)
            print(idx, identifier, 'continue ...')
    else:
        # while/else: all retries exhausted without a break — abort the
        # whole upload (this break leaves the outer for loop)
        print('I give up.')
        break