In [ ]:
import os, gzip, json
from time import sleep, time
from mpcontribs.client import Client
from pymatgen.core import Structure
from pymatgen.ext.matproj import MPRester
from urllib.request import urlretrieve
from monty.json import MontyDecoder
from itertools import groupby
from operator import itemgetter
In [ ]:
project = '2dmatpedia'
client = Client('your-api-key-here')
mpr = MPRester()
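The API key can also be picked up from the environment rather than hard-coded (a sketch; recent versions of mpcontribs-client fall back to the MPCONTRIBS_API_KEY environment variable when no key is passed):
In [ ]:
# alternative: rely on the MPCONTRIBS_API_KEY environment variable
# (assumes it is exported in the shell running this notebook)
# client = Client()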
Create project (once)
In [ ]:
is_public = True
info = {
    'project': project,
    'is_public': is_public,
    'title': '2DMatPedia',
    'long_title': '2D Materials Encyclopedia',
    'owner': 'migueldiascosta@nus.edu.sg',
    'authors': 'M. Dias Costa, F.Y. Ping, Z. Jun',
    'description': ' '.join('''
        We start from approximately 80,000 inorganic compounds in the Materials Project database. A geometry-based
        algorithm [PRL] was used to identify layered structures among these compounds. Two-dimensional (2D)
        materials were theoretically exfoliated by extracting one cluster from the standard conventional unit cell
        of the layered structures screened in the steps above. A 20 Å vacuum layer along the c axis was imposed to
        minimize interactions between periodic images of the slabs. Structure matcher tools from pymatgen were
        used to find duplicates among the exfoliated 2D materials. The standard workflow developed by the Materials
        Project was used to perform high-throughput calculations for all the layered bulk and 2D materials screened
        in this project. The calculations were performed with density functional theory as implemented in the Vienna
        Ab initio Simulation Package (VASP), using the Perdew-Burke-Ernzerhof (PBE) approximation for the
        exchange-correlation functional and the frozen-core all-electron projector-augmented wave (PAW) method for
        the electron-ion interaction. The cutoff energy for the plane-wave expansion was set to 520 eV.
    '''.split()),
    'urls': {
        'WWW': 'http://www.2dmatpedia.org',
        'PRL': 'https://doi.org/10.1103/PhysRevLett.118.106101'
    }
}
# client.projects.create_entry(project=info).result()
# client.projects.get_entry(pk=project, _fields=['_all']).result()
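If the project entry already exists, individual fields can be patched instead of recreating it (a sketch; assumes the swagger client exposes update_entry as in other MPContribs example notebooks):
In [ ]:
# patch selected fields of an existing project entry (run as needed)
# client.projects.update_entry(pk=project, project={'title': info['title']}).result()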
Download source data
In [ ]:
config = {
    'file': 'http://www.2dmatpedia.org/static/db.json.gz',
    'details': 'http://www.2dmatpedia.org/2dmaterials/doc/{}',
    'columns': {  # energies in the raw db are reported in eV
        'material_id': {'name': 'details'},
        'exfoliation_energy_per_atom': {'name': 'Eₓ', 'unit': 'eV'},
        'energy_per_atom': {'name': 'E', 'unit': 'eV'},
        'energy_vdw_per_atom': {'name': 'ΔE|vdW', 'unit': 'eV'},
        'bandgap': {'name': 'ΔE', 'unit': 'eV'},
    }
}
In [ ]:
raw_data = []  # records as read from the raw db file
dbfile = config['file'].rsplit('/', 1)[-1]
if not os.path.exists(dbfile):
    print('downloading', dbfile, '...')
    urlretrieve(config['file'], dbfile)

with gzip.open(dbfile, 'rb') as f:
    for line in f:  # one JSON document per line
        raw_data.append(json.loads(line, cls=MontyDecoder))

len(raw_data)
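It helps to peek at the fields available in a raw record before extracting data from it (a quick sanity check; the exact field names depend on the downloaded database version):
In [ ]:
# list the fields of the first raw record
sorted(raw_data[0].keys())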
In [ ]:
# build a contribution's data dict from a raw data entry
def get_data(rd, identifier):
    data = {}
    for k, col in config['columns'].items():
        hdr, unit = col['name'], col.get('unit')
        if k == 'material_id':
            # link back to the material's detail page on 2dmatpedia.org
            data[hdr] = config[hdr].format(rd[k])
        elif k in rd:
            if unit:
                try:
                    float(rd[k])
                except (TypeError, ValueError):
                    continue  # skip missing or non-numeric values
            data[hdr] = f'{rd[k]} {unit}' if unit else rd[k]
    return data
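A quick way to check the column mapping is to run get_data on a single raw entry (illustrative only; the output depends on the record):
In [ ]:
# preview the extracted data dict for the first raw entry
get_data(raw_data[0], raw_data[0].get('material_id'))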
In [ ]:
# group the exfoliated 2D materials by their bulk source material
raw_data_sorted = sorted(raw_data, key=itemgetter('source_id'))
raw_data_grouped = {}
for task_id, objects in groupby(raw_data_sorted, key=itemgetter('source_id')):
    raw_data_grouped[task_id] = list(objects)

print(len(raw_data_sorted), len(raw_data_grouped))
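Each group collects all 2D materials exfoliated from the same bulk source; the distribution of group sizes makes a quick sanity check (a sketch over the grouped dict built above):
In [ ]:
from collections import Counter
# how many sources yielded 1, 2, 3, ... exfoliated 2D structures
Counter(len(v) for v in raw_data_grouped.values())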
In [ ]:
contributions = {}

for idx, (task_id, objects) in enumerate(raw_data_grouped.items()):
    if not idx % 500:
        print(idx, task_id)
    # only keep sources with valid Materials Project identifiers
    valid = task_id.startswith(('mp-', 'mvc-'))
    if valid and task_id not in contributions:
        contrib = {'project': project, 'is_public': is_public, 'identifier': task_id, 'data': {}}
        structures = []
        names = set()
        for j, d in enumerate(objects):
            structure = d['structure']  # pymatgen Structure decoded by MontyDecoder
            comp = structure.composition.reduced_formula
            # disambiguate repeated reduced formulas within the same group
            name = f'{comp}|{j}' if comp in names else comp
            names.add(name)
            structure = structure.as_dict()
            structure.update({'label': '2D', 'name': name, 'is_public': is_public})
            structures.append(structure)
            label = f'2D|{j}'
            contrib['data'][label] = get_data(d, task_id)
            contrib['data'][label]['formula'] = comp
        contributions[task_id] = {'contrib': contrib, 'structures': structures}

print(len(contributions))
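Before submitting, it is worth eyeballing one prepared contribution (illustrative; any identifier would do):
In [ ]:
# preview the data dict of one prepared contribution
next(iter(contributions.values()))['contrib']['data']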
In [ ]:
from itertools import islice

def chunks(data, SIZE=100):
    """Yield successive chunks of SIZE items from a list or dict."""
    it = iter(data)
    for i in range(0, len(data), SIZE):
        if isinstance(data, dict):
            # dicts: take the next SIZE keys from the iterator
            yield {k: data[k] for k in islice(it, SIZE)}
        else:
            # lists: plain slicing
            yield data[i:i+SIZE]
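A minimal check that the helper chunks both lists and dicts as intended:
In [ ]:
# expected: ([[0, 1], [2, 3], [4]], [{0: 0, 1: 1}, {2: 2, 3: 3}, {4: 4}])
list(chunks(list(range(5)), SIZE=2)), list(chunks({i: i for i in range(5)}, SIZE=2))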
In [ ]:
# delete any pre-existing contributions for this project, 100 at a time
def get_entries():
    return [c['id'] for c in client.contributions.get_entries(
        project=project, _limit=100, _fields=['id']
    ).result()['data']]

cids = get_entries()
while cids:
    cnt = client.contributions.delete_entries(id__in=cids).result()['count']
    print(cnt, 'contributions deleted')
    cids = get_entries()
In [ ]:
for i, chunk in enumerate(chunks(contributions)):
    contribs = [c['contrib'] for c in chunk.values()]
    created = client.contributions.create_entries(contributions=contribs).result()
    print(i, created['count'], 'contributions created')

    # attach each structure to the contribution it belongs to
    create_structures = []
    for contrib in created['data']:
        identifier = contrib['identifier']
        for s in chunk[identifier]['structures']:
            s['contribution'] = contrib['id']
            create_structures.append(s)

    print('submit', len(create_structures), 'structures ...')
    for j, subchunk in enumerate(chunks(create_structures)):
        created = client.structures.create_entries(structures=subchunk).result()
        print(j, created['count'], 'structures created')
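After submission, a page of the newly created contributions can be fetched back as a spot check (a sketch reusing the same get_entries call as the deletion cell above):
In [ ]:
# spot-check: fetch a few of the submitted contributions
resp = client.contributions.get_entries(
    project=project, _limit=5, _fields=['id', 'identifier']
).result()
len(resp['data']), resp['data']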