In [ ]:
import os, gzip, json
from time import sleep, time
from mpcontribs.client import Client
from pymatgen import Structure, MPRester
from urllib.request import urlretrieve
from monty.json import MontyDecoder
from itertools import groupby
from operator import itemgetter

In [ ]:
# Name of the MPContribs project every contribution below is attached to.
project = '2dmatpedia'
# Prefer an API key supplied through the environment so a real key never has to
# be saved inside the notebook; the placeholder is kept as the fallback so the
# cell behaves exactly as before when MPCONTRIBS_API_KEY is not set.
client = Client(os.environ.get('MPCONTRIBS_API_KEY', 'your-api-key-here'))
# MPRester is constructed without an explicit key.
mpr = MPRester()

Create project (once)


In [ ]:
# Visibility flag reused for the project entry, contributions and structures.
is_public = True

# Project metadata. The description literal is written as an indented paragraph;
# split()/join collapses its line breaks and indentation into single spaces.
info = dict(
    project=project,
    is_public=is_public,
    title='2DMatPedia',
    long_title='2D Materials Encyclopedia',
    owner='migueldiascosta@nus.edu.sg',
    authors='M. Dias Costa, F.Y. Ping, Z. Jun',
    description=' '.join('''
    We start from the around 80000 inorganic compounds in the Materials Project database. A geometry-based
    algorithm [PRL] was used to identify layered structures among these compounds. Two-dimensional (2D)
    materials were theoretically exfoliated by extracting one cluster in the standard conventional unit cell
    of the layered structures screened in the above steps. A 20 Å vacuum along the c axis was imposed to
    minimize the interactions of image slabs by periodic condition. Structure matcher tools from Pymatgen were
    used to find duplicates of the exfoliated 2D materials. The standard workflow developed by the Materials
    Project was used to perform high-throughput calculations for all the layered bulk and 2D materials screened
    in this project. The calculations were performed by density functional theory as implemented in the Vienna
    Ab Initio Simulation Package (VASP) software with Perdew-Burke-Ernzerhof (PBE) approximation for the
    exchange-correlation functional and the frozen-core all-electron projector-augmented wave (PAW) method for
    the electron-ion interaction. The cutoff energy for the plane wave expansion was set to 520 eV.
    '''.split()),
    urls=dict(
        WWW='http://www.2dmatpedia.org',
        PRL='https://doi.org/10.1103/PhysRevLett.118.106101',
    ),
)
# One-off project creation and a read-back check; left disabled on purpose.
# client.projects.create_entry(project=info).result()
# client.projects.get_entry(pk=project, _fields=['_all']).result()

Download source data


In [ ]:
# Source location plus the mapping from raw-record fields to contribution columns.
# Each column spec carries the column 'name' and, for quantities, a 'unit'.
# The 'name' of the material_id column ('details') is also the key of the URL
# template stored in this same dict.
config = {
    'file': 'http://www.2dmatpedia.org/static/db.json.gz',
    'details': 'http://www.2dmatpedia.org/2dmaterials/doc/{}',
    'columns': {
        field: ({'name': header, 'unit': unit} if unit else {'name': header})
        for field, header, unit in [
            ('material_id', 'details', None),
            ('exfoliation_energy_per_atom', 'Eₓ', 'eV'),
            ('energy_per_atom', 'E', 'meV'),
            ('energy_vdw_per_atom', 'ΔE|vdW', 'meV'),
            ('bandgap', 'ΔE', 'meV'),
        ]
    },
}

In [ ]:
raw_data = []  # decoded records, one per non-empty line of the source file
# Local file name is the last path component of the download URL.
dbfile = config['file'].rsplit('/', 1)[-1]
if not os.path.exists(dbfile):
    print('downloading', dbfile, '...')
    # Download under a temporary name and rename once complete, so that an
    # interrupted transfer never leaves a truncated file which later runs would
    # mistake for a finished download.
    partial = dbfile + '.part'
    urlretrieve(config['file'], partial)
    os.replace(partial, dbfile)

# The file is gzip-compressed with one JSON document per line.
with gzip.open(dbfile, 'rb') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        raw_data.append(json.loads(line, cls=MontyDecoder))

len(raw_data)

In [ ]:
# get data dict from raw data entry
def get_data(rd, identifier):
    """Build the contribution data dict for one raw data entry.

    Iterates over ``config['columns']`` and, for each configured field:

    * ``material_id`` is turned into a URL by formatting the template stored in
      ``config`` under the column's name;
    * fields with a unit are emitted as ``'<value> <unit>'`` strings, but only
      when the value can be converted to ``float``; missing (``None``) or
      non-numeric values are skipped;
    * fields without a unit are copied through unchanged.

    Fields absent from ``rd`` are skipped (except ``material_id``, which is
    required).

    Args:
        rd: mapping holding one raw record.
        identifier: accepted for interface compatibility; not used here.

    Returns:
        dict mapping column names to values.

    Raises:
        KeyError: if ``rd`` has no ``material_id``.
    """
    data = {}
    for k, col in config['columns'].items():
        hdr, unit = col['name'], col.get('unit')
        if k == 'material_id':
            data[hdr] = config[hdr].format(rd[k])
        elif k in rd:
            value = rd[k]
            if unit:
                try:
                    float(value)
                except (TypeError, ValueError):
                    # TypeError: value is None or another non-numeric type;
                    # ValueError: text that does not parse as a number.
                    continue
                data[hdr] = f'{value} {unit}'
            else:
                data[hdr] = value
    return data

In [ ]:
# groupby only merges adjacent items, so the records are sorted by the same key first.
raw_data_sorted = sorted(raw_data, key=itemgetter('source_id'))

# source_id -> list of all raw records sharing that source_id
raw_data_grouped = {
    source_id: list(members)
    for source_id, members in groupby(raw_data_sorted, key=itemgetter('source_id'))
}

print(len(raw_data_sorted), len(raw_data_grouped))

In [ ]:
# identifier -> {'contrib': contribution payload, 'structures': structure dicts}
contributions = {}
for idx, (task_id, objects) in enumerate(raw_data_grouped.items()):
    if idx % 500 == 0:
        print(idx, task_id)  # progress marker every 500 groups

    # Only identifiers with an 'mp-' or 'mvc-' prefix are turned into contributions.
    if not task_id.startswith(('mp-', 'mvc-')):
        continue
    if task_id in contributions:
        continue

    data_by_label, structures, seen_names = {}, [], set()
    for pos, entry in enumerate(objects):
        formula = entry['structure'].composition.reduced_formula
        # A formula already used within this group gets the position appended
        # so structure names stay distinct.
        if formula in seen_names:
            name = f'{formula}|{pos}'
        else:
            name = formula
        seen_names.add(name)

        struct_dict = entry['structure'].as_dict()
        struct_dict.update(label='2D', name=name, is_public=is_public)
        structures.append(struct_dict)

        row = get_data(entry, task_id)
        row['formula'] = formula
        data_by_label[f'2D|{pos}'] = row

    contributions[task_id] = {
        'contrib': {
            'project': project,
            'is_public': is_public,
            'identifier': task_id,
            'data': data_by_label,
        },
        'structures': structures,
    }

print(len(contributions))

In [ ]:
from itertools import islice

def chunks(data, SIZE=100):
    """Yield consecutive pieces of ``data`` holding at most ``SIZE`` items each.

    A dict is split into smaller dicts that keep the original key order; any
    other input is sliced with ``data[start:start + SIZE]``. Empty input yields
    nothing.
    """
    key_iter = iter(data)
    is_mapping = isinstance(data, dict)
    for start in range(0, len(data), SIZE):
        if not is_mapping:
            yield data[start:start + SIZE]
            continue
        # Pull the next SIZE keys off the shared iterator for this piece.
        piece = {}
        for key in islice(key_iter, SIZE):
            piece[key] = data[key]
        yield piece

In [ ]:
def get_entries():
    """Return the IDs of contributions in ``project``, requesting at most 100 per call."""
    response = client.contributions.get_entries(
        project=project, _limit=100, _fields=['id']
    ).result()
    return [entry['id'] for entry in response['data']]

# Remove the project's existing contributions batch by batch until none are listed.
cids = get_entries()
while cids:
    cnt = client.contributions.delete_entries(id__in=cids).result()['count']
    print(cnt, 'contributions deleted')
    if not cnt:
        # Entries were listed but none were removed: the next query would list
        # them again, so stop here rather than loop forever.
        print('no contributions deleted - stopping')
        break
    cids = get_entries()

In [ ]:
# Submit contributions chunk by chunk, then attach each chunk's structures to the
# contribution IDs returned for it.
for i, chunk in enumerate(chunks(contributions)):
    payload = [item['contrib'] for item in chunk.values()]
    created = client.contributions.create_entries(contributions=payload).result()
    print(i, created['count'], 'contributions created')

    create_structures = []
    for contrib in created['data']:
        pending = chunk[contrib['identifier']]['structures']
        for s in pending:
            # Link the structure to the contribution that was just created.
            s['contribution'] = contrib['id']
            create_structures.append(s)

    print('submit', len(create_structures), 'structures ...')
    for j, subchunk in enumerate(chunks(create_structures)):
        created_structures = client.structures.create_entries(structures=subchunk).result()
        print(j, created_structures['count'], 'structures created')

In [ ]: