In [ ]:
import json
from pandas import DataFrame
from pymatgen import MPRester, Structure
from mpcontribs.client import Client
In [ ]:
# Load the matbench log10(K_VRH) dataset: a JSON payload holding 'data'
# rows and 'columns' headers, materialized as a pandas DataFrame.
with open('/global/homes/h/huck/log_kvrh', 'r') as fh:
    data = json.load(fh)
df = DataFrame(data['data'], columns=data['columns'])
In [ ]:
# Connection setup: the MPContribs project slug, the MPContribs API client,
# and the Materials Project REST interface (MPRester picks up its key from
# the user's pymatgen configuration when called without arguments).
project = 'matbench_v1'
mpr = MPRester()
client = Client('your-api-key-here')
Get project info:
In [ ]:
client.projects.get_entry(pk=project).result()
Create contributions with data and add structures:
In [ ]:
from itertools import islice

def chunks(data, SIZE=500):
    """Yield successive chunks of *data*, each holding at most SIZE items.

    Dicts are split into sub-dicts that preserve insertion order; lists
    (or any sliceable sequence) are sliced. The type dispatch is hoisted
    out of the loop instead of re-running isinstance per chunk, and the
    iterator is only built for the dict path where it is actually used.
    """
    if isinstance(data, dict):
        it = iter(data)
        for _ in range(0, len(data), SIZE):
            yield {k: data[k] for k in islice(it, SIZE)}
    else:
        for i in range(0, len(data), SIZE):
            yield data[i:i+SIZE]
In [ ]:
# Clean up: delete this project's existing contributions in batches of 250,
# looping until the API reports that none remain.
has_more = True
while has_more:
    deleted = client.contributions.delete_entries(project=project, _limit=250).result()
    print(deleted['count'], 'contributions deleted')
    has_more = deleted['has_more']
In [ ]:
# Main upload loop: for each dataset row, match the structure against the
# Materials Project, accumulate contribution payloads, and flush them to the
# API in batches of `batch_size`.
contributions, existing, uploaded = {}, [], None
batch_size = 500
for idx, (s, kvrh) in enumerate(zip(df['structure'], df['log10(K_VRH)'])):
    # Flush a full batch: create the contributions first, then attach their
    # structures using the IDs assigned by the server.
    if len(contributions) >= batch_size:
        for i, chunk in enumerate(chunks(contributions, SIZE=250)):
            contribs = [c['contrib'] for c in chunk.values()]
            created = client.contributions.create_entries(contributions=contribs).result()
            print(i, created['count'], 'contributions created')
            create_structures = []
            for contrib in created['data']:
                identifier = contrib['identifier']
                # BUGFIX: this inner loop previously rebound the outer row
                # variable `s`, so the row processed right after a flush used
                # a stale structure dict from the flushed chunk. A distinct
                # name keeps the outer `s` intact.
                for sdata in chunk[identifier]['structures']:
                    sdata['contribution'] = contrib['id']
                    create_structures.append(sdata)
            print('submit', len(create_structures), 'structures ...')
            for j, subchunk in enumerate(chunks(create_structures, SIZE=100)):
                created = client.structures.create_entries(structures=subchunk).result()
                print(j, created['count'], 'structures created')
        contributions.clear()
        existing.clear()
    # (Re)load the identifiers already on the server so they can be skipped;
    # runs on the first iteration and again after every flush.
    if not len(contributions) and not len(existing):
        has_more = True
        while has_more:
            skip = len(existing)
            contribs = client.contributions.get_entries(
                project=project, _skip=skip, _limit=250, _fields=['identifier']
            ).result()
            existing += [c['identifier'] for c in contribs['data']]
            has_more = contribs['has_more']
        uploaded = len(existing)
        print(uploaded, 'already uploaded.')
    # Fast-skip rows assumed (by position) to be already uploaded.
    if idx < uploaded:
        continue
    structure = Structure.from_dict(s)
    matches = mpr.find_structure(structure)
    if not matches:
        print('no match for idx', idx)
        continue
    identifier = matches[0]
    if identifier in existing:
        continue
    if identifier in contributions:
        print(idx, identifier, 'already parsed')
        continue
    contrib = {'project': project, 'identifier': identifier, 'data': {'K|VRH': kvrh}}
    sdct = dict(name=structure.composition.reduced_formula, label='2020/02/02')
    sdct.update(structure.as_dict())
    contributions[identifier] = {'contrib': contrib, 'structures': [sdct]}
Retrieve and download contributions:
In [ ]:
# Retrieve the first 100 contribution IDs for this project; the cell output
# compares the project-wide total against the number returned in this page.
contribs = client.contributions.get_entries(project=project, _fields=['id'], _limit=100).result()
contribs['total_count'], len(contribs['data'])