In [ ]:
from mpcontribs.client import Client
import gzip, json, os
import numpy as np
from pandas import DataFrame
from collections import defaultdict

In [ ]:
is_public = True
project = 'carrier_transport'
client = Client('your-api-key-here')  # replace with your MPContribs API key

Get and update project info (see https://portal.mpcontribs.org/carrier_transport)


In [ ]:
# client.projects.get_entry(pk=project, _fields=['_all']).result()
# client.projects.update_entry(pk=project, project={'long_title': 'Electronic Transport Properties'}).result()

Create contributions with tables


In [ ]:
input_dir = '/project/projectdirs/matgen/fricci/transport_data/coarse'
variables = [
    {'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'},
    {'key': 'seebeck_doping', 'name': 'S', 'unit': 'µV/K'},
    {'key': 'cond_doping', 'name': 'σ', 'unit': '1/Ω/m/s'},
]
eigs_keys = ['ε₁', 'ε₂', 'ε₃', 'ε̄']
props = {
    'seebeck_doping': ['S', 'µV/K'],
    'cond_doping': ['σ', '1/Ω/m/s'],
    'kappa_doping': ['κₑ', 'W/K/m/s']
}
pfkey = '⟨S²σ⟩'
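
The parsing loop below assumes each gzipped file holds one JSON document whose top-level keys include 'task_id', 'gap', 'volume', and one task-type block. A minimal sketch of the assumed layout, inferred from the access patterns in the loop (hypothetical values, not the authoritative schema):


In [ ]:
# hypothetical sketch of the per-material JSON structure the loop expects
example = {
    'task_id': {'GGA': 'mp-1234'},
    'gap': {'GGA': 1.23},  # band gap in eV, keyed by task type
    'volume': 45.6,        # unit cell volume in ų
    'GGA': {
        'cond_eff_mass': {'p': {'300': {'1e+18': [0.4, 0.5, 0.6]}}},
        'seebeck_doping': {'p': {'300': {'1e+18': {
            'eigs': [100.0, 110.0, 120.0],
            'tensor': [[100.0, 0, 0], [0, 110.0, 0], [0, 0, 120.0]],
        }}}},
    },
}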

In [ ]:
files = [x for x in os.scandir(input_dir) if x.is_file()]
len(files)

In [ ]:
from itertools import islice

def chunks(data, SIZE=500):
    """Yield successive SIZE-d chunks from a dict or a list."""
    it = iter(data)
    for i in range(0, len(data), SIZE):
        if isinstance(data, dict):
            # consume the next SIZE keys from the shared iterator
            yield {k: data[k] for k in islice(it, SIZE)}
        else:
            yield data[i:i+SIZE]
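
A quick check of the helper on toy data (dict input consumes keys via the shared iterator; list input slices by index):


In [ ]:
list(chunks([1, 2, 3, 4, 5], SIZE=2))           # -> [[1, 2], [3, 4], [5]]
list(chunks({'a': 1, 'b': 2, 'c': 3}, SIZE=2))  # -> [{'a': 1, 'b': 2}, {'c': 3}]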

In [ ]:
contributions, existing = {}, []
batch_size = 750
total = len(files)

for idx, obj in enumerate(files):
    if not idx%1000:
        print(idx, len(contributions))               

    if len(contributions) >= batch_size or idx == total-1:
        for i, chunk in enumerate(chunks(contributions, SIZE=250)):
            contribs = [c['contrib'] for c in chunk.values()]
            created = client.contributions.create_entries(contributions=contribs).result()
            print(i, created['count'], 'contributions created')    

            create_tables = []
            for contrib in created['data']:
                identifier = contrib['identifier']
                for t in chunk[identifier]['tables']:
                    t['contribution'] = contrib['id']
                    create_tables.append(t)

            print('submit', len(create_tables), 'tables ...')
            for j, subchunk in enumerate(chunks(create_tables, SIZE=100)):
                created = client.tables.create_entries(tables=subchunk).result()
                print(j, created['count'], 'tables created')

        contributions.clear()
        existing.clear()
    
    # (re)fetch the identifiers already uploaded to the project, paginating via has_more
    if not contributions and not existing:
        has_more = True
        while has_more:
            skip = len(existing)
            contribs = client.contributions.get_entries(
                project=project, _skip=skip, _limit=250, _fields=['identifier']
            ).result()
            existing += [c['identifier'] for c in contribs['data']]
            has_more = contribs['has_more']

        print(len(existing), 'already uploaded.')

    # the MP identifier is the text after the last underscore and before the first dot
    identifier = obj.name.split('.', 1)[0].rsplit('_', 1)[-1]

    valid = identifier.startswith(('mp-', 'mvc-'))
    if not valid:
        print(idx, identifier, 'not valid')
        continue
    if identifier in existing:
        continue
    if identifier in contributions:
        print(idx, identifier, 'already parsed')
        continue
        
    with gzip.open(obj.path, 'rb') as input_file:
        data = json.loads(input_file.read())
        contrib = {'project': project, 'identifier': identifier, 'is_public': is_public, 'data': {}}
        task_type, gap = next(iter(data['gap'].items()))  # first task type and its band gap
        contrib['data']['task'] = list(data['task_id'].values())[0]
        contrib['data']['type'] = task_type
        contrib['data']['metal'] = 'Yes' if gap < 0.1 else 'No'
        contrib['data']['T'] = '300 K'
        contrib['data']['doplvl'] = '1e18 cm⁻³'
        contrib['data']['ΔE'] = ' '.join([str(gap), 'eV'])
        contrib['data']['V'] = ' '.join([str(data['volume']), 'ų'])
        
        seebeck_sq = {}  # S·S per doping type, needed for the power factor
        for v in variables:
            for doping_type in ['p', 'n']:
                d = data[task_type][v['key']].get(doping_type, {}).get('300', {}).get('1e+18', {})

                if d:
                    # some entries are a bare eigenvalue list; others carry 'eigs' and 'tensor'
                    eigs = d if isinstance(d, list) else d['eigs']
                    key = '|'.join([v['name'], doping_type])
                    contrib['data'][key] = dict(
                        (eigs_keys[neig], ' '.join([str(eig), v['unit']]))
                        for neig, eig in enumerate(eigs)
                    )
                    contrib['data'][key][eigs_keys[-1]] = ' '.join([str(np.mean(eigs)), v['unit']])
                    if v['key'] == 'seebeck_doping':
                        seebeck_sq[doping_type] = np.dot(d['tensor'], d['tensor'])
                    elif v['key'] == 'cond_doping' and doping_type in seebeck_sq:
                        # power factor ⟨S²σ⟩: pair S² with σ of the *same* doping type
                        pf = np.mean(np.linalg.eigh(np.dot(seebeck_sq[doping_type], d['tensor']))[0]) * 1e-8
                        contrib['data'].setdefault(pfkey, {})[doping_type] = ' '.join([str(pf), 'µW/cm/K²/s'])
                        
        # build per-temperature tables and extremal values for Seebeck, conductivity, and κₑ
        tables = []
        for prop_name, (label, unit) in props.items():
            for doping_type in ['p', 'n']:
                prop = data[task_type][prop_name][doping_type]
                prop_averages, dopings, columns = [], None, ['T [K]']
                temps = sorted(map(int, prop.keys()))
                for temp in temps:
                    row = [temp]
                    if dopings is None:
                        dopings = sorted(map(float, prop[str(temp)].keys()))
                    for doping in dopings:
                        doping_str = f'{doping:.0e}'
                        if len(columns) <= len(dopings):
                            # build the header once: one column per doping level
                            columns.append(f'{doping_str} cm⁻³ [{unit}]')
                        # average the tensor eigenvalues into a scalar per (T, doping) point
                        eigs = prop[str(temp)][doping_str]['eigs']
                        row.append(np.mean(eigs))
                    prop_averages.append(row)

                table_name = f'{label}({doping_type})'
                np_prop_averages = np.array(prop_averages)
                df = DataFrame(np_prop_averages, columns=columns)
                for col in df.columns:
                    df[col] = df[col].astype(str)  # stringify cells for upload
                # 'split' orientation yields {'columns': [...], 'data': [...]}
                table = df.to_dict(orient='split')
                table.pop('index')
                table['name'] = table_name
                table['is_public'] = is_public
                tables.append(table)

                # locate the extremal value over the temperature × doping grid;
                # n-type Seebeck is negative and κₑ is reported at its minimum, so take np.min there
                arr_prop_avg = np_prop_averages[:, 1:]
                max_v = np.max(arr_prop_avg)
                if (prop_name[0] == 's' and doping_type == 'n') or prop_name[0] == 'k':
                    max_v = np.min(arr_prop_avg)
                arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                elabel = label + 'ᵉ'
                edoping_type = 'ⁿ' if doping_type == 'n' else 'ᵖ'
                contrib['data'][elabel] = {
                    doping_type: ' '.join([str(max_v), unit]),
                    f'T{edoping_type}': ' '.join([str(temps[arg_max[0]]), 'K']),
                    f'c{edoping_type}': ' '.join([str(dopings[arg_max[1]]), 'cm⁻³']),
                }
                
        contributions[identifier] = {'contrib': contrib, 'tables': tables}

# submit whatever is still queued after the last file
# (the in-loop check only fires once a full batch has accumulated)
if contributions:
    submit_batch(contributions)
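
The ⟨S²σ⟩ entries above are power factors: the Seebeck tensor squared, contracted with the conductivity tensor, eigenvalue-averaged, and rescaled by 1e-8 to convert µV/K and 1/Ω/m/s inputs into µW/cm/K²/s. A standalone sanity check with made-up diagonal tensors (not dataset values):


In [ ]:
# standalone power-factor check (hypothetical tensors)
S = np.diag([100.0, 110.0, 120.0])    # Seebeck tensor in µV/K
sigma = np.diag([1e5, 1.1e5, 1.2e5])  # conductivity tensor in 1/Ω/m/s
S2 = np.dot(S, S)
np.mean(np.linalg.eigh(np.dot(S2, sigma))[0]) * 1e-8  # power factor in µW/cm/K²/s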

In [ ]:
# clean up, i.e. delete ALL contributions in the project (CAREFUL!)
has_more = True
while has_more:
    resp = client.contributions.delete_entries(project=project, _limit=250).result()
    print(resp['count'], 'contributions deleted')
    has_more = resp['has_more']
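
Before wiping, it can help to count what is there; this assumes the list response carries a total_count field alongside data and has_more:


In [ ]:
# read-only count of contributions in the project ('total_count' is assumed here)
resp = client.contributions.get_entries(project=project, _limit=1, _fields=['id']).result()
print(resp.get('total_count'), 'contributions in', project)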