In [ ]:
from mpcontribs.client import Client
import gzip, json, os
import numpy as np
from pandas import DataFrame
In [ ]:
is_public = True
project = 'carrier_transport'
client = Client('your-api-key-here')
Get and update project info (see https://portal.mpcontribs.org/carrier_transport)
In [ ]:
# inspect the current project entry, or update its metadata:
# client.projects.get_entry(pk=project, _fields=['_all']).result()
# client.projects.update_entry(pk=project, project={'long_title': 'Electronic Transport Properties'}).result()
Create contributions with tables
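The loop below assembles one payload per material: a `contrib` dict holding the scalar data, plus a list of `tables`. A minimal sketch of the two shapes being built (illustrative placeholder values only, not real data):
In [ ]:
# illustrative payload shapes only; the loop below fills these from the input files
example_contrib = {
    'project': project,
    'identifier': 'mp-149',   # hypothetical MP identifier for illustration
    'is_public': is_public,
    'data': {'type': 'GGA', 'metal': 'No', 'ΔE': '1.1 eV'},  # placeholder values
}
example_table = {
    'name': 'S(p)',           # property(doping type)
    'is_public': is_public,
    'columns': ['T [K]', '1e+18 cm⁻³ [µV/K]'],
    'data': [['300', '100.0']],  # one placeholder row
    # the 'contribution' id is attached after the contribution is created
}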
In [ ]:
input_dir = '/project/projectdirs/matgen/fricci/transport_data/coarse'
variables = [
{'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'},
{'key': 'seebeck_doping', 'name': 'S', 'unit': 'µV/K'},
{'key': 'cond_doping', 'name': 'σ', 'unit': '1/Ω/m/s'},
]
eigs_keys = ['ε₁', 'ε₂', 'ε₃', 'ε̄']
props = {
'seebeck_doping': ['S', 'µV/K'],
'cond_doping': ['σ', '1/Ω/m/s'],
'kappa_doping': ['κₑ', 'W/K/m/s']
}
pfkey = '⟨S²σ⟩'
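The parsing loop further down reads each gzipped JSON file with nested lookups; the layout those lookups imply is roughly the following (reconstructed from the code, not an authoritative schema):
In [ ]:
# assumed layout of one input file, inferred from the parsing code below:
# {
#     'task_id': {'GGA': 'mp-...'},
#     'gap': {'GGA': 1.1},          # band gap in eV
#     'volume': 45.6,               # cell volume in ų
#     'GGA': {                      # keyed by task type
#         'cond_eff_mass': {'p': {'300': {'1e+18': [m1, m2, m3]}}, 'n': {...}},
#         'seebeck_doping': {'p': {'300': {'1e+18': {'eigs': [...], 'tensor': [[...]]}},
#                                  '400': {...}},
#                            'n': {...}},
#         'cond_doping': {...},     # same nesting as seebeck_doping
#         'kappa_doping': {...},
#     },
# }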
In [ ]:
files = [x for x in os.scandir(input_dir) if x.is_file()]
len(files)
In [ ]:
from itertools import islice
def chunks(data, SIZE=500):
    """Yield successive SIZE-item chunks from a dict or a list."""
    it = iter(data)
    for i in range(0, len(data), SIZE):
        if isinstance(data, dict):
            # dicts: consume SIZE keys at a time from the iterator
            yield {k: data[k] for k in islice(it, SIZE)}
        else:
            # lists: plain slicing
            yield data[i:i+SIZE]
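A quick sanity check of `chunks` on both container types (not part of the upload flow):
In [ ]:
# chunking a list slices it; chunking a dict consumes keys in order
list(chunks(list(range(5)), SIZE=2))                # [[0, 1], [2, 3], [4]]
list(chunks({i: i**2 for i in range(5)}, SIZE=2))   # [{0: 0, 1: 1}, {2: 4, 3: 9}, {4: 16}]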
In [ ]:
contributions, existing = {}, []
batch_size = 750
total = len(files)
for idx, obj in enumerate(files):
    if not idx % 1000:
        print(idx, len(contributions))
    # flush the buffer in batches; NOTE: this runs before the current file is
    # parsed, so a final flush after the loop picks up the remainder (see cell below)
    if len(contributions) >= batch_size or idx == total - 1:
        for i, chunk in enumerate(chunks(contributions, SIZE=250)):
            contribs = [c['contrib'] for c in chunk.values()]
            created = client.contributions.create_entries(contributions=contribs).result()
            print(i, created['count'], 'contributions created')
            # point each table at its freshly created contribution before upload
            create_tables = []
            for contrib in created['data']:
                identifier = contrib['identifier']
                for t in chunk[identifier]['tables']:
                    t['contribution'] = contrib['id']
                    create_tables.append(t)
            print('submit', len(create_tables), 'tables ...')
            for j, subchunk in enumerate(chunks(create_tables, SIZE=100)):
                created = client.tables.create_entries(tables=subchunk).result()
                print(j, created['count'], 'tables created')
        contributions.clear()
        existing.clear()
    if not contributions and not existing:
        # (re)build the list of already-uploaded identifiers so they can be skipped
        has_more = True
        while has_more:
            skip = len(existing)
            contribs = client.contributions.get_entries(
                project=project, _skip=skip, _limit=250, _fields=['identifier']
            ).result()
            existing += [c['identifier'] for c in contribs['data']]
            has_more = contribs['has_more']
        print(len(existing), 'already uploaded.')
    # extract the MP identifier from the file name
    identifier = obj.name.split('.', 1)[0].rsplit('_', 1)[-1]
    if not identifier.startswith(('mp-', 'mvc-')):
        print(idx, identifier, 'not valid')
        continue
if identifier in existing:
continue
if identifier in contributions:
print(idx, identifier, 'already parsed')
continue
with gzip.open(obj.path, 'rb') as input_file:
data = json.loads(input_file.read())
contrib = {'project': project, 'identifier': identifier, 'is_public': is_public, 'data': {}}
    task_type, gap = next(iter(data['gap'].items()))
    contrib['data']['task'] = next(iter(data['task_id'].values()))
contrib['data']['type'] = task_type
contrib['data']['metal'] = 'Yes' if gap < 0.1 else 'No'
contrib['data']['T'] = '300 K'
contrib['data']['doplvl'] = '1e18 cm⁻³'
contrib['data']['ΔE'] = ' '.join([str(gap), 'eV'])
contrib['data']['V'] = ' '.join([str(data['volume']), 'ų'])
    S2 = {}  # squared Seebeck tensor per doping type, needed for the power factor
    for v in variables:
        for doping_type in ['p', 'n']:
            d = data[task_type][v['key']].get(doping_type, {}).get('300', {}).get('1e+18', {})
            if d:
                # effective mass arrives as a bare eigenvalue list; S and σ carry eigs + full tensor
                eigs = d if isinstance(d, list) else d['eigs']
                key = '|'.join([v['name'], doping_type])
                contrib['data'][key] = dict(
                    (eigs_keys[neig], ' '.join([str(eig), v['unit']]))
                    for neig, eig in enumerate(eigs)
                )
                contrib['data'][key][eigs_keys[-1]] = ' '.join([str(np.mean(eigs)), v['unit']])
                if v['key'] == 'seebeck_doping':
                    S2[doping_type] = np.dot(d['tensor'], d['tensor'])
                elif v['key'] == 'cond_doping' and doping_type in S2:
                    # power factor ⟨S²σ⟩: mean eigenvalue of S²·σ;
                    # (µV/K)² · 1/Ω/m/s → ×1e-8 converts to µW/cm/K²/s
                    pf = np.mean(np.linalg.eigh(np.dot(S2[doping_type], d['tensor']))[0]) * 1e-8
                    contrib['data'].setdefault(pfkey, {})[doping_type] = ' '.join([str(pf), 'µW/cm/K²/s'])
# build data and max values for seebeck, conductivity and kappa
tables = []
for prop_name, (label, unit) in props.items():
for doping_type in ['p', 'n']:
prop = data[task_type][prop_name][doping_type]
prop_averages, dopings, columns = [], None, ['T [K]']
temps = sorted(map(int, prop.keys()))
for temp in temps:
row = [temp]
if dopings is None:
dopings = sorted(map(float, prop[str(temp)].keys()))
for doping in dopings:
doping_str = f'{doping:.0e}'
if len(columns) <= len(dopings):
columns.append(f'{doping_str} cm⁻³ [{unit}]')
eigs = prop[str(temp)][doping_str]['eigs']
row.append(np.mean(eigs))
prop_averages.append(row)
table_name = f'{label}({doping_type})'
np_prop_averages = np.array(prop_averages)
df = DataFrame(np_prop_averages, columns=columns)
            df = df.astype(str)  # table cells are uploaded as strings
table = df.to_dict(orient='split')
table.pop('index')
table['name'] = table_name
table['is_public'] = is_public
tables.append(table)
            # extremal average over the (T, doping) grid: max in general, but min for
            # n-type Seebeck (most negative) and for κₑ (lower thermal conductivity)
            arr_prop_avg = np_prop_averages[:, 1:]
            max_v = np.max(arr_prop_avg)
            if (prop_name[0] == 's' and doping_type == 'n') or prop_name[0] == 'k':
                max_v = np.min(arr_prop_avg)
            arg_max = np.argwhere(arr_prop_avg == max_v)[0]
            elabel = label + 'ᵉ'
            edoping_type = 'ⁿ' if doping_type == 'n' else 'ᵖ'
            contrib['data'][elabel] = {
                doping_type: ' '.join([str(max_v), unit]),
                f'T{edoping_type}': ' '.join([str(temps[arg_max[0]]), 'K']),
                f'c{edoping_type}': ' '.join([str(dopings[arg_max[1]]), 'cm⁻³']),
            }
contributions[identifier] = {'contrib': contrib, 'tables': tables}
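The in-loop flush fires before the current file is parsed, so contributions buffered on the final iterations are never submitted. A final flush after the loop, mirroring the submission block above, picks up the remainder:
In [ ]:
# final flush: submit whatever is still buffered after the loop finishes
for i, chunk in enumerate(chunks(contributions, SIZE=250)):
    contribs = [c['contrib'] for c in chunk.values()]
    created = client.contributions.create_entries(contributions=contribs).result()
    print(i, created['count'], 'contributions created')
    create_tables = []
    for contrib in created['data']:
        identifier = contrib['identifier']
        for t in chunk[identifier]['tables']:
            t['contribution'] = contrib['id']
            create_tables.append(t)
    print('submit', len(create_tables), 'tables ...')
    for j, subchunk in enumerate(chunks(create_tables, SIZE=100)):
        created = client.tables.create_entries(tables=subchunk).result()
        print(j, created['count'], 'tables created')
contributions.clear()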
In [ ]:
# clean up (CAREFUL!): deletes ALL contributions in the project, 250 per call
has_more = True
while has_more:
resp = client.contributions.delete_entries(project=project, _limit=250).result()
print(resp['count'], 'contributions deleted')
has_more = resp['has_more']