In [ ]:
from __future__ import print_function
from shutil import copyfile
import os, subprocess, pipes, re, json, glob, yaml, sys
from collections import Counter
from tqdm import tqdm
from fireworks.core.launchpad import LaunchPad
from boltons.tbutils import ParsedException
from pymongo import MongoClient
GARDEN = '/global/projecta/projectdirs/matgen/garden/'

In [ ]:
lpad = LaunchPad.from_file('my_launchpad.yaml')
lpad.fireworks.count()

In [ ]:
materials_prod_config_path = 'materials_db_prod.yaml'
with open(materials_prod_config_path, 'r') as materials_prod_config_file:
    config = yaml.safe_load(materials_prod_config_file)

In [ ]:
conn = MongoClient(config['host'], config['port'], j=False)
db_jp = conn[config['db']]
db_jp.authenticate(config['user_name'], config['password'])
db_jp.materials.count()

In [ ]:
vasp_config = json.load(open('tasks_db.json'))
vasp_conn = MongoClient(vasp_config['host'], vasp_config['port'], j=False)
db_vasp = vasp_conn[vasp_config['database']]
db_vasp.authenticate(vasp_config['readonly_user'], vasp_config['readonly_password'])
db_vasp.tasks.count()

In [ ]:
!cat tasks_db.json

In [ ]:
def print_categories(cats):
    # count fireworks per error category and show the 10 most common ones
    c = Counter({k: len(v) for k, v in cats.items()})
    top10 = c.most_common(10)
    total = 0
    for k, v in top10:
        print(v, '\t', cats[k], '\t', k)
        total += v
    print(total, 'fireworks in top-10 categories')
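# usage sketch (hypothetical data): keys are error categories, values are lists of fw_ids
# print_categories({'VaspRunXMLValidator': [1001, 1002], 'walltime': [1003]})
# prints one line per category (count, fw_ids, category key) plus the top-10 total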

In [ ]:
states = ["RUNNING", "WAITING", "FIZZLED", "READY", "COMPLETED", "RESERVED", "ARCHIVED", "DEFUSED", "PAUSED"]
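
a quick per-state tally of all fireworks (a sketch; it reuses the cursor.count() idiom used throughout this notebook)


In [ ]:
for state in states:
    print(state, lpad.fireworks.find({'state': state}).count())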

general: fizzled workflows and the corresponding list of fireworks


In [ ]:
user_remarks = [
    "new ICSD batch", "Pauling file", "Heusler ABC2 phases",
    "proton conducting materials for fuel cells", "solid solution metal", "solid solution oxide", "intermetallic",
    "CNGMD Nitrides", "MAGICS calculation of band structures of 2D TMDC stacked heterostructures"
]

fw_ids for user-submitted workflows


In [ ]:
#user_query = {"spec.task_type": "Add to SNL database", "spec.snl.about.remarks": "MP user submission"}
user_query = {"spec.task_type": "Add to SNL database", "spec.snl.about.remarks": user_remarks[-3]}
fw_ids_user = lpad.fireworks.find(user_query, {'fw_id': 1, '_id': 0}).distinct('fw_id')
print(len(fw_ids_user), 'user-submitted workflows')

pause controller, defuse/fizzle workflows with >20 nodes


In [ ]:
counter = Counter()
for root_fw_id in fw_ids_user:
#     print(root_fw_id)
    wflows = list(lpad.workflows.find({'nodes': root_fw_id}, ['nodes', 'state', 'links']))
    if len(wflows) > 1:
        print('\tmultiple workflows for', root_fw_id)
        continue
    wf = wflows[0]
    fws = {}
    for fw in lpad.fireworks.find(
        {'fw_id': {'$in': wf['nodes']}}, {'fw_id': 1, 'spec.task_type': 1, 'state': 1, '_id': 0}
    ):
        fw_id = fw.pop('fw_id')
        fws[fw_id] = fw

    for fw_id, fw in fws.items():
        # pause controller tasks (problems with band structure calcs)
#         if fw['spec']['task_type'] == 'Controller: add Electronic Structure v2' and \
#         fw['state'] in ['WAITING', 'READY']:
#             lpad.pause_fw(fw_id)
#             fws[fw_id]['state'] = 'PAUSED'
#             print('\tpaused', fw_id)
        # defuse workflows with more than 20 tasks (endless SO?)
        if wf['state'] != 'COMPLETED' and len(wf['nodes']) > 20 and \
        fw['state'] not in ['COMPLETED', 'DEFUSED', 'PAUSED']:
            try:
                lpad.defuse_fw(fw_id)
                fws[fw_id]['state'] = 'DEFUSED'
                print('\tdefused', fw_id)
            except Exception as ex:
                print('\tdefusing', fw_id, 'failed:', str(ex))
                lpad.fireworks.update_one({'fw_id': fw_id}, {"$set":{"state":"FIZZLED"}})                
                print('\t', fw_id, 'set to FIZZLED')
            
    if fws[root_fw_id]['state'] == 'COMPLETED':
        current_fw_id = root_fw_id
        while 1:
            daughters = wf['links'][str(current_fw_id)]
            if not daughters:
                raise ValueError('reached a leaf without finding the controller task')
            if len(daughters) == 1:
                #print('daughter:', current_fw_id, daughters[0], fws[daughters[0]]['spec']['task_type'])
                if fws[daughters[0]]['spec']['task_type'] == 'Controller: add Electronic Structure v2':
                    counter[fws[current_fw_id]['state']] += 1
                    break
                else:
                    current_fw_id = daughters[0]
            else:
                so_task_found = False
                for daughter in daughters:
                    if fws[daughter]['spec']['task_type'] == 'GGA optimize structure (2x)':
                        current_fw_id = daughter
                        so_task_found = True
                        break
                if not so_task_found:
                    raise ValueError('SO task not found!')
    else:
        counter[fws[root_fw_id]['state']] += 1

print(counter)
print('total =', sum(counter.values()))

In [ ]:
vw_fws = {}
for state in states:
    vw_fws[state] = list(lpad.fireworks.find({
        "state": state, "$or": [
            {"spec.snl.about.remarks": "solid solution metal"},
            {"spec.mpsnl.about.remarks": "solid solution metal"}
        ]
    }, ['spec.task_type', 'fw_id']))
    if vw_fws[state]:
        print(state, len(vw_fws[state]))
        if state in ['RUNNING', 'READY', 'RESERVED']:
            print(Counter(fw['spec']['task_type'] for fw in vw_fws[state]))

prioritize user-submitted "Add to SNL" tasks to get duplicate checking done


In [ ]:
priority_user_query = {
    "$and": [
        {"$or": [
            {"spec.snl.about.remarks": {"$in": ["MP user submission"], "$nin": user_remarks}},
            {"spec.mpsnl.about.remarks": {"$in": ["MP user submission"], "$nin": user_remarks}},    
        ]}, {"$or": [
            {"spec.prev_vasp_dir": {"$exists": 0}},
            {"spec.prev_vasp_dir": {"$regex": "/oasis/"}},        
        ]}      
    ]
}
priority_user_fws = {}
for state in states:
    if state == 'READY':
        state_query = {'state': state}
        state_query.update(priority_user_query)
        priority_user_fws[state] = list(lpad.fireworks.find(state_query, {
                    "fw_id": 1, "spec.task_type": 1, "spec.prev_vasp_dir": 1, "_id": 0}))
        nr_fws = len(priority_user_fws[state])
        if nr_fws > 0:
            add_to_snl = []
            for d in priority_user_fws[state]:
                if d['spec']['task_type'] == 'Add to SNL database':
                    add_to_snl.append(d['fw_id'])
            print(' '.join(map(str, add_to_snl)))
            print('{} {} user-submitted XSEDE tasks'.format(nr_fws, state))
print('DONE')

percentage of workflows in each state


In [ ]:
# 118151 = {Ti,Zr,Hf}-Zn-N piezoelectricity study -> ALL COMPLETED 2017-01-24
# 114781 = Kitchaev Workflows
# 115780 = Heusler ABC2 phases
# 89070 = Silvana Botti Perovskite Structures
submission_group_id = 89070
query = {'nodes': {'$in': fw_ids_user}}
# if user_query["spec.snl.about.remarks"] == "MP user submission":
#     print('FYI: only looking at workflows with submission_group_id', submission_group_id)
#     query.update({'metadata.submission_group_id': submission_group_id})
wflows = {}
total_wflows = float(lpad.workflows.find(query).count())
wflows_projection = {'fw_states': 1, 'parent_links': 1, 'links': 1, 'nodes': 1, '_id': 0, 'state': 1}
for state in states:
    state_query = {'state': state}
    state_query.update(query)
    wflows[state] = list(lpad.workflows.find(state_query, wflows_projection))
    nr_wflows = len(wflows[state])
    if nr_wflows > 0:
        if state == 'FIZZLED':
            print([wf['nodes'][0] for wf in wflows[state]])
        wflows_fraction = nr_wflows / total_wflows
        print('{} {} workflows ({:.1f}%)'.format(nr_wflows, state, wflows_fraction*100.))
print(int(total_wflows), 'workflows in total')

list of first fizzled fw_id in each workflow


In [ ]:
def find_root_node(wflow):
    # wflow['nodes'][0] is not necessarily the root node!
    parent_links_keys = wflow['parent_links'].keys()
    for node in wflow['nodes']:
        if str(node) in parent_links_keys:
            continue
        return node
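# minimal sketch of the expected document shape (hypothetical ids): 'parent_links'
# maps each child node (as a string key) to its parents, so the root node is the
# only one that never appears as a key:
# find_root_node({'nodes': [7, 8, 9], 'parent_links': {'8': [7], '9': [8]}})  # -> 7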

In [ ]:
state = 'FIZZLED' # workflow state
rerun_fws = []
fw_ids_state = {}
for wflow in wflows[state]:
    root_fw_id = find_root_node(wflow)
    # descend links until a fizzled firework is found
    fw_id = root_fw_id
    check_states = [state] if state != 'RUNNING' else ['READY', 'RESERVED']
    while 1:
        current_state = wflow['fw_states'][str(fw_id)]
        if current_state == 'RUNNING':
            print(fw_id, 'is RUNNING -> probably need to do `lpad rerun_fws -i {}`'.format(fw_id))
            break
        if current_state in check_states:
            task_type = lpad.fireworks.find_one({'fw_id': fw_id}, {'spec.task_type': 1})['spec']['task_type']
            if task_type not in fw_ids_state:
                fw_ids_state[task_type] = [int(fw_id)]
            else:
                fw_ids_state[task_type].append(int(fw_id))
            alt_state = lpad.fireworks.find_one({'fw_id': fw_id}, {'state': 1, '_id': 0})['state']
            if alt_state == 'RESERVED':
                rerun_fws.append(str(fw_id))
            break
        # if there are multiple children, descend into a non-WAITING firework
        children = wflow['links'][str(fw_id)]
        for child in children:
            if wflow['fw_states'][str(child)] != 'WAITING':
                fw_id = child
                break
        else:
            break  # no non-WAITING child left to descend into -> avoid an infinite loop
if rerun_fws:
    print('lpad rerun_fws -i', ' '.join(rerun_fws))
for k,v in fw_ids_state.items():
    #if 'GGA' not in k: continue
    print(k, v)
#     for fw_id in v:
#         launches = lpad.launches.find({'fw_id': fw_id}, {'launch_dir': 1})
#         for launch in launches:
#             if not 'oasis' in launch['launch_dir']:
#                 print ('\t', fw_id, launch['launch_dir'])

list of incomplete fireworks in RUNNING workflows for fworker query


In [ ]:
fw_ids_incomplete = {}
for wflow in wflows['RUNNING']:
    for fw_id, fw_state in wflow['fw_states'].items():
        if fw_state != 'COMPLETED':
            if fw_state not in fw_ids_incomplete:
                fw_ids_incomplete[fw_state] = [int(fw_id)]
            else:
                fw_ids_incomplete[fw_state].append(int(fw_id))
print(fw_ids_incomplete)

In [ ]:
nodes = []
for d in lpad.workflows.find({'nodes': {'$in':[1370872,1566138,1566120,1566104,1566099,1567504,1567491,1563287,1652717]}}, {'_id': 0, 'nodes': 1}):
    nodes += d['nodes']
print(nodes)

list of first fireworks for fizzled workflows


In [ ]:
# flatten the per-task-type lists of first-fizzled fw_ids collected above
query = {'fw_id': {'$in': [fw_id for fw_ids in fw_ids_state.values() for fw_id in fw_ids]}}
projection = {'fw_id': 1, 'launches': 1, '_id': 0}
fws = list(lpad.fireworks.find(query, projection))
assert len(fws) == len(wflows[state])

list of uniform tasks/fw_ids for projections in BoltzTraP builder (VASP DB insertion reruns)


In [ ]:
with open('task_fw_ids_wBS.json', 'r') as f:
    task_fw_ids_wBS = json.load(f)
print(len(task_fw_ids_wBS), 'tasks already checked for projections')

In [ ]:
vasp_fw_ids = []
for fw_id in task_fw_ids_wBS.values():
    wf = lpad.workflows.find_one({'nodes': fw_id}, {'_id': 0, 'links': 1})
    for daughter in wf['links'][str(fw_id)]:
        fw = lpad.fireworks.find_one(
            {'fw_id': daughter, 'spec.task_type': 'VASP db insertion'}, {'fw_id': 1, '_id': 0}
        )
        if fw:
            vasp_fw_ids.append(fw['fw_id'])
            break
len(vasp_fw_ids)

In [ ]:
lpad.fireworks.update_many(
    {'fw_id': {'$in': vasp_fw_ids}},
    {'$unset' : {'spec._tasks.0.update_duplicates' : 1}}
).raw_result

In [ ]:
print(
    lpad.fireworks.find({'state': 'READY', 'spec.task_type': 'VASP db insertion'}).count(),
    'VASP db insertion tasks ready to run'
)

In [ ]:
with open('task_fw_ids_woBS.json', 'r') as f:
    task_fw_ids_woBS = json.load(f)
print(len(task_fw_ids_woBS), 'tasks without BS')
fws = lpad.fireworks.find(
    {'fw_id': {'$in': list(task_fw_ids_woBS.values())}, 'state': 'COMPLETED'},
    {'launches': 1, 'fw_id': 1, '_id': 0}
)
print('{}/{} fireworks found'.format(fws.count(), len(task_fw_ids_woBS)))

launch directories

for XSEDE: rsync to Mendel from

  • /oasis/projects/nsf/csd436/phuck/garden
  • /oasis/scratch/comet/phuck/temp_project

rsync -avz block_* mendel:/global/projecta/projectdirs/matgen/garden/


In [ ]:
fws_info = {}
no_launches_found = []
launch_dir_owner = {}  # launch_dir -> fw_id that first claimed it

for fw in fws:
    if not fw['launches']:
        no_launches_found.append(fw['fw_id'])
        continue

    launch_id = fw['launches'][-1]
    launch = lpad.launches.find_one({'launch_id': launch_id}, {'launch_dir': 1, '_id': 0})
    launch_dir = launch['launch_dir'].strip()

    if launch_dir in launch_dir_owner:
        # launch_dir already claimed -> record this firework as a duplicate of the owner
        owner = launch_dir_owner[launch_dir]
        fws_info[owner].setdefault('duplicates', []).append(fw['fw_id'])
        continue

    launch_dir_owner[launch_dir] = fw['fw_id']
    fws_info[fw['fw_id']] = {'launch_dir': launch_dir}

if no_launches_found:
    print('launches not found for', len(no_launches_found), 'fireworks')

In [ ]:
nr_duplicates = 0
for fw_id, fw_info in fws_info.items():
    if 'duplicates' in fw_info:
        nr_duplicates += len(fw_info['duplicates'])
print(nr_duplicates, '/', len(fws), 'workflows have duplicate launch_dirs =>',
      len(fws)-nr_duplicates, 'unique launch_dirs')

In [ ]:
def get_dest_blocks(s):
    """split a launch_dir into its destination prefix and the block/launcher directory"""
    a = s.strip().split('/block_')
    if len(a) == 2:
        return [a[0], 'block_' + a[1]]
    a = s.strip().split('/launcher_')
    return [a[0], 'launcher_' + a[1]]
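# e.g. (hypothetical path):
# get_dest_blocks('/oasis/projects/nsf/csd436/phuck/garden/block_2017-01-30-19-20-00-000000')
# -> ['/oasis/projects/nsf/csd436/phuck/garden', 'block_2017-01-30-19-20-00-000000']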

In [ ]:
def parse_launchdirs():
    for fw_id, fw_info in fws_info.items():
        launch_dir = fw_info['launch_dir']
        if not os.path.exists(launch_dir):
            # fall back to the rsynced copy of the block/launcher directory in GARDEN
            dest, block = get_dest_blocks(launch_dir)
            launch_dir = os.path.join(GARDEN, block)
            fw_info['launch_dir'] = launch_dir if os.path.exists(launch_dir) else None
            # 'compgen -G "$i/*.out" >> ~/launchdirs_exist_outfiles.txt; '
            # 'compgen -G "$i/*.error" >> ~/launchdirs_exist_outfiles.txt; '
    print('found {}/{} launch directories'.format(
        sum(bool(fw_info['launch_dir']) for fw_info in fws_info.values()), len(fws_info)
    ))

In [ ]:
parse_launchdirs()

analyze log output of fizzled workflows

scan for error messages


In [ ]:
def get_file_path(extension, dirlist):
    """return the first file in dirlist with the given extension, skipping vasp.out"""
    for fstr in dirlist:
        if os.path.basename(fstr) == 'vasp.out':
            continue
        if os.path.splitext(fstr)[1] == extension:
            return fstr
    return None
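# e.g. (hypothetical listing):
# get_file_path('.error', ['/launch/job.error', '/launch/vasp.out', '/launch/job.out'])
# -> '/launch/job.error'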

In [ ]:
def scan_errors_warnings(f):
    for line in f.readlines():
        line_lower = line.strip().lower()
        if 'error:' in line_lower or 'warning:' in line_lower:
            return line.strip()
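# returns the first stripped line containing 'error:' or 'warning:' (case-insensitive);
# e.g. a line '  WARNING: ionic convergence not reached' (hypothetical) is returned stripped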

In [ ]:
for fw_id, fw_info in tqdm(fws_info.items()):
    fw_info['errors'] = []

    # launch_dir was resolved in parse_launchdirs (None if found neither locally nor in GARDEN)
    local_dir = fw_info['launch_dir']
    if not local_dir or not os.path.exists(local_dir):
        fw_info['errors'].append('launch_dir not found')
        continue
    ls = glob.glob(os.path.join(local_dir, '*'))
    if not ls:
        fw_info['errors'].append('no files found in launch_dir')
        continue

    error_file = get_file_path('.error', ls)
    if error_file is not None:
        # look for a traceback in *.error
        with open(error_file, 'r') as f:
            fcontent = f.read()
            match = re.search('Traceback((.+\n)+)Traceback', fcontent)
            if not match:
                match = re.search('Traceback((.+\n)+)INFO', fcontent)
                if not match:
                    match = re.search('Traceback((.+\n)+)$', fcontent)
            if match:
                fw_info['errors'].append('Traceback' + match.group(1))
            else:
                f.seek(0)  # rewind: f.read() above consumed the file
                scan = scan_errors_warnings(f)
                if scan:
                    fw_info['errors'].append(scan)

    # look into the .out file (may be missing)
    out_file = get_file_path('.out', ls)
    if out_file is not None:
        with open(out_file, 'r') as f:
            scan = scan_errors_warnings(f)
            if scan:
                fw_info['errors'].append(scan)

    # look into vasp.out
    vasp_out = os.path.join(local_dir, 'vasp.out')
    if os.path.exists(vasp_out):
        with open(vasp_out, 'r') as f:
            lines = f.readlines()
        if lines:
            fw_info['errors'].append(' -- '.join(['vasp.out', lines[-1].strip()]))

# FIXME .out and .error for non-reservation mode one directory up

categorize errors


In [ ]:
def add_fw_to_category(fw_id, key, cats):
    if key in cats:
        cats[key].append(fw_id)
    else:
        cats[key] = [fw_id]
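# equivalent to the stdlib idiom: cats.setdefault(key, []).append(fw_id)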

In [ ]:
categories = {}
for fw_id, fw_info in fws_info.items():
    if not fw_info['errors']:
        add_fw_to_category(fw_id, 'no errors parsed', categories)
        continue
    for error in fw_info['errors']:
        if 'launch_dir' in error:
            add_fw_to_category(fw_id, error, categories)
        elif error.startswith('Traceback'):
            exc = ParsedException.from_string(error)
            msg = exc.exc_msg[:50]
            match = re.search('errors reached: (.*)', msg)
            if match:
                msg = match.group(1)
            key = ' -- '.join([exc.exc_type, msg])
            lineno = exc.frames[-1]['lineno']
            key = ' -- '.join([key, os.path.basename(exc.source_file) + '#' + lineno])
            add_fw_to_category(fw_id, key, categories)
        else:
            match = re.search('{(.*)}', error) # matches dictionary
            if match:
                dstr = '{' + match.group(1) + '}'
                dstr = dstr.replace("u'", '"').replace("'", '"')
                dstr = re.sub('{"handler": (.*), "errors"', '{"handler": "\g<1>", "errors"', dstr)
                try:
                    d = json.loads(dstr)
                except ValueError:
                    add_fw_to_category(fw_id, 'looks like dict but could not decode', categories)
                else:
                    if 'handler' in d and 'errors' in d:
                        if '<' in d['handler']:
                            match = re.search('custodian\.vasp\.handlers\.(.*) object', d['handler'])
                            if match:
                                d['handler'] = match.group(1)
                            else:
                                raise ValueError('custodian.vasp.handlers not matched')
                        add_fw_to_category(fw_id, d['handler'], categories)
                    elif 'action' in d:
                        add_fw_to_category(fw_id, 'action', categories)
                    else:
                        add_fw_to_category(fw_id, 'found dict but not handler or action error', categories)
            else:
                add_fw_to_category(fw_id, error, categories)
        break # only look at first error
print_categories(categories)

debugging


In [ ]:
fws_info[1564191]['launch_dir']

In [ ]:
lpad.fireworks.find_one({'fw_id': 1564191}, {'spec._priority': 1, 'state': 1})

In [ ]:
lpad.fireworks.find_one({'fw_id': 1285769}, {'spec._priority': 1, 'state': 1})

In [ ]:
lpad.fireworks.find_one({'fw_id': 1399045}, {'spec._priority': 1, 'state': 1})

Kitchaev submissions: mp-ids


In [ ]:
with open('mpcomplete_kitchaev.json', 'r') as f:
    d = json.load(f)

In [ ]:
def find_last_node(wflow):
    for node in wflow['links'].keys():
        if not wflow['links'][node]:
            return node
    raise ValueError('last node not found!')
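# counterpart to find_root_node: the last node is the only key with no outgoing links,
# e.g. find_last_node({'links': {'7': [8], '8': [9], '9': []}})  # -> '9'
# (hypothetical ids; note the link keys are strings)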

In [ ]:
for cif, info in d.items():
    submission_id = info['submission_id']
    wflow = lpad.workflows.find_one({'metadata.submission_id': submission_id}, wflows_projection)
    if wflow['state'] != 'COMPLETED':
        continue
    fw_id = find_root_node(wflow)
    task_ids = [None]
    while 1:
        launch_id = lpad.fireworks.find_one({'fw_id': fw_id}, {'launches': 1, '_id': 0})['launches'][-1]
        launch = lpad.launches.find_one(
            {'launch_id': launch_id, 'action.stored_data.task_id': {'$exists': 1}},
            {'action.stored_data.task_id': 1, '_id': 0}
        )
        if launch:
            task_ids.append(launch['action']['stored_data']['task_id'])
        children = wflow['links'][str(fw_id)]
        if not children:
            break
        fw_id = children[-1]
    mat = db_jp.materials.find_one({'task_ids': {'$in': task_ids}}, {'task_id': 1, 'task_ids': 1, '_id': 0})
    info['fw_id'] = fw_id
    info['mp_id'] = mat['task_id'] if mat else None  # a material may not exist for this submission
    print(d[cif])
    #break
print('DONE')

In [ ]:
with open('mpcomplete_kitchaev_mpids.json', 'w') as fout:
    json.dump(d, fout)

fw_ids for list of mp-ids to fix DOS offset


In [ ]:
# mp_ids = ['mp-27187','mp-695866','mp-25732','mp-770957','mp-770953','mp-685168','mp-672214','mp-561549','mp-679630',
#           'mp-7323','mp-772665','mp-17895','mp-770566','mp-25772','mp-3009','mp-625837','mp-12797','mp-28588',
#           'mp-770887','mp-776836','mp-5185','mp-24570','mp-723049','mp-657176','mp-25766','mp-19548','mp-625823',
#           'mp-684950','mp-557613','mp-704536','mp-722237','mp-676950']
mp_ids = ['mp-5229']
snlgroup_ids = db_jp.materials.find({'task_ids': {'$in': mp_ids}}).distinct('snlgroup_id')
fw_ids_dosfix = lpad.fireworks.find({"spec.snlgroup_id": {'$in': snlgroup_ids}}).distinct('fw_id')
wflows_dosfix = list(lpad.workflows.find({'nodes': {'$in': fw_ids_dosfix}}))
fw_ids_rerun = []
fw_ids_defuse = []
task_ids = set()
for wflow in wflows_dosfix:
    print('wf:', wflow['nodes'][0])
    fw_ids_uniform = []
    for fw in list(lpad.fireworks.find({'fw_id': {'$in': wflow['nodes']}})):
        if 'Uniform' in fw['spec']['task_type']:
            fw_ids_uniform.append(fw['fw_id'])
        elif 'Boltztrap' in fw['spec']['task_type']:
            fw_ids_defuse.append(fw['fw_id'])
        elif 'VASP db' in fw['spec']['task_type']:
            print(fw['fw_id'], fw['launches'][-1])
            launch = lpad.launches.find_one({'launch_id': fw['launches'][-1]}, {'_id': 0, 'action.stored_data': 1})
            task_ids.add(launch['action']['stored_data'].get('task_id'))
    if not fw_ids_uniform:
        continue
    fw_ids_rerun.append(max(fw_ids_uniform))
print(len(fw_ids_rerun), 'uniform fireworks to rerun')
print('fw_ids_rerun:', fw_ids_rerun)
print('task_ids:', task_ids)

In [ ]:
' '.join(map(str, fw_ids_rerun))

In [ ]:
fw_ids_defuse

In [ ]:
fw_ids_run = []
for wflow in lpad.workflows.find({'nodes': {'$in': fw_ids_rerun}}):
    for fw_id, fw_state in wflow['fw_states'].items():
        if fw_state != 'COMPLETED' and fw_state != 'DEFUSED':
            fw_ids_run.append(fw_id)
','.join(map(str, fw_ids_run))

In [ ]:
' '.join(map(str, fw_ids_defuse))

In [ ]:
fw_ids_dos_offset = []
# NOTE: fw_ids_gga (GGA fw_ids for the affected materials) is assumed to be defined in an earlier session
for doc in list(lpad.workflows.find({'nodes': {'$in': fw_ids_gga}}, {'fw_states': 1, '_id': 0})):
    for fw_id, fw_state in doc['fw_states'].items():
        if fw_state == 'READY' or fw_state == 'WAITING':
            fw_ids_dos_offset.append(fw_id)
len(fw_ids_dos_offset)

In [ ]:
list(map(int, fw_ids_dos_offset))

Projections for BoltzTraP builder: set to READY and update_duplicates


In [ ]:
fw_ids_vasp_db_rerun = []
for fw_id, fw_info in fws_info.items():
    if fw_info['launch_dir']: # GGA Uniform launch_dir exists
        wf = lpad.workflows.find_one({'nodes': fw_id}, {'_id': 0, 'links': 1})
        for daughter in wf['links'][str(fw_id)]:
            fw = lpad.fireworks.find_one(
                {'fw_id': daughter, 'spec.task_type': 'VASP db insertion'}, {'fw_id': 1, '_id': 0}
            )
            if fw:
                fw_ids_vasp_db_rerun.append(fw['fw_id'])
                break
len(fw_ids_vasp_db_rerun)

In [ ]:
lpad.fireworks.update_many(
    {'fw_id': {'$in': fw_ids_vasp_db_rerun}},
    {"$set":{"state":"READY", "spec._tasks.0.update_duplicates": True}}
).raw_result

SNL and Task Collections for atomate transition


In [ ]:
with open('snl_tasks_atomate.json', 'r') as f:
    data = json.load(f)

query = {} if not data else {'task_id': {'$nin': list(data.keys())}}
has_bs_piezo_dos = {'has_bandstructure': True, 'piezo': {'$exists': 1}, 'dos': {'$exists': 1}}
#query.update(has_bs_piezo_dos)
has_bs_dos = {'has_bandstructure': True, 'dos': {'$exists': 1}}
query.update(has_bs_dos)
docs = db_jp.materials.find(query, {'task_ids': 1, '_id': 0, 'task_id': 1, 'snl.snl_id': 1})

for idx,doc in tqdm(enumerate(docs), total=docs.count()):
    mpid = doc['task_id']
    data[mpid] = {'tasks': {}}
    if set(has_bs_piezo_dos.keys()).issubset(query.keys()):
        data[mpid]['tags'] = ['has_bs_piezo_dos']
    if set(has_bs_dos.keys()).issubset(query.keys()):
        data[mpid]['tags'] = ['has_bs_dos']
    for task_id in doc['task_ids']:
        tasks = list(db_vasp.tasks.find({'task_id': task_id}, {'dir_name': 1, '_id': 0}))
        if len(tasks) > 1:
            data[mpid]['error'] = 'found {} tasks'.format(len(tasks))
            continue
        elif not tasks:
            data[mpid]['error'] = 'no task found'
            continue
        dir_name = tasks[0]['dir_name']
        launch_dir = os.path.join(GARDEN, dir_name)
        if not os.path.exists(launch_dir):
            data[mpid]['error'] = '{} not found'.format(dir_name)
            break
        data[mpid]['tasks'][task_id] = launch_dir
    data[mpid]['snl_id'] = doc['snl']['snl_id']
    if not idx%2000:
        with open('snl_tasks_atomate.json', 'w') as f:
            json.dump(data, f)
        #break
        
with open('snl_tasks_atomate.json', 'w') as f:
    json.dump(data, f)