In [ ]:
# import pandas as pd
# import requests
# from requests.auth import HTTPBasicAuth
# import sys
# if sys.version_info[0] < 3:
# from StringIO import StringIO
# else:
# from io import StringIO
# url = 'https://metabolomics.app.labkey.host/_webdav/Metabolite%20Atlas/%40files/Probe_peak_height.tab?contentDisposition=attachment'
# r = requests.get(url, auth=HTTPBasicAuth(u,p))
# df = pd.read_csv(StringIO(r.text),sep='\t')
# df.head()
In [1]:
%matplotlib notebook
import numpy as np
import sys, os
import glob
import json
import copy
from itertools import product
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')
# # sys.path.insert(1,'/global/projecta/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from ast import literal_eval
import metatlas.metatlas_objects as metob
# from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import mzmine_batch_tools_adap as mzm
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp
import collections
import pandas as pd
from matplotlib import pyplot as plt
# import multiprocessing as mp
from distutils.util import strtobool
from datetime import datetime
from shutil import copyfile
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
# pd.set_option("display.max_colwidth", 1000000)
In [1]:
# %matplotlib inline
# paths = ['/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_positive_8C552483/*height.csv',
# '/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_negative_8302CCEF/*height.csv']
# mz = [229.981110,227.966560]
# rt = [4.77,4.77]
# polarity = ['positive','negative']
# rt_range = [0.2,0.2]
# mz_range = [0.01,0.01]
# fig,ax = plt.subplots(nrows=2,ncols=1,figsize=(12,8),sharex=True)
# ax = ax.flatten()
# df_summary = []
# for i,p in enumerate(paths):
# my_file = glob.glob(p)[-1]
# df = pd.read_csv(my_file)
# print(df.shape[0])
# pk_cols = [c for c in df.columns if 'Peak height' in c]
# idx_mz = abs(df['row m/z'] - mz[i])<0.01
# idx_rt = abs(df['row retention time'] - rt[i])<0.2
# idx_mzrt = idx_mz & idx_rt
# print(sum(idx_mz),sum(idx_rt),sum(idx_mzrt))
# s = df.loc[idx_mzrt,pk_cols]
# temp = pd.merge(pd.DataFrame(s.median(axis=1),columns=['sum intensity']),
# df.loc[idx_mzrt,['row m/z','row retention time']],
# left_index=True,right_index=True)
# x = np.asarray([int(vv[0].replace('_Run','').replace('.','')) if len(vv)>0 else 0 for vv in [re.findall("_Run\d+\.",v) for v in s.columns]])
# y = s.values.flatten()
# idx = np.argsort(x).flatten()
# x = x[idx]
# y = y[idx]
# ax[i].plot(x,y,'.')
# if i==1:
# ax[i].set_xlabel('Run Order Index')
# ax[i].set_ylabel('%s mode\nABMBA peak height (au)'%polarity[i])
# temp['polarity'] = polarity[i]
# temp['rt_range'] = rt_range[i]
# temp['mz_range'] = mz_range[i]
# df_summary.append(temp)
# pd.concat(df_summary)
In [2]:
# paths = ['/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_positive_8C552483/*height.csv',
# '/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_negative_8302CCEF/*height.csv']
# mz = [218.12812,216.11357]
# rt = [2.34,2.34]
# polarity = ['positive','negative']
# rt_range = [0.2,0.2]
# mz_range = [0.01,0.01]
# fig,ax = plt.subplots(nrows=1,ncols=2)
# ax = ax.flatten()
# df_summary = []
# for i,p in enumerate(paths):
# my_file = glob.glob(p)[-1]
# df = pd.read_csv(my_file)
# print(df.shape[0])
# pk_cols = [c for c in df.columns if 'Peak height' in c]
# idx_mz = abs(df['row m/z'] - mz[i])<0.01
# idx_rt = abs(df['row retention time'] - rt[i])<0.2
# idx_mzrt = idx_mz & idx_rt
# print(sum(idx_mz),sum(idx_rt),sum(idx_mzrt))
# s = df.loc[idx_mzrt,pk_cols].median(axis=1)
# temp = pd.merge(pd.DataFrame(s,columns=['sum intensity']),
# df.loc[idx_mzrt,['row m/z','row retention time']],
# left_index=True,right_index=True)
# temp['polarity'] = polarity[i]
# temp['rt_range'] = rt_range[i]
# temp['mz_range'] = mz_range[i]
# df_summary.append(temp)
# pd.concat(df_summary)
Given a new batch file, log it as a workflow.
For a workflow, perform and evaluate parameter scanning.
For a workflow, set up the ability to run that job on datasets.
Make a new parameter template from a batch file.
Use a parameter template to search for peaks.
Scan parameters within a range for a template (a minimal sketch of the scan loop follows this note).
An example can be seen here:
https://docs.google.com/spreadsheets/d/17bf-GmYfpYKyqW_attFLli3_Glva2cnCuHlFP6FMP-Y/edit#gid=0
Roll this into the LIMS once the LIMS is stabilized.
You shouldn't have to do this very often.
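Parameter scanning is just the Cartesian product of a few per-parameter value ranges, with one batch file written per combination (the same itertools.product pattern appears further down in this notebook). A minimal sketch, with made-up parameter names and ranges:
In [ ]:
# minimal sketch of a parameter scan; the parameter names and ranges below are hypothetical
import numpy as np
from itertools import product
test_params = {'min_peak_height': np.linspace(1e4, 1e6, 3),
               'mz_tolerance': np.linspace(0.005, 0.02, 4)}
all_variables = sorted(test_params)
combinations = list(product(*(test_params[var] for var in all_variables)))
print(len(combinations))  # 3 * 4 = 12 parameter sets, one batch file each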
In [ ]:
# mzm = reload(mzm)
# # xml_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/batch_files/bootcamp_template_batch_file.xml'
# # xml_file = '/global/homes/b/bpb/Downloads/batch_louie_nomsms_noisotope.xml'
# # xml_file = '/global/homes/b/bpb/Downloads/batch_louie_withmsms_withisotope.xml'
# xml_file = '/global/homes/b/bpb/Downloads/batch_template_2p39.xml'
# csv_file = '/global/homes/b/bpb/Downloads/mzmine_workflow.csv'
# df=mzm.mzmine_xml_to_csv(xml_file=xml_file,csv_file=csv_file)
# df.head()
In [ ]:
# mzm = reload(mzm)
# xml_file = '/global/homes/b/bpb/Downloads/1904_MZMine_2_39_QE_RP_UCSD_JGI_sens_deconv_basicadduct_v4dupl.xml'
# csv_file = '/global/homes/b/bpb/Downloads/mzmine_workflow.csv'
# df=mzm.mzmine_xml_to_csv(xml_file=xml_file,csv_file=csv_file,pop_input_files=False)
# df.head()
In [2]:
dp = reload(dp)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
df = df[df['done']==False]
mzmine_things = df.T.to_dict().values()
print(len(mzmine_things))
# print(mzmine_things[-1])
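Each row of the JOBS sheet becomes one dict in mzmine_things. The columns the cells below rely on (inferred from this notebook, not a complete schema) are listed here as a quick reference:
In [ ]:
# columns referenced later in this notebook: basedir, unique_basename, polarity,
# parameter_sheet, parameter_id, groups, file_filters, filename_substring,
# is_group, mzmine_version, done
sorted(df.columns)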
In [3]:
temp = []
for m in mzmine_things:
temp.append(m['parameter_sheet'])
unique_parameter_sheets_tablenames = pd.unique(temp)
print('There are %d unique parameter sheets'%len(unique_parameter_sheets_tablenames))
unique_parameter_sheets = {}
for s in unique_parameter_sheets_tablenames:
df = dp.get_google_sheet(notebook_name=s,sheet_name='Sheet1')
new_cols = []
for c in df.columns:
if c.startswith('('):
new_cols.append(literal_eval(c))
else:
new_cols.append(c)
df.columns=new_cols
unique_parameter_sheets[s] = df
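The parameter sheets store nested batch-file keys as stringified tuples in the column headers (hence the startswith('(') check above); literal_eval turns them back into real tuples. An illustrative example (the exact header text will differ by sheet):
In [ ]:
from ast import literal_eval
literal_eval("('batch', 'batchstep', 0, 'parameter')")  # -> ('batch', 'batchstep', 0, 'parameter')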
In [4]:
with open('/global/homes/b/bpb/gnps_password.txt','r') as fid:
gnps_password = fid.read().strip()
In [5]:
for i,m in enumerate(mzmine_things):
df = unique_parameter_sheets[m['parameter_sheet']]
d = df[df['param_id']==m['parameter_id']].to_dict(orient='records')[-1]
d.pop('param_id', None)
d = collections.OrderedDict(d)
new_basename = os.path.join(m['basedir'],m['unique_basename'])
#replace the polarity in the crop filter module
for k,v in d.items():
if 'polarity' in k:
d[k] = m['polarity'].upper()
#rename all the output files
for k,v in d.items():
try:
if 'placeholder_filename' in v:
if 'gnps-job' in v:
d[k] = v.replace('placeholder_filename',m['unique_basename'])
else:
d[k] = v.replace('placeholder_filename',new_basename)
except TypeError:
pass
# #replace gnps password
# for k,v in d.items():
# try:
# if 'justapassword' in v:
# d[k] = v.replace('justapassword',gnps_password)
# except TypeError:
# pass
mzmine_things[i]['param_dict_flat'] = d
#This is a good place to make the values strings. The xml maker needs strings later on so might as well do it here
str_d = {}
for k,v in d.items():
str_d[k] = str(v)
#unflatten it
mzmine_things[i]['param_dict_unflat'] = mzm.unflatten(str_d)
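mzm.unflatten rebuilds the nested batch dictionary from the flat, tuple-keyed one; the metatlas helper also turns integer-indexed levels back into lists, which is why 'batchstep' supports .insert() a couple of cells below. A rough sketch of the idea (not the metatlas implementation):
In [ ]:
# rough sketch only: rebuild nesting from tuple keys; the real helper also converts
# integer-indexed levels (0, 1, 2, ...) back into lists
def unflatten_sketch(flat):
    out = {}
    for key, value in flat.items():
        node = out
        for part in key[:-1]:
            node = node.setdefault(part, {})
        node[key[-1]] = value
    return out
unflatten_sketch({('batch', 'batchstep', 0): 'first step', ('batch', 'batchstep', 1): 'second step'})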
In [6]:
mzm = reload(mzm)
for i,m in enumerate(mzmine_things):
mzmine_things[i]['file_list'] = mzm.get_files(m['groups'],m['filename_substring'],m['file_filters'],m['is_group'])
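An optional sanity check before building batch files: how many raw files each job's groups and filters actually matched.
In [ ]:
for m in mzmine_things:
    print(m['unique_basename'], len(m['file_list']))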
In [32]:
# rsync -zavh /project/projectdirs/metatlas/raw_data/ /global/cscratch1/sd/bpb/raw_data
# rsync -zavh /project/projectdirs/metatlas/projects/jgi_projects/ /global/cscratch1/sd/bpb/mzmine_jobs
In [7]:
for i,m in enumerate(mzmine_things):
# add on raw data files
new_basename = os.path.join(m['basedir'],m['unique_basename'])
new_save_step = {'@method': "net.sf.mzmine.modules.projectmethods.projectsave.ProjectSaveAsModule",
'parameter': {'@name':"Project file",'current_file':'%s_workspace.mzmine'%new_basename}}
new_raw_data = {'@method': 'net.sf.mzmine.modules.rawdatamethods.rawdataimport.RawDataImportModule',
'parameter': {'@name': 'Raw data file names',
'file': []}}
new_raw_data['parameter']['file'] = mzmine_things[i]['file_list']
mzmine_things[i]['param_dict_unflat']['batch']['batchstep'].insert(0,new_raw_data)
In [8]:
for i,m in enumerate(mzmine_things):
t2 = mzm.dict_to_etree(mzmine_things[i]['param_dict_unflat'])
mzm.indent_tree(t2)
s2 = mzm.tree_to_xml(t2)
mzmine_things[i]['xml_string'] = s2
In [9]:
for i,m in enumerate(mzmine_things):
mzmine_launcher = mzm.get_latest_mzmine_binary(version=m['mzmine_version'])
mzmine_things[i]['mzmine_launcher'] = mzmine_launcher
new_basename = os.path.join(m['basedir'],m['unique_basename'])
if not os.path.isdir(m['basedir']):
os.mkdir(m['basedir'])
batch_filename = '%s_batch-params.xml'%new_basename
with open(batch_filename,'w') as fid:
fid.write('%s'%m['xml_string'])
mzmine_things[i]['batch_filename'] = batch_filename
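Note that os.mkdir only creates the last path component; if a basedir's parent directories might not exist yet, a slightly more defensive variant (an optional tweak, not what was run here) is:
In [ ]:
# optional: also create intermediate directories
if not os.path.isdir(m['basedir']):
    os.makedirs(m['basedir'])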
In [10]:
# # s = []
# for m in mzmine_things[21]['file_list']:
[(i,g) for i,g in enumerate(os.path.basename(mzmine_things[0]['file_list'][0]).split('_'))]
# print('-'.join(os.path.basename(m).split('_')[9].split('-')[:-1]))
# print('')
# # sorted(s)
Out[10]:
In [11]:
for i,m in enumerate(mzmine_things):
print(i,m['unique_basename'])
new_basename = os.path.join(m['basedir'],m['unique_basename'])
metadata_filename = '%s_%s.tab'%(new_basename,'metadata')
mzml_files = [os.path.basename(f) for f in m['file_list']]
if '20181109TV_' in m['unique_basename']:
attributes = [''.join(f.split('_')[9].split('-')[:-1]) for f in mzml_files]
elif '20161212_TS_BSC_EthAc_extracts' in m['unique_basename']:
attributes = [''.join(f.split('_')[11]) for f in mzml_files]
elif '20181116_TV' in m['unique_basename']:
attributes = [''.join(f.split('_')[11].split('-')[:-1]) for f in mzml_files]
elif '20151019_KZ_RootExu_C18_QExactive' in m['unique_basename']:
attributes = ['_'.join(f.split('_')[8:10]) for f in mzml_files]
elif '_AS_' in m['unique_basename']:
attributes = ['-'.join(f.split('_')[11].split('-')[:]) for f in mzml_files]
elif ('_MO' in m['unique_basename']) or ('candice' in m['unique_basename']):
attributes = ['-'.join(f.split('_')[12].split('-')[:-1]) for f in mzml_files]
elif '20190729_AK_JE_Ekwealor' in m['unique_basename']:
attributes = [f.split('_')[10] for f in mzml_files]
elif '20171214_KBL_JL_C18_Poplar_FungInnoc' in m['unique_basename']:
attributes = [f.split('_')[11] for f in mzml_files]
elif '20190829_AK_NS_ENIGMA_bsc_community_QEHF_Ag679775-924_USHXH0126' in m['unique_basename']:
attributes = [f.split('_')[10] for f in mzml_files]
else:
attributes = [f.split('_')[12] for f in mzml_files]
# except:
# attributes = ['no group' for f in mzml_files]
print(len(pd.unique(attributes)),attributes)
pd.DataFrame(data={'filename':mzml_files,'ATTRIBUTE_sampletype':attributes}).set_index('filename',drop=True).to_csv(metadata_filename,sep='\t')
print('')
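The elif ladder above hard-codes a token position for each project, which is brittle. A hypothetical helper that indexes the token by position but falls back gracefully on short filenames could replace it (sketch only; the field position is still project-specific):
In [ ]:
# hypothetical helper, not part of the original workflow
def attribute_from_filename(filename, field=12, default='no group'):
    tokens = os.path.basename(filename).split('_')
    return tokens[field] if len(tokens) > field else default
# attributes = [attribute_from_filename(f) for f in mzml_files]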
In [12]:
for i,m in enumerate(mzmine_things):
print('%s %s'%(m['mzmine_launcher'],m['batch_filename']))
In [13]:
# s = json.dumps([b['batch_filename'] for b in mzmine_things])
d = {"jgi_mzmine.inputs":[b['batch_filename'] for b in mzmine_things]}
with open('/global/homes/b/bpb/mzmine_job_adap.json','w') as fid:
fid.write('%s'%json.dumps(d))
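A quick, optional check that the JSON handed to the jgi_mzmine wrapper has the expected shape (a single key mapping to one batch XML path per job):
In [ ]:
with open('/global/homes/b/bpb/mzmine_job_adap.json','r') as fid:
    check = json.load(fid)
print(list(check.keys()))               # ['jgi_mzmine.inputs']
print(len(check['jgi_mzmine.inputs']))  # should equal len(mzmine_things)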
In [14]:
mzm = reload(mzm)
print(mzm.SLURM_HEADER)
In [ ]:
In [15]:
mzm = reload(mzm)
for i,m in enumerate(mzmine_things):
new_basename = os.path.join(m['basedir'],m['unique_basename'])
sbatch_filename = '%s_%s.sbatch'%(new_basename,'sbatch')
s = '%s %s'%(m['mzmine_launcher'],m['batch_filename'])
sbatch_header = mzm.SLURM_HEADER
with open(sbatch_filename,'w') as fid:
fid.write('%s\n%s\n'%(mzm.SLURM_HEADER.replace('slurm',new_basename),s))
print('sbatch %s'%sbatch_filename)
In [ ]:
# %system sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_positive_BD1728C6/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_positive_BD1728C6_sbatch.sbatch
# %system sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_negative_E828977F/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_negative_E828977F_sbatch.sbatch
In [ ]:
%system squeue -u bpb
In [ ]:
# import requests
# task_id = '0d7e35bb071b4679996580af5ef0abe7'
# positive_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=gnps_molecular_network_graphml/" % (task_id)
# positive_graphml = os.path.join('/global/homes/b/bpb/Downloads/', "candice_combo.graphml")
# local_file = open(positive_graphml, "w")
# local_file.write(requests.get(positive_url).text)
# local_file.close()
In [16]:
sys.path.insert(1,'/global/homes/b/bpb/repos/GNPS_quickstart/')
import uuid
execfile('/global/homes/b/bpb/repos/GNPS_quickstart/util.py')
def copy_and_submit_to_gnps(m,override=False):
print(m['unique_basename'])
new_basename = os.path.join(m['basedir'],m['unique_basename'])
taskid_filename = '%s_%s.txt'%(new_basename,'gnps-uuid')
if (os.path.isfile(taskid_filename)) & (override==False):
print('Already submitted')
with open(taskid_filename,'r') as fid:
print(fid.read())
return False
metadata_filename = '%s_%s.tab'%(new_basename,'metadata')
if os.path.isfile(metadata_filename):
upload_to_gnps(metadata_filename,m['unique_basename'],'samplemetadata','bpbowen',gnps_password)
else:
print('METADATA NOT FOUND %s'%metadata_filename)
return False
mgf_filename = '%s_%s.mgf'%(new_basename,'MSMS')
if os.path.isfile(mgf_filename):
upload_to_gnps(mgf_filename,m['unique_basename'],'featurems2','bpbowen',gnps_password)
else:
print('SPECTRA NOT FOUND %s'%mgf_filename)
return False
quant_filename = '%s_%s.csv'%(new_basename,'peak-area')
if os.path.isfile(quant_filename):
upload_to_gnps(quant_filename,m['unique_basename'],'featurequantification','bpbowen',gnps_password)
else:
print('QUANT NOT FOUND %s'%quant_filename)
return False
task_id = launch_GNPS_featurenetworking_workflow( m['unique_basename'],m['unique_basename'],
'bpbowen', gnps_password, 'ben.bowen@gmail.com',
'MZMINE2', [], 'HIGHRES',
os.path.basename(metadata_filename),
os.path.basename(mgf_filename),
os.path.basename(quant_filename),
uuid.uuid1()) #I don't think this uuid does anything
if task_id is not None:
with open(taskid_filename,'w') as fid:
fid.write('https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=%s'%task_id)
# SUCCESS!!!
return task_id
else:
return False
for i,m in enumerate(mzmine_things):
taskid = copy_and_submit_to_gnps(m,override=False)
print(i,taskid)
print('')
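The status URL written to each *_gnps-uuid.txt file can be polled later; a minimal, optional check using requests:
In [ ]:
import requests
for m in mzmine_things:
    taskid_filename = '%s_%s.txt' % (os.path.join(m['basedir'], m['unique_basename']), 'gnps-uuid')
    if os.path.isfile(taskid_filename):
        with open(taskid_filename, 'r') as fid:
            url = fid.read().strip()
        print(url, requests.get(url).status_code)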
Positive- and negative-mode networks are merged with https://github.com/mwang87/MergePolarity/blob/master/merge_polarity.py
In [51]:
df = pd.DataFrame(mzmine_things)
df = df[['basedir','unique_basename','polarity']]
# df.drop(columns=['file_list','file_filters','groups','param_dict_flat','param_dict_unflat','xml_string','mzmine_launcher'],inplace=True)
for i,m in df.iterrows():
new_basename = os.path.join(m['basedir'],m['unique_basename'])
taskid_filename = '%s_%s.txt'%(new_basename,'gnps-uuid')
if (os.path.isfile(taskid_filename)):
with open(taskid_filename,'r') as fid:
df.loc[i,'task_id'] = fid.read().split('=')[-1]
df.loc[i,'stripped_basename'] = '_'.join(m['unique_basename'].replace('positive','').replace('negative','').split('_')[:-1])
# df.loc[i,'stripped_basename'] = '_'.join(m['unique_basename'].replace('Pos','').replace('Neg','').split('_')[:-1])
df.fillna('',inplace=True)
df
Out[51]:
In [53]:
df = df.loc[2:]
In [54]:
d = {}
print('module load python')
for i in df['stripped_basename'].unique():
if len(df[df['stripped_basename']==i].index)>1:
d[i] = {}
d[i]['positive'] = {}
d[i]['negative'] = {}
for j in df[df['stripped_basename']==i].index:
d[i][df['polarity'][j]]['job_id'] = df['task_id'][j]
d[i][df['polarity'][j]]['basedir'] = df['basedir'][j]
d[i][df['polarity'][j]]['unique_basename'] = df['unique_basename'][j]
# mydir = '/global/cscratch1/sd/bpb/mzmine_jobs/%s'%i
if (d[i]['positive']['job_id']!='') & (d[i]['negative']['job_id']!=''):
print('#%s'%i)
# if not os.path.isdir(mydir):
# os.mkdir(mydir)
pos_outfile = os.path.join(d[i]['positive']['basedir'],'positive_network.graphml')
neg_outfile = os.path.join(d[i]['negative']['basedir'],'negative_network.graphml')
merged_outfile = os.path.join(d[i]['positive']['basedir'],'%s_merged_network.graphml'%d[i]['positive']['unique_basename'])
command = 'python /global/homes/b/bpb/repos/MergePolarity/merge_polarity.py'
# --positive-graphml %s --negative-graphml %s
print('%s --output-graphml %s --positive-network-task %s --negative-network-task %s --RT_TOLERANCE 0.15'%(
command,
merged_outfile,
# pos_outfile,
# neg_outfile,
d[i]['positive']['job_id'],
d[i]['negative']['job_id']))
In [55]:
for i,m in enumerate(mzmine_things):
print("changing permissions for %s"%m['basedir'])
os.system("chgrp -R metatlas %s"%m['basedir'])
os.system("chmod o+rx /global/cscratch1/sd/bpb")
os.system("chmod o+rx /global/cscratch1/sd/bpb/mzmine_jobs")
Out[55]:
In [56]:
for i,m in enumerate(mzmine_things):
print("%s"%m['basedir'])
In [ ]:
def group_consecutive(data,stepsize=10.0,do_ppm=True):
"""
split a numpy array where consecutive elements are greater than stepsize
can be ppm or value
if idx is not None then returns indices of elements otherwise returns values
The main use case is an unsorted list of m/zs as "data" and optionally
their numerical index as "idx". Typically a user would want to retrieve
the group indices in the original order that they provided their list
of m/zs.
usage:
"""
if type(data) is np.ndarray:
# cool way to sort and unsort array:
idx_sorted = data.argsort()
sort_w_unsort = np.column_stack((np.arange(idx_sorted.size),idx_sorted))
# sort_w_unsort[:,0] are the original indices of data
# sort_w_unsort[:,1] are the sorted indices of data
data_sorted = data[sort_w_unsort[:,1]]
# np.argsort(sort_w_unsort[:,1]) returns the indices to map the sorted data back to the original
# data_unsorted = data_sorted[np.argsort(sort_w_unsort[:,1])]
if do_ppm:
d = np.diff(data_sorted) / data_sorted[:-1] * 1e6
else:
d = np.diff(data_sorted)
# make groups of the array
data_groups = np.split(data_sorted, np.where(d > 2.0*stepsize)[0]+1)
# replace each group of values with group index
for i,data_slice in enumerate(data_groups):
data_groups[i] = data_groups[i]*0 + i
group_indices = np.concatenate(data_groups)
# reorder the group indices
group_indices = group_indices[np.argsort(sort_w_unsort[:,1])]
return group_indices.astype(int)#
else:
print('not a numpy array. convert it and sort it first')
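# usage example for group_consecutive (illustrative m/z values): consecutive sorted values closer
# than 2x stepsize ppm share a group index, returned in the original, unsorted order
example_mzs = np.array([100.0000, 100.0005, 200.0000, 100.0010, 200.0010])
print(group_consecutive(example_mzs, stepsize=10.0, do_ppm=True))  # [0 0 1 0 1]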
def map_mzgroups_to_data(mz_atlas,mz_group_indices,mz_data):
"""
mz_atlas: m/z values from atlas
mz_group_indices: integer index from "group_consecutive"
mz_data: m/z values from raw data
"""
from scipy import interpolate
f = interpolate.interp1d(mz_atlas,np.arange(mz_atlas.size),kind='nearest',bounds_error=False,fill_value='extrapolate') #get indices of all mz values in the atlas
    idx = f(mz_data) # interpolate to find the nearest atlas m/z for each m/z in the data
idx = idx.astype('int')
# d = 1e6#np.abs(mz_data - mz_atlas[idx]) / mz_data * 1.0e6
# output_mat = np.column_stack((d,))
return mz_group_indices[idx]#output_mat
# "atlas" and "msdata" are assumed to already exist here as DataFrames: both need an 'mz' column;
# the atlas also needs 'group_index', 'ppm_tolerance', 'rt_min', 'rt_max', 'extra_time', and the data needs 'rt'
g = map_mzgroups_to_data(atlas['mz'].values[:],
atlas['group_index'].values[:],
msdata['mz'].values[:])
msdata['group_index'] = g#[:,1]
# msdata['group_index_ppm'] = g[:,0]
df = pd.merge(atlas,msdata,left_on='group_index',right_on='group_index',how='outer',suffixes=('_atlas','_data'))
#grab all datapoints including "extra"
mz_condition = abs(df['mz_data']-df['mz_atlas'])/df['mz_atlas']*1e6<df['ppm_tolerance']
rt_min_condition = df['rt']>=(df['rt_min']-df['extra_time'])
rt_max_condition = df['rt']<=(df['rt_max']+df['extra_time'])
df = df[(mz_condition) & (rt_min_condition) & (rt_max_condition)]
In [ ]:
# %system cat /project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37_batch-params.xml
In [ ]:
# with open('/project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37_batch-params.xml','r') as fid:
# s = fid.read()
# print(s)
In [ ]:
os.path.isfile(new_basename)
In [ ]:
m = mzmine_things[0]
with open('/global/homes/b/bpb/Downloads/gnps_section_mzmine_batch.xml','r') as fid:
d = fid.read()
new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'msms.mgf'))
d = d.replace('/global/homes/b/bpb/Downloads/gnps_upload.mgf',new_basename)
print(d)
new_batch = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'gnps-submission.xml'))
with open(new_batch,'w') as fid:
fid.write(d)
mzmine_launcher = mzm.get_latest_mzmine_binary(version=m['mzmine_version'])
print('%s %s'%(mzmine_launcher,new_batch))
In [ ]:
for i,m in enumerate(mzmine_things):
print('%s %s'%(m['mzmine_launcher'],m['batch_filename']))
In [ ]:
%system cat "/project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_13EEF8DE/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_13EEF8DE_peak-height.csv" | wc -l
In [ ]:
# df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
# mzmine_things = df.T.to_dict().values()
import os.path, time
for i,m in enumerate(mzmine_things):
# new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'peak-height.csv'))
# new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'msms.mgf'))
new_basename = os.path.join(m['basedir'],'%s%s'%(m['unique_basename'],'.err'))
print(i,os.path.isfile(new_basename),new_basename)
print("last modified: %s" % time.ctime(os.path.getmtime(new_basename)))
with open(new_basename,'r') as fid:
s = fid.read()
print(s[-100:])
print('')
# df = pd.read_csv(new_basename)
# print(df.shape)
# print(os.listdir(m['basedir']))
In [ ]:
filename
In [ ]:
In [ ]:
mzmine_things[0]['file_list']
In [ ]:
from metatlas.helpers import spectralprocessing as sp  # provides peakdet, used below to centroid each spectrum
def make_new_mgf_text(t):
data = [(tt.strip().split('\n')[:7],tt.strip().split('\n')[7:][:-1]) for tt in t.split('END IONS')]
mgf_str = ''
for data_slice in data:
spectrum = [(float(dd.split(' ')[0]),float(dd.split(' ')[-1])) for dd in data_slice[-1]]
peak_max,peak_min = sp.peakdet([s[1] for s in spectrum],10.0,[s[0] for s in spectrum])
header = '\n'.join(data_slice[0])
spec_str = '\n'.join(['%5.4f %5.1f'%(p[0],p[1]) for p in peak_max])
mgf_str = '%s\n\n%s\n%s\nEND IONS\n'%(mgf_str,header,spec_str)
return mgf_str
for i,m in enumerate(mzmine_things):
filename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'MSMS.mgf'))
with open(filename,'r') as fid:
t = fid.read()
mgf_text = make_new_mgf_text(t)
new_filename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'MSMS-centroid.mgf'))
    print(new_filename)
with open(new_filename,'w') as fid:
fid.write('%s'%mgf_text)
In [ ]:
%matplotlib inline
from metatlas.helpers import spectralprocessing as sp
peak_max,peak_min = sp.peakdet([s[1] for s in spectrum],10.0,[s[0] for s in spectrum])
fig,ax = plt.subplots()
ax.set_title(len(peak_max))
ax.vlines([x[0] for x in peak_max],[0.0 for x in peak_max],[x[1] for x in peak_max])
In [ ]:
mzm = reload(mzm)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
df = df[df['done']==False]
mzmine_things = df.T.to_dict().values()
mz_tolerance = 0.01
rt_tolerance = 0.2
true_pos_filename = '/global/homes/b/bpb/Downloads/mana_bootcamp_true_positives.xlsx'
# base_path = '/project/projectdirs/metatlas/projects/jgi_projects/'
base_path = '/global/cscratch1/sd/bpb/mzmine_jobs/'
# project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D'
# project_path = '20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C'
# project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_positive_2800C348'
project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_positive_4E39E174'
# project_path = '20190501_bootcamp_PeterAndeer'
# project_path = '20190501_bootcamp_BrianDeFelice'
feature_table_extension = 'height.csv' # for mzmine files
rt_column = 'row retention time'
mz_column = 'row m/z'
headerrows = 0
sep=','
n=4
# feature_table_extension = '.txt' # for mzmine files
# rt_column = 'Average Rt(min)'
# mz_column = 'Average Mz'
# headerrows = 3
# sep='\t'
data = []
for i,m in enumerate(mzmine_things):
bd = m['basedir'].split('/')[-1]
if 'HILIC' in bd.upper():
chromatography = 'hilic'
else:
chromatography = 'reverse phase'
istd,bio,total = mzm.summarize_results(n,true_pos_filename,base_path,bd,feature_table_extension,rt_column,mz_column,headerrows,sep,mz_tolerance,rt_tolerance)
d = {'job':bd,'type':'istd','chromatography':chromatography,'job_type':'%s_%s'%(bd,'istd'),'total':total}
for j,h in enumerate(istd):
d['num_%d'%j] = h
data.append(d)
d = {'job':bd,'type':'bio','chromatography':chromatography,'job_type':'%s_%s'%(bd,'bio'),'total':total}
for j,h in enumerate(bio):
d['num_%d'%j] = h
data.append(d)
df = pd.DataFrame(data)
df.set_index('job_type',drop=True,inplace=True)
df.head()
# print(bio)
In [ ]:
df.index = [i[-30:-5] for i in df.index]
In [ ]:
%matplotlib inline
mzm = reload(mzm)
width = 0.4
fig,ax = plt.subplots(nrows=2,ncols=2,figsize=(21,15))
cols = [c for c in df.columns if 'num_' in c]
df_f = df[(df['type']=='bio') & (df['chromatography']=='reverse phase')]
df_f[cols].plot(kind='bar', sharex=False, sharey=False, stacked=True,ax=ax[0,0],width=width,position=0)
ax[0,0].set_yticks(np.linspace(0,1,11))
ax[0,0].grid(True)
ax[0,0].legend(loc='upper center',fontsize=20)
ax[0,0].set_title('Bio / Reverse-Phase')
ax[0,0].set_ylabel('Percent Correct')
ax2 = ax[0,0].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.legend(loc='lower center',fontsize=20)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
# ax2.set_xticklabels([i for i in df_f.index])
df_f = df[(df['type']=='bio') & (df['chromatography']=='hilic')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[0,1], width=width,position=0)
ax[0,1].set_yticks(np.linspace(0,1,11))
ax[0,1].grid(True)
ax[0,1].get_legend().remove()
ax[0,1].set_title('Bio / HILIC')
ax2 = ax[0,1].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[0,1].legend(loc='upper center',fontsize=20)
df_f = df[(df['type']=='istd') & (df['chromatography']=='reverse phase')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[1,0], width=width,position=0)
ax[1,0].set_yticks(np.linspace(0,1,11))
ax[1,0].grid(True)
ax[1,0].get_legend().remove()
ax[1,0].set_title('ISTD / Reverse-Phase')
ax2 = ax[1,0].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[1,0].legend(loc='upper center',fontsize=20)
df_f = df[(df['type']=='istd') & (df['chromatography']=='hilic')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[1,1], width=width,position=0)
ax[1,1].set_yticks(np.linspace(0,1,11))
ax[1,1].grid(True)
ax[1,1].get_legend().remove()
ax[1,1].set_title('ISTD / HILIC')
ax2 = ax[1,1].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[1,1].legend(loc='upper center',fontsize=20)
plt.tight_layout()
In [ ]:
mzm = reload(mzm)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
# mzmine_things = df.T.to_dict().values()
job = df.loc[44,:]
job
In [ ]:
os.listdir(job['basedir'])
In [ ]:
new_basename = os.path.join(job['basedir'],job['unique_basename'])
peak_height_filename = '%s_%s.csv'%(new_basename,'peak-height')
metadata_filename = '%s_%s.tab'%(new_basename,'metadata')
df = pd.read_csv(peak_height_filename)
mzml_files = [c.replace(' Peak height','') for c in df.columns if 'Peak height' in c]
pd.DataFrame(data={'filename':mzml_files,'ATTRIBUTE_sampletype':[m.split('_')[12] for m in mzml_files]}).set_index('filename',drop=True).to_csv(metadata_filename,sep='\t')
In [ ]:
# with open('/project/projectdirs/metatlas/projects/mzmine_parameters/batch_files/bootcamp_template_batch_file.xml','r') as fid:
# xml_str = fid.read()
# d = mzm.xml_to_dict(xml_str)
# t = mzm.dict_to_etree(d)
# mzm.indent_tree(t)
# s1 = mzm.tree_to_xml(t)
# df = mzm.flatten(d,enumerate_types=(list,))
# #sorted(['.'.join(tuple(str(x) for x in tup)) for tup in df.keys()])
# df2 = mzm.unflatten(df)
# # t2 = mzm.dict_to_etree(xml_2)
# # mzm.indent_tree(t2)
# # s2 = mzm.tree_to_xml(t2)
# # t = dict_to_etree(new_d)
# # indent_tree(t)
# # xml_batch_str = tree_to_xml(t,filename=task.input_xml)
# # job_runner = '%s %s'%(task.mzmine_launcher,task.input_xml)
In [ ]:
df = dp.get_google_sheet(notebook_name='ADAP Bootcamp MZMine Untargeted Job Submission',sheet_name='min_max')
s = df.iloc[0]
test_params = {y:x for x,y in s.to_dict().iteritems() if len(x)>0 }
for k,v in test_params.iteritems():
v2 = literal_eval(v)
# test_params[k] = {'min':v2[0],'max':v2[1],'n':v2[2],'vals':np.linspace(v2[0],v2[1],num=v2[2])}
test_params[k] = np.linspace(v2[0],v2[1],num=v2[2])
df.columns = df.iloc[0]
df = df.iloc[1:]
# do ast.literal_eval on lists
literal_cols = ['file_filters','groups']
for col in literal_cols:
df[col] = df[col].apply(literal_eval)
df['is_group'] = [bool(strtobool(a)) for a in df['is_group']]
df['remove_isotopes'] = [bool(strtobool(a)) for a in df['remove_isotopes']]
df['peak_with_msms'] = [bool(strtobool(a)) for a in df['peak_with_msms']]
numeric_columns = [u'min_rt', u'max_rt', u'min_peak_duration', u'max_peak_duration', u'smoothing_scans', u'min_num_scans', u'group_intensity_threshold',
                   u'min_peak_height', u'mz_tolerance', u'peak_to_valley_ratio', u'rt_tol_multifile',
                   u'rt_tol_perfile', u'ms1_noise_level', u'ms2_noise_level', u'chromatographic_threshold',
                   u'search_for_minimum_rt_range', u'minimum_relative_height', u'mz_range_scan_pairing',
                   u'rt_range_scan_pairing', 'min_peaks_in_row', 'gapfill_intensity_tolerance',
                   'adap_sn_threshold', 'adap_peak_width_mult', 'adap_area_threshold', 'adap_rt_wavelet_min', 'adap_rt_wavelet_max']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce', axis=1)
mzmine_things = df.T.to_dict().values()
In [ ]:
# groups = ['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain']
mzm = reload(mzm)
current_month = datetime.now().strftime('%m') # e.g. '02' (zero padded)
current_year = datetime.now().strftime('%Y') # e.g. '2018'
year_month_dir = '%s%s'%(current_year,current_month)
for i,m in enumerate(mzmine_things):
mzmine_things[i]['files'] = mzm.get_files(m['groups'],m['filename_substring'],m['file_filters'],m['is_group'])
print(m['basename'],m['polarity'],len(mzmine_things[i]['files']))
new_dir = os.path.join(mzm.DATA_PATH,year_month_dir)
if not os.path.isdir(new_dir):
os.mkdir(new_dir)
for j,f in enumerate(mzmine_things[i]['files']):
new_file_path = os.path.join(new_dir,os.path.basename(f))
if not os.path.isfile(new_file_path):
copyfile(f, new_file_path)
mzmine_things[i]['files'][j] = new_file_path
In [ ]:
all_variables = sorted(test_params)
combinations = list(product(*(test_params[var] for var in all_variables)))
In [ ]:
#For each mzmine thing, and for each parameter, make a new mzmine thing with that parameter
new_mzmine_things = []
for i,m in enumerate(mzmine_things): #iterate across hilic and rp
counter = 0
for values in combinations: #iterate through all the possible combinations
new_mzmine_things.append(copy.deepcopy(m)) #append all the static info from an mzmine_thing
new_mzmine_things[-1]['basename'] = '%d_%s'%(counter,m['basename']) #give the output files a unique filename
new_mzmine_things[-1]['unique_basename'] = '%d_%s'%(counter,m['unique_basename']) #this might not be used. I can't remember.
counter += 1
for j,var in enumerate(all_variables): #iterate across the mzmine variables that need to be changed
new_mzmine_things[-1][var] = values[j] #use the new value for a variable
In [ ]:
import glob as glob
status = []
for i,params in enumerate(new_mzmine_things):
project_name = '%s_%s'%(params['unique_basename'],params['polarity'])
results_dir = os.path.join(params['basedir'],project_name)
peak_height_file = os.path.join(params['basedir'],'%s_%s_peak_height.csv'%(params['basename'],params['polarity']))
# see if final mzmine feature table exists.
job_done_1 = os.path.isfile(peak_height_file)
# if mzmine workspace exists store this so it can be removed from job script for reruns
new_mzmine_things[i]['mzmine_done'] = job_done_1
# see if sheets/peak_height.tab exists
job_done_2 = False
identification_folder = os.path.join(results_dir,'identification')
num_id_figures = 0
if os.path.isfile(peak_height_file):
with open(peak_height_file,'r') as fid:
peaks = fid.read()
num_peaks = len(peaks.split('\n'))-1
num_id_figures = len(glob.glob(os.path.join(identification_folder,'*.pdf')))
if num_peaks > 0:
job_done_2 = True
else:
num_peaks = 0
# if metatlas is done note this too, but these jobs shouldn't be rerun (since they are done)
new_mzmine_things[i]['metatlas_done'] = job_done_2
# see if the filtered mzmine workspace has been created
job_done_3 = os.path.isfile(os.path.join(results_dir,'%s.mzmine'%project_name))
# this won't change the job script but completes the book-keeping
new_mzmine_things[i]['small_mzmine_done'] = job_done_3
status.append({'0_basedir':params['basedir'].split('/')[-1],
'1_job':'%s_%s'%(params['unique_basename'],params['polarity']),
'2_features_done':job_done_1,
'3_num_features':num_peaks})
pd.DataFrame(status).to_csv('/global/homes/b/bpb/Downloads/mzmine_jobs_and_status.csv',index=False)
# pd.DataFrame(status)
In [ ]:
# new_mzmine_things=[]
# print(len(mzmine_things))
# for i,params in enumerate(mzmine_things):
# # see if final mzmine file exists. this will happen even if peak_height.tab is missing
# if (not params['metatlas_done']) or (not params['small_mzmine_done']) or (not params['mzmine_done']):
# new_mzmine_things.append(params)
# mzmine_things = new_mzmine_things
# print(len(mzmine_things))
In [ ]:
# for m in pd.unique([m['basedir'] for m in new_mzmine_things]):
# print('rm -r %s'%m)
In [ ]:
# for m in pd.unique([m['basedir'] for m in new_mzmine_things]):
# print('chmod -R 770 %s'%m)
# print('chgrp -R metatlas %s'%m)
In [ ]:
In [ ]:
mzm = reload(mzm)
def remove_workspace_from_job(job_filename):
with open(job_filename,'r') as fid:
s = fid.read()
split_str = '</batchstep>\n'
s = s.split(split_str)
idx = [i for i,ss in enumerate(s) if 'peak_area' in ss][-1]
s = s[:idx]
s.append('</batch>\n')
s = split_str.join(s)
with open(job_filename,'w') as fid:
fid.write('%s'%s)
with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','w') as bcf:
for i,params in enumerate(new_mzmine_things):
#setup all the job scripts when job actually runs it will copy params to params-used
job_script = mzm.create_job_script(params)
# wipe out the peak area, mgf, and workspace export
with open(job_script,'r') as fid:
s = fid.read()
job_filename = s.split(' ')[-1].strip()
remove_workspace_from_job(job_filename)
#dump the parameters to a json file
with open( os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity'])), 'w') as fid:
fid.write(json.dumps(params))
with open(job_script,'r') as fid:
s = fid.read()
bcf.write('%s\n'%s.split('\n')[-2])
# print('sbatch %s'%job_script)
In [ ]:
%system chgrp -R metatlas mzmine_jobs/ADAP_20190401_JGI_Metabolomics_Bootcamp_*
In [ ]:
24000*10/4/200/60
In [ ]:
import time
d = '/global/cscratch1/sd/jaws/prod/cromwell-executions/jgi_mzmine/d6a9b53b-fe5b-4782-b7c0-d019573eb9e7/call-run_mzmine'
for i in range(0,17):
stat = os.stat('%s/shard-%d/execution/stderr.submit'%(d,i))
print(time.ctime(stat.st_mtime))
In [ ]:
import json
In [ ]:
with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','r') as bcf:
s = bcf.read()
job_list = s.split('/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori.sh ')
job_list = job_list[1:]
job_list = [j.strip() for j in job_list]
n = 6000
# using list comprehension
final = [job_list[i * n:(i + 1) * n] for i in range((len(job_list) + n - 1) // n )]
# print (final)
for i,small_list in enumerate(final):
d = {'jgi_mzmine.inputs':small_list}
with open('/global/homes/b/bpb/adap_job_array_%d.json'%i,'w') as fid:
json.dump(d,fid)
In [ ]:
len(final)
In [ ]:
import glob
files = glob.glob('/global/cscratch1/sd/bpb/mzmine_jobs/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
files
In [ ]:
In [ ]:
In [ ]:
In [ ]:
all_batch_files = glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
all_batch_files = all_batch_files + glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D/*.csv')
all_batch_files = sorted(all_batch_files)
all_batch_files = [os.path.basename(a).replace('_peak_height.csv','.xml') for a in all_batch_files]
print('%s\n%s'%(all_jobs[0].split(' ')[-1],all_batch_files[0]))
os.path.basename(all_jobs[0].split(' ')[-1]) in all_batch_files
In [ ]:
In [ ]:
In [ ]:
len(all_completed_jobs)
In [ ]:
len(all_completed_jobs)
In [ ]:
# break the remaining jobs into chunks and write one sbatch file per chunk
job_template = """#!/bin/bash
#SBATCH --qos=regular
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --constraint=haswell
#SBATCH --error="gp.err"
#SBATCH --output="gp.out"
export MALLOC_ARENA_MAX=1
source jobs.sh
"""
# job_template = """#!/bin/bash
# #SBATCH -t 00:15:00
# #SBATCH -C haswell
# #SBATCH -N 1
# #SBATCH -q realtime
# #SBATCH -A m1541
# #SBATCH --exclusive
# source jobs.sh
# """
# job_template="""#!/bin/bash
# #SBATCH -N 4
# #SBATCH -C haswell
# #SBATCH --exclusive
# #SBATCH -q realtime
# #SBATCH --error="gp.err"
# #SBATCH --output="gp.out"
# #SBATCH -A m1541
# #SBATCH -t 04:00:00
# #SBATCH -L project
# # cori specific tells it to not allocate memory on a per thread basis Java
# export MALLOC_ARENA_MAX=1
# module load taskfarmer
# export THREADS=4
# runcommands.sh jobs.sh
# """
# job_template = """#!/bin/bash
# #SBATCH -N 20 -c 64
# #SBATCH --error="gp.err"
# #SBATCH -L project
# #SBATCH --output="gp.out"
# #SBATCH --qos=premium
# #SBATCH -A m1541
# #SBATCH --constraint=haswell
# #SBATCH -L project
# #SBATCH -t 24:00:00
# export HDF5_USE_FILE_LOCKING=FALSE
# module load taskfarmer
# export THREADS=4
# runcommands.sh jobs.sh"""
with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','r') as bcf:
s = bcf.read()
# remove jobs that have been done
all_jobs = s.split('\n')
all_completed_jobs = glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
all_completed_jobs = all_completed_jobs + glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D/*.csv')
all_completed_jobs = sorted(all_completed_jobs)
all_completed_jobs = [os.path.basename(a).replace('_peak_height.csv','.xml') for a in all_completed_jobs]
filtered_jobs = [x for x in all_jobs if os.path.basename(x.split(' ')[-1]) not in all_completed_jobs]
len(all_jobs),len(filtered_jobs),len(all_completed_jobs)
# filtered_jobs = filtered_jobs[:500]
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in xrange(0, len(l), n):
yield l[i:i + n]
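# e.g. list(chunks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 4)) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]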
chunked_jobs = list(chunks(filtered_jobs,120))
print('rm -r /global/homes/b/bpb/bootcamp/jobs/job_chunk_*')
with open('/global/homes/b/bpb/bootcamp/knl_job_file.sh','w') as knl_file:
for i,cj in enumerate(chunked_jobs):
d = '/global/homes/b/bpb/bootcamp/jobs/job_chunk_%d'%i
if not os.path.isdir(d):
os.mkdir(d)
job_file = os.path.join(d,'jobs_%d.sh'%i)
sbatch_file = os.path.join(d,'job-%d.sbatch'%i)
# job_string = '\n'.join(cj)
        small_chunks = list(chunks(cj,4)) # split each chunk into groups of 4 jobs that run concurrently (& ... wait)
all_small_jobs = []
for sc in small_chunks:
all_small_jobs.append(' & \n'.join(sc))
job_string = ' &\nwait\n'.join(all_small_jobs)
job_string = '%s &\nwait\n'%job_string
with open(job_file,'w') as fid: # write all the jobs to the file
fid.write('%s'%job_string)
with open(sbatch_file,'w') as fid:
fid.write('%s'%job_template.replace('jobs.sh',job_file).replace('gp.','log_%d_job.'%i))
knl_file.write('sbatch %s\n'%sbatch_file)
In [ ]: