1. Import Python Packages


In [ ]:
# import pandas as pd
# import requests
# from requests.auth import HTTPBasicAuth
# import sys
# if sys.version_info[0] < 3: 
#     from StringIO import StringIO
# else:
#     from io import StringIO
    
# url = 'https://metabolomics.app.labkey.host/_webdav/Metabolite%20Atlas/%40files/Probe_peak_height.tab?contentDisposition=attachment'
# r = requests.get(url, auth=HTTPBasicAuth(u,p))
# df = pd.read_csv(StringIO(r.text),sep='\t')
# df.head()

In [1]:
%matplotlib notebook
import numpy as np

import sys, os
import glob
import json
import copy
from itertools import product

sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')

# # sys.path.insert(1,'/global/projecta/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from ast import literal_eval
import metatlas.metatlas_objects as metob
# from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import mzmine_batch_tools_adap as mzm
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp

import collections


import pandas as pd

from matplotlib import pyplot as plt
# import multiprocessing as mp

from distutils.util import strtobool
from datetime import datetime
from shutil import copyfile

from ast import literal_eval
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
# pd.set_option("display.max_colwidth", 1000000)


('NERSC=', True)

In [1]:
# %matplotlib inline
# paths = ['/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_positive_8C552483/*height.csv',
#          '/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_negative_8302CCEF/*height.csv']

# mz = [229.981110,227.966560]
# rt = [4.77,4.77]
# polarity = ['positive','negative']
# rt_range = [0.2,0.2]
# mz_range = [0.01,0.01]

# fig,ax = plt.subplots(nrows=2,ncols=1,figsize=(12,8),sharex=True)
# ax = ax.flatten()
# df_summary = []
# for i,p in enumerate(paths):
#     my_file = glob.glob(p)[-1]
#     df = pd.read_csv(my_file)
#     print(df.shape[0])
#     pk_cols = [c for c in df.columns if 'Peak height' in c]
#     idx_mz = abs(df['row m/z'] - mz[i])<0.01
#     idx_rt = abs(df['row retention time'] - rt[i])<0.2
#     idx_mzrt = idx_mz & idx_rt
#     print(sum(idx_mz),sum(idx_rt),sum(idx_mzrt))
#     s = df.loc[idx_mzrt,pk_cols]
    
#     temp = pd.merge(pd.DataFrame(s.median(axis=1),columns=['sum intensity']),
#                    df.loc[idx_mzrt,['row m/z','row retention time']],
#                    left_index=True,right_index=True)
    
#     x = np.asarray([int(vv[0].replace('_Run','').replace('.','')) if len(vv)>0 else 0 for vv in [re.findall("_Run\d+\.",v) for v in s.columns]])
#     y = s.values.flatten()
    

    
#     idx = np.argsort(x).flatten()
#     x = x[idx]
#     y = y[idx]
#     ax[i].plot(x,y,'.')
#     if i==1:
#         ax[i].set_xlabel('Run Order Index')
        
#     ax[i].set_ylabel('%s mode\nABMBA peak height (au)'%polarity[i])
    
#     temp['polarity'] = polarity[i]
#     temp['rt_range'] = rt_range[i]
#     temp['mz_range'] = mz_range[i]
#     df_summary.append(temp)

# pd.concat(df_summary)

In [2]:
# paths = ['/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_positive_8C552483/*height.csv',
#          '/global/cscratch1/sd/bpb/mzmine_jobs/20171214_KBL_JL_C18_Poplar_FungInnoc_negative_8302CCEF/*height.csv']

# mz = [218.12812,216.11357]
# rt = [2.34,2.34]
# polarity = ['positive','negative']
# rt_range = [0.2,0.2]
# mz_range = [0.01,0.01]

# fig,ax = plt.subplots(nrows=1,ncols=2)
# ax = ax.flatten()
# df_summary = []
# for i,p in enumerate(paths):
#     my_file = glob.glob(p)[-1]
#     df = pd.read_csv(my_file)
#     print(df.shape[0])
#     pk_cols = [c for c in df.columns if 'Peak height' in c]
#     idx_mz = abs(df['row m/z'] - mz[i])<0.01
#     idx_rt = abs(df['row retention time'] - rt[i])<0.2
#     idx_mzrt = idx_mz & idx_rt
#     print(sum(idx_mz),sum(idx_rt),sum(idx_mzrt))
#     s = df.loc[idx_mzrt,pk_cols].median(axis=1)
    
#     temp = pd.merge(pd.DataFrame(s,columns=['sum intensity']),
#                    df.loc[idx_mzrt,['row m/z','row retention time']],
#                    left_index=True,right_index=True)
#     temp['polarity'] = polarity[i]
#     temp['rt_range'] = rt_range[i]
#     temp['mz_range'] = mz_range[i]
#     df_summary.append(temp)

# pd.concat(df_summary)

outline of operational steps

Given a new batch file, log it as a workflow.

For a workflow, perform and evaluate parameter scanning.

For a workflow, set up the ability to run that job on datasets:

  1. Make a new parameter template from a batch file

  2. Use a parameter template to search for peaks

  3. Scan parameters within a range for a template

Make a new CSV parameter entry from a batch file, then put the parameters into a new google sheet with a column called param_id.

An example can be seen here:

https://docs.google.com/spreadsheets/d/17bf-GmYfpYKyqW_attFLli3_Glva2cnCuHlFP6FMP-Y/edit#gid=0

Roll this into the LIMS once the LIMS is stabilized.

You shouldn't have to do this very often.
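
For orientation, here is a minimal sketch (hypothetical parameter names and values) of the shape that parameter sheet takes: one row per parameter set, a param_id column, and one column per batch-step parameter, where the parameter columns are the stringified tuple keys produced from the batch XML:

In [ ]:
# hypothetical sketch of a two-row parameter sheet; real column names are the
# stringified tuple keys written out from the batch file
import pandas as pd
pd.DataFrame({'param_id': [1, 2],
              "('batch', 'batchstep', 'parameter', 'Min peak height')": [1e5, 5e5],
              "('batch', 'batchstep', 'parameter', 'm/z tolerance')": [0.002, 0.01]})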


In [ ]:
# mzm = reload(mzm)

# # xml_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/batch_files/bootcamp_template_batch_file.xml'
# # xml_file = '/global/homes/b/bpb/Downloads/batch_louie_nomsms_noisotope.xml'
# # xml_file = '/global/homes/b/bpb/Downloads/batch_louie_withmsms_withisotope.xml'
# xml_file = '/global/homes/b/bpb/Downloads/batch_template_2p39.xml'

# csv_file = '/global/homes/b/bpb/Downloads/mzmine_workflow.csv'
# df=mzm.mzmine_xml_to_csv(xml_file=xml_file,csv_file=csv_file)
# df.head()

In [ ]:
# mzm = reload(mzm)

# xml_file = '/global/homes/b/bpb/Downloads/1904_MZMine_2_39_QE_RP_UCSD_JGI_sens_deconv_basicadduct_v4dupl.xml'
# csv_file = '/global/homes/b/bpb/Downloads/mzmine_workflow.csv'
# df=mzm.mzmine_xml_to_csv(xml_file=xml_file,csv_file=csv_file,pop_input_files=False)
# df.head()

Turn the spreadsheet of specified jobs into job entries.

Each job entry points to a specific google sheet with parameter entries in it. An example is here:

https://docs.google.com/spreadsheets/d/1nQCz3Qpr_lmiz6MZJznaBFlvujySeFW5EG-C3mGPSDE/edit#gid=2011516151
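
The fields consumed from each job row later in this notebook are roughly the following (a sketch with illustrative values, not a complete schema):

In [ ]:
# illustrative job row; the fields are the ones read from the JOBS sheet below
example_job = {'done': False,                              # rows with done==True are skipped
               'parameter_sheet': 'my_parameter_sheet',    # google sheet holding the parameter sets
               'parameter_id': 1,                          # matches param_id in that sheet
               'basedir': '/global/cscratch1/sd/bpb/mzmine_jobs/my_project',
               'unique_basename': 'my_project_positive_ABCD1234',
               'polarity': 'positive',
               'groups': ['my_experiment'],                # literal_eval'd list
               'file_filters': [],                         # literal_eval'd list
               'filename_substring': '',
               'is_group': True,
               'mzmine_version': '2.39'}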


In [2]:
dp = reload(dp)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
df = df[df['done']==False]

mzmine_things = df.T.to_dict().values()
print(len(mzmine_things))
# print(mzmine_things[-1])


2

see which unique mzmine parameter sheets are needed and fetch them


In [3]:
temp = []
for m in mzmine_things:
    temp.append(m['parameter_sheet'])
unique_parameter_sheets_tablenames = pd.unique(temp)

print('There are %d unique parameter sheets'%len(unique_parameter_sheets_tablenames))
unique_parameter_sheets = {}
for s in unique_parameter_sheets_tablenames:
    df = dp.get_google_sheet(notebook_name=s,sheet_name='Sheet1')
    new_cols = []
    for c in df.columns:
        if c.startswith('('):
            new_cols.append(literal_eval(c))
        else:
            new_cols.append(c)
    df.columns=new_cols
    unique_parameter_sheets[s] = df


There are 1 unique parameter sheets

for each mzmine thing, get its parameter set and turn it into a dict


In [4]:
with open('/global/homes/b/bpb/gnps_password.txt','r') as fid:
    gnps_password = fid.read().strip()

In [5]:
for i,m in enumerate(mzmine_things):
    df = unique_parameter_sheets[m['parameter_sheet']]
    d = df[df['param_id']==m['parameter_id']].to_dict(orient='records')[-1]
    d.pop('param_id', None)
    d = collections.OrderedDict(d)
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    #replace the polarity in the crop filter module
    for k,v in d.items():
        if 'polarity' in k:
            d[k] = m['polarity'].upper()
    
    #rename all the output files
    for k,v in d.items():
        try:
            if 'placeholder_filename' in v:
                if 'gnps-job' in v:
                    d[k] = v.replace('placeholder_filename',m['unique_basename'])
                else:
                    d[k] = v.replace('placeholder_filename',new_basename)
        except TypeError:
            pass
    
#     #replace gnps password
#     for k,v in d.items():
#         try:
#             if 'justapassword' in v:
#                 d[k] = v.replace('justapassword',gnps_password)
#         except TypeError:
#             pass
    
    mzmine_things[i]['param_dict_flat'] = d
    
    #This is a good place to make the values strings.  The xml maker needs strings later on so might as well do it here
    str_d = {}
    for k,v in d.items():
        str_d[k] = str(v)
        
    #unflatten it
    mzmine_things[i]['param_dict_unflat'] = mzm.unflatten(str_d)
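
For intuition, a rough generic sketch (not the metatlas implementation) of what "unflattening" the tuple-keyed parameters into a nested dict means; the real mzm.unflatten additionally turns integer-indexed levels into lists, which is why param_dict_unflat['batch']['batchstep'] can be insert()-ed into below:

In [ ]:
# hypothetical illustration only; keys and module name are made up
def demo_unflatten(flat_dict):
    nested = {}
    for keys, value in flat_dict.items():
        d = nested
        for k in keys[:-1]:
            d = d.setdefault(k, {})
        d[keys[-1]] = value
    return nested

demo_unflatten({('batch', '@mzmine_version'): '2.39',
                ('batch', 'batchstep', '@method'): 'net.sf.mzmine.SomeModule'})
# -> {'batch': {'@mzmine_version': '2.39', 'batchstep': {'@method': 'net.sf.mzmine.SomeModule'}}}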

In [6]:
mzm = reload(mzm)
for i,m in enumerate(mzmine_things):
    mzmine_things[i]['file_list'] = mzm.get_files(m['groups'],m['filename_substring'],m['file_filters'],m['is_group'])


/global/common/software/m2650/python-cori/lib/python2.7/site-packages/pymysql/cursors.py:166: Warning: (1287, "'@@tx_isolation' is deprecated and will be removed in a future release. Please use '@@transaction_isolation' instead")
  result = self._query(query)

move all the files to cscratch


In [32]:
# rsync -zavh /project/projectdirs/metatlas/raw_data/ /global/cscratch1/sd/bpb/raw_data
# rsync -zavh /project/projectdirs/metatlas/projects/jgi_projects/ /global/cscratch1/sd/bpb/mzmine_jobs

In [7]:
for i,m in enumerate(mzmine_things):
    # add on raw data files
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    new_save_step = {'@method': "net.sf.mzmine.modules.projectmethods.projectsave.ProjectSaveAsModule",
                     'parameter': {'@name':"Project file",'current_file':'%s_workspace.mzmine'%new_basename}}
        
    new_raw_data = {'@method': 'net.sf.mzmine.modules.rawdatamethods.rawdataimport.RawDataImportModule',
                    'parameter': {'@name': 'Raw data file names',
                                  'file': []}}
    new_raw_data['parameter']['file'] = mzmine_things[i]['file_list']
    mzmine_things[i]['param_dict_unflat']['batch']['batchstep'].insert(0,new_raw_data)

turn the tabular parameter entries back into XML format


In [8]:
for i,m in enumerate(mzmine_things):
    t2 = mzm.dict_to_etree(mzmine_things[i]['param_dict_unflat'])
    mzm.indent_tree(t2)
    s2 = mzm.tree_to_xml(t2)
    mzmine_things[i]['xml_string'] = s2

Setup Job Launch Command


In [9]:
for i,m in enumerate(mzmine_things):
    mzmine_launcher = mzm.get_latest_mzmine_binary(version=m['mzmine_version'])
    mzmine_things[i]['mzmine_launcher'] = mzmine_launcher
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    if not os.path.isdir(m['basedir']):
        os.mkdir(m['basedir'])
    batch_filename = '%s_batch-params.xml'%new_basename
    with open(batch_filename,'w') as fid:
        fid.write('%s'%m['xml_string'])
    mzmine_things[i]['batch_filename'] = batch_filename

Save GNPS metadata file


In [10]:
# # s = []
# for m in mzmine_things[21]['file_list']:
[(i,g) for i,g in enumerate(os.path.basename(mzmine_things[0]['file_list'][0]).split('_'))]
#     print('-'.join(os.path.basename(m).split('_')[9].split('-')[:-1]))
#     print('')
# # sorted(s)


Out[10]:
[(0, u'20190918'),
 (1, u'AG'),
 (2, u'VB'),
 (3, u'BioSFA'),
 (4, u'AlgC18test'),
 (5, u'Pilot'),
 (6, u'QE144'),
 (7, u'EPC18'),
 (8, u'USDAY46920'),
 (9, u'POS'),
 (10, u'MSMS-v2'),
 (11, u'9'),
 (12, u'exud-MeOH'),
 (13, u'1'),
 (14, u'37.mzML')]

In [11]:
for i,m in enumerate(mzmine_things):
    print(i,m['unique_basename'])
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    metadata_filename = '%s_%s.tab'%(new_basename,'metadata')
    mzml_files = [os.path.basename(f) for f in m['file_list']]
    if '20181109TV_' in m['unique_basename']:
        attributes = [''.join(f.split('_')[9].split('-')[:-1]) for f in mzml_files]
    elif '20161212_TS_BSC_EthAc_extracts' in m['unique_basename']:
        attributes = [''.join(f.split('_')[11]) for f in mzml_files]            
    elif '20181116_TV' in m['unique_basename']:
        attributes = [''.join(f.split('_')[11].split('-')[:-1]) for f in mzml_files]        
    elif '20151019_KZ_RootExu_C18_QExactive' in m['unique_basename']:
        attributes = ['_'.join(f.split('_')[8:10]) for f in mzml_files]
    elif '_AS_' in m['unique_basename']:
        attributes = ['-'.join(f.split('_')[11].split('-')[:]) for f in mzml_files]
    elif ('_MO' in m['unique_basename']) or ('candice' in m['unique_basename']):
        attributes = ['-'.join(f.split('_')[12].split('-')[:-1]) for f in mzml_files]
    elif '20190729_AK_JE_Ekwealor' in m['unique_basename']:
        attributes = [f.split('_')[10] for f in mzml_files]
    elif '20171214_KBL_JL_C18_Poplar_FungInnoc' in m['unique_basename']:
        attributes = [f.split('_')[11] for f in mzml_files]
    elif '20190829_AK_NS_ENIGMA_bsc_community_QEHF_Ag679775-924_USHXH0126' in m['unique_basename']:
        attributes = [f.split('_')[10] for f in mzml_files]
    else:
        attributes = [f.split('_')[12] for f in mzml_files]
#     except:
#         attributes = ['no group' for f in mzml_files]
    print(len(pd.unique(attributes)),attributes)
    pd.DataFrame(data={'filename':mzml_files,'ATTRIBUTE_sampletype':attributes}).set_index('filename',drop=True).to_csv(metadata_filename,sep='\t')
    print('')


(0, '20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C')
(10, [u'exud-MeOH', u'ExCtrl-cart', u'ExCtrl-EtOAc', u'exud-EtOAc', u'ExCtrl-cart', u'exud-cart', u'exud-EtOAc', u'exud-MeOH', u'ExCtrl-MeOH', u'pell-EtOAc', u'pell-cart', u'ExCtrl-MeOH', u'pell-MeOH', u'QC-PrimMet-SOPv3', u'pell-MeOH', u'ExCtrl-EtOAc', u'exud-cart', u'pell-cart', u'pell-EtOAc'])

(1, '20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA')
(10, [u'ExCtrl-cart', u'exud-EtOAc', u'QC-PrimMet-SOPv3', u'exud-MeOH', u'ExCtrl-MeOH', u'pell-cart', u'ExCtrl-cart', u'pell-MeOH', u'ExCtrl-MeOH', u'exud-MeOH', u'ExCtrl-EtOAc', u'ExCtrl-EtOAc', u'pell-EtOAc', u'exud-cart', u'exud-cart', u'exud-EtOAc', u'pell-cart', u'pell-MeOH', u'pell-EtOAc'])

write the xml files to disk and test them


In [12]:
for i,m in enumerate(mzmine_things):
    print('%s %s'%(m['mzmine_launcher'],m['batch_filename']))


/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori.sh /global/cscratch1/sd/bpb/mzmine_jobs/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C_batch-params.xml
/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori.sh /global/cscratch1/sd/bpb/mzmine_jobs/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA_batch-params.xml

Make JAWS-formatted JSON file


In [13]:
# s = json.dumps([b['batch_filename'] for b in mzmine_things])
d = {"jgi_mzmine.inputs":[b['batch_filename'] for b in mzmine_things]}
with open('/global/homes/b/bpb/mzmine_job_adap.json','w') as fid:
    fid.write('%s'%json.dumps(d))

make SBATCH files


In [14]:
mzm = reload(mzm)
print(mzm.SLURM_HEADER)


#!/bin/bash
#SBATCH -t 04:00:00
#SBATCH -C haswell
#SBATCH -N 1
#SBATCH --error="slurm.err"
#SBATCH --output="slurm.out"
#SBATCH -q realtime
#SBATCH -A m1541
#SBATCH --exclusive
module load java



In [ ]:


In [15]:
mzm = reload(mzm)
for i,m in enumerate(mzmine_things):
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    sbatch_filename = '%s_%s.sbatch'%(new_basename,'sbatch')
    s = '%s %s'%(m['mzmine_launcher'],m['batch_filename'])
    sbatch_header = mzm.SLURM_HEADER
    
    with open(sbatch_filename,'w') as fid:
        fid.write('%s\n%s\n'%(mzm.SLURM_HEADER.replace('slurm',new_basename),s))
    print('sbatch %s'%sbatch_filename)


sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C_sbatch.sbatch
sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA/20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA_sbatch.sbatch

In [ ]:
# %system sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_positive_BD1728C6/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_positive_BD1728C6_sbatch.sbatch

# %system sbatch /global/cscratch1/sd/bpb/mzmine_jobs/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_negative_E828977F/20190418_JJ_RM_Garden_v2EcoFAB_2019Apr_QE144_Ag68377-924_USHXG01160_negative_E828977F_sbatch.sbatch

In [ ]:
%system squeue -u bpb

Send each job to GNPS


In [ ]:
# import requests
# task_id = '0d7e35bb071b4679996580af5ef0abe7'
# positive_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=gnps_molecular_network_graphml/" % (task_id)
# positive_graphml = os.path.join('/global/homes/b/bpb/Downloads/', "candice_combo.graphml")

# local_file = open(positive_graphml, "w")
# local_file.write(requests.get(positive_url).text)
# local_file.close()

In [16]:
sys.path.insert(1,'/global/homes/b/bpb/repos/GNPS_quickstart/')
import uuid
execfile('/global/homes/b/bpb/repos/GNPS_quickstart/util.py')
def copy_and_submit_to_gnps(m,override=False):
    
    
    print(m['unique_basename'])
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    taskid_filename = '%s_%s.txt'%(new_basename,'gnps-uuid')
    if (os.path.isfile(taskid_filename)) & (override==False):
        print('Already submitted')
        with open(taskid_filename,'r') as fid:
            print(fid.read())
        return False

    metadata_filename = '%s_%s.tab'%(new_basename,'metadata')
    if os.path.isfile(metadata_filename):
        upload_to_gnps(metadata_filename,m['unique_basename'],'samplemetadata','bpbowen',gnps_password)
    else:
        print('METADATA NOT FOUND %s'%metadata_filename)
        return False
        
    mgf_filename = '%s_%s.mgf'%(new_basename,'MSMS')
    if os.path.isfile(mgf_filename):
        upload_to_gnps(mgf_filename,m['unique_basename'],'featurems2','bpbowen',gnps_password)
    else:
        print('SPECTRA NOT FOUND %s'%mgf_filename)
        return False
        
    quant_filename = '%s_%s.csv'%(new_basename,'peak-area')
    if os.path.isfile(quant_filename):
        upload_to_gnps(quant_filename,m['unique_basename'],'featurequantification','bpbowen',gnps_password)
    else:
        print('QUANT NOT FOUND %s'%quant_filename)
        return False
    
    task_id = launch_GNPS_featurenetworking_workflow( m['unique_basename'],m['unique_basename'], 
                                           'bpbowen', gnps_password, 'ben.bowen@gmail.com',
                                           'MZMINE2', [], 'HIGHRES',
                                           os.path.basename(metadata_filename),
                                           os.path.basename(mgf_filename),
                                           os.path.basename(quant_filename),
                                          uuid.uuid1()) #I don't think this uuid does anything
    if task_id is not None:
        with open(taskid_filename,'w') as fid:
            fid.write('https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=%s'%task_id)
        # SUCCESS!!!
        return task_id
    else:
        return False

for i,m in enumerate(mzmine_things):
    taskid = copy_and_submit_to_gnps(m,override=False)
    print(i,taskid)
    print('')


/global/common/software/m2650/python-cori/lib/python2.7/site-packages/IPython/kernel/__main__.py:4: DeprecationWarning: Python 2 suport will be removed in ftputil 4.0.0
20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_positive_F379EF3C
/global/common/software/m2650/python-cori/lib/python2.7/site-packages/IPython/kernel/__main__.py:87: DeprecationWarning: `use_list_a_option` will default to `False` in ftputil 4.x.x
MAKING DIR
MAKING Group DIR
MAKING Group DIR
MAKING Group DIR
/global/common/software/m2650/python-cori/lib/python2.7/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
/global/common/software/m2650/python-cori/lib/python2.7/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
252f48fa7db54db797db3fba2a2b4025
Launched Task: : 252f48fa7db54db797db3fba2a2b4025
(0, u'252f48fa7db54db797db3fba2a2b4025')

20190918_AG_VB_BioSFA_AlgC18test_Pilot_QE144_EPC18_USDAY46920_negative_55220FEA
MAKING DIR
MAKING Group DIR
MAKING Group DIR
MAKING Group DIR
/global/common/software/m2650/python-cori/lib/python2.7/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
/global/common/software/m2650/python-cori/lib/python2.7/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
cef9aa2683944829a734609f95a857e7
Launched Task: : cef9aa2683944829a734609f95a857e7
(1, u'cef9aa2683944829a734609f95a857e7')


In [51]:
df = pd.DataFrame(mzmine_things)
df = df[['basedir','unique_basename','polarity']]
# df.drop(columns=['file_list','file_filters','groups','param_dict_flat','param_dict_unflat','xml_string','mzmine_launcher'],inplace=True)
for i,m in df.iterrows():
    new_basename = os.path.join(m['basedir'],m['unique_basename'])
    taskid_filename = '%s_%s.txt'%(new_basename,'gnps-uuid')
    if (os.path.isfile(taskid_filename)):
        with open(taskid_filename,'r') as fid:
            df.loc[i,'task_id'] = fid.read().split('=')[-1]
    df.loc[i,'stripped_basename'] = '_'.join(m['unique_basename'].replace('positive','').replace('negative','').split('_')[:-1])
#     df.loc[i,'stripped_basename'] = '_'.join(m['unique_basename'].replace('Pos','').replace('Neg','').split('_')[:-1])

df.fillna('',inplace=True)
df


Out[51]:
basedir unique_basename polarity task_id stripped_basename
0 /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC... 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_763DE1F7 positive a8a459391a51435db7b85b60e3b09cfb 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_
1 /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC... 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_4FF0A628 negative 98e6ae87c6074d7ea51975555221360a 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_
2 /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC... 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_6AD23F7C positive 6345d02fd4ac498d8e8da8ec5664a6f4 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_
3 /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC... 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_CDBCC477 negative 84cbbe4cc8d34ce58e83d97423529d9d 20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_
4 /global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_positive_7C475F10 20161212_TS_BSC_EthAc_extracts_positive_7C475F10 positive 00a5f481f5434c84bccce13752b347ce 20161212_TS_BSC_EthAc_extracts_
5 /global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_negative_63389345 20161212_TS_BSC_EthAc_extracts_negative_63389345 negative dd10d9cfe6bc4ab5bca8a891e89c2b76 20161212_TS_BSC_EthAc_extracts_

In [53]:
df = df.loc[2:]

In [54]:
d = {}
print('module load python')
for i in df['stripped_basename'].unique():
    if len(df[df['stripped_basename']==i].index)>1:
        d[i] = {}
        d[i]['positive'] = {}
        d[i]['negative'] = {}
        for j in df[df['stripped_basename']==i].index:
            d[i][df['polarity'][j]]['job_id'] = df['task_id'][j]
            d[i][df['polarity'][j]]['basedir'] = df['basedir'][j]
            d[i][df['polarity'][j]]['unique_basename'] = df['unique_basename'][j]
#         mydir = '/global/cscratch1/sd/bpb/mzmine_jobs/%s'%i
        if (d[i]['positive']['job_id']!='') & (d[i]['negative']['job_id']!=''):
            print('#%s'%i)
#             if not os.path.isdir(mydir):
#                 os.mkdir(mydir)
            pos_outfile = os.path.join(d[i]['positive']['basedir'],'positive_network.graphml')
            neg_outfile = os.path.join(d[i]['negative']['basedir'],'negative_network.graphml')
            merged_outfile = os.path.join(d[i]['positive']['basedir'],'%s_merged_network.graphml'%d[i]['positive']['unique_basename'])
            command = 'python /global/homes/b/bpb/repos/MergePolarity/merge_polarity.py'
#              --positive-graphml %s --negative-graphml %s
            print('%s --output-graphml %s --positive-network-task %s --negative-network-task %s --RT_TOLERANCE 0.15'%(
                command,
                merged_outfile,
#                 pos_outfile,
#                 neg_outfile,
                d[i]['positive']['job_id'],
                d[i]['negative']['job_id']))


module load python
#20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_
python /global/homes/b/bpb/repos/MergePolarity/merge_polarity.py --output-graphml /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_6AD23F7C/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_6AD23F7C_merged_network.graphml --positive-network-task 6345d02fd4ac498d8e8da8ec5664a6f4 --negative-network-task 84cbbe4cc8d34ce58e83d97423529d9d --RT_TOLERANCE 0.15
#20161212_TS_BSC_EthAc_extracts_
python /global/homes/b/bpb/repos/MergePolarity/merge_polarity.py --output-graphml /global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_positive_7C475F10/20161212_TS_BSC_EthAc_extracts_positive_7C475F10_merged_network.graphml --positive-network-task 00a5f481f5434c84bccce13752b347ce --negative-network-task dd10d9cfe6bc4ab5bca8a891e89c2b76 --RT_TOLERANCE 0.15

CHGRP to metatlas


In [55]:
for i,m in enumerate(mzmine_things):
    print("changing permissions for %s"%m['basedir'])
    os.system("chgrp -R metatlas %s"%m['basedir'])
os.system("chmod o+rx /global/cscratch1/sd/bpb")
os.system("chmod o+rx /global/cscratch1/sd/bpb/mzmine_jobs")


changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_763DE1F7
changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_4FF0A628
changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_6AD23F7C
changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_CDBCC477
changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_positive_7C475F10
changing permissions for /global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_negative_63389345
Out[55]:
0

In [56]:
for i,m in enumerate(mzmine_things):
    print("%s"%m['basedir'])


/global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_763DE1F7
/global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_4FF0A628
/global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_positive_6AD23F7C
/global/cscratch1/sd/bpb/mzmine_jobs/20191004_AG_AO_JGI2ndMet_Kphytohabitans_SetMerged_QE144_EPC18_USDAY46920_negative_CDBCC477
/global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_positive_7C475F10
/global/cscratch1/sd/bpb/mzmine_jobs/20161212_TS_BSC_EthAc_extracts_negative_63389345

End of MZMINE + GNPS + Merge


In [ ]:
def group_consecutive(data,stepsize=10.0,do_ppm=True):
    """
    split a numpy array where consecutive elements are greater than stepsize
    can be ppm or value
    if idx is not None then returns indices of elements otherwise returns values
    
    The main use case is an unsorted list of m/zs as "data" and optionally 
    their numerical index as "idx". Typically a user would want to retrieve 
    the group indices in the original order that they provided their list 
    of m/zs.
    
    usage:
    
       
    """
    if type(data) is np.ndarray:
        # cool way to sort and unsort array:
        idx_sorted = data.argsort()
        sort_w_unsort = np.column_stack((np.arange(idx_sorted.size),idx_sorted))
        # sort_w_unsort[:,0] are the original indices of data
        # sort_w_unsort[:,1] are the sorted indices of data
        data_sorted = data[sort_w_unsort[:,1]]
        # np.argsort(sort_w_unsort[:,1]) returns the indices to map the sorted data back to the original
        # data_unsorted = data_sorted[np.argsort(sort_w_unsort[:,1])]
        
        if do_ppm:
            d = np.diff(data_sorted) / data_sorted[:-1] * 1e6
        else:
            d = np.diff(data_sorted)
        
        # make groups of the array
        data_groups = np.split(data_sorted, np.where(d > 2.0*stepsize)[0]+1)
        # replace each group of values with group index
        for i,data_slice in enumerate(data_groups):
            data_groups[i] = data_groups[i]*0 + i
        group_indices = np.concatenate(data_groups)
        # reorder the group indices
        group_indices = group_indices[np.argsort(sort_w_unsort[:,1])]
        return group_indices.astype(int)#
    else:
        print('not a numpy array. convert it and sort it first')

def map_mzgroups_to_data(mz_atlas,mz_group_indices,mz_data):
    """
    mz_atlas: m/z values from atlas
    mz_group_indices: integer index from "group_consecutive"
    mz_data: m/z values from raw data
    
    """
    from scipy import interpolate

    f = interpolate.interp1d(mz_atlas,np.arange(mz_atlas.size),kind='nearest',bounds_error=False,fill_value='extrapolate') #get indices of all mz values in the atlas
    idx = f(mz_data)   # interpolate to find the nearest m/z in the atlas for each m/z in the data
    idx = idx.astype('int')
#     d = 1e6#np.abs(mz_data - mz_atlas[idx]) / mz_data * 1.0e6
#     output_mat = np.column_stack((d,))
    return mz_group_indices[idx]#output_mat
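
# assumed inputs for the block below (not defined in this notebook): "atlas" is a
# DataFrame with 'mz', 'group_index', 'ppm_tolerance', 'rt_min', 'rt_max' and
# 'extra_time' columns, and "msdata" is a DataFrame of raw datapoints with 'mz' and 'rt'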

g = map_mzgroups_to_data(atlas['mz'].values[:],
                           atlas['group_index'].values[:],
                           msdata['mz'].values[:])
msdata['group_index'] = g#[:,1]
#     msdata['group_index_ppm'] = g[:,0]

df = pd.merge(atlas,msdata,left_on='group_index',right_on='group_index',how='outer',suffixes=('_atlas','_data'))

#grab all datapoints including "extra"
mz_condition = abs(df['mz_data']-df['mz_atlas'])/df['mz_atlas']*1e6<df['ppm_tolerance']
rt_min_condition = df['rt']>=(df['rt_min']-df['extra_time'])
rt_max_condition = df['rt']<=(df['rt_max']+df['extra_time'])
df = df[(mz_condition) & (rt_min_condition) & (rt_max_condition)]

In [ ]:
# %system cat /project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37_batch-params.xml

In [ ]:
# with open('/project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_1F910A37_batch-params.xml','r') as fid:
#     s = fid.read()
# print(s)

In [ ]:
os.path.isfile(new_basename)

In [ ]:
m = mzmine_things[0]
with open('/global/homes/b/bpb/Downloads/gnps_section_mzmine_batch.xml','r') as fid:
    d = fid.read()
new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'msms.mgf'))
d = d.replace('/global/homes/b/bpb/Downloads/gnps_upload.mgf',new_basename)
print(d)
new_batch = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'gnps-submission.xml'))
with open(new_batch,'w') as fid:
    fid.write(d)
mzmine_launcher = mzm.get_latest_mzmine_binary(version=m['mzmine_version'])
print('%s %s'%(mzmine_launcher,new_batch))

In [ ]:
for i,m in enumerate(mzmine_things):
    print('%s %s'%(m['mzmine_launcher'],m['batch_filename']))

see if your jobs finished


In [ ]:
%system cat "/project/projectdirs/metatlas/projects/jgi_projects/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_13EEF8DE/20190522_KBL_JM_504478_Plant_Arab_QE-HF_C18_USDAY47560_positive_13EEF8DE_peak-height.csv" | wc -l

In [ ]:
# df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
# mzmine_things = df.T.to_dict().values()
import os.path, time

for i,m in enumerate(mzmine_things):
#     new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'peak-height.csv'))
#     new_basename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'msms.mgf'))
    new_basename = os.path.join(m['basedir'],'%s%s'%(m['unique_basename'],'.err'))

    print(i,os.path.isfile(new_basename),new_basename)
    print("last modified: %s" % time.ctime(os.path.getmtime(new_basename)))
    with open(new_basename,'r') as fid:
        s = fid.read()
    print(s[-100:])
    print('')
#     df = pd.read_csv(new_basename)
#     print(df.shape)
#     print(os.listdir(m['basedir']))

turn profile MS/MS spectra into centroided spectra


In [ ]:
filename

In [ ]:


In [ ]:
mzmine_things[0]['file_list']

In [ ]:
from metatlas.helpers import spectralprocessing as sp  # provides peakdet, used below for centroiding

def make_new_mgf_text(t):
    # split the MGF text on 'END IONS'; treat the first 7 lines of each block as the
    # header (BEGIN IONS plus key=value fields) and the remaining lines, minus the
    # trailing one, as "m/z intensity" pairs
    data = [(tt.strip().split('\n')[:7],tt.strip().split('\n')[7:][:-1]) for tt in t.split('END IONS')]
    mgf_str = ''
    for data_slice in data:
        spectrum = [(float(dd.split(' ')[0]),float(dd.split(' ')[-1])) for dd in data_slice[-1]]
        peak_max,peak_min = sp.peakdet([s[1] for s in spectrum],10.0,[s[0] for s in spectrum])
        header = '\n'.join(data_slice[0])
        spec_str = '\n'.join(['%5.4f %5.1f'%(p[0],p[1]) for p in peak_max])
        mgf_str = '%s\n\n%s\n%s\nEND IONS\n'%(mgf_str,header,spec_str)
    return mgf_str


for i,m in enumerate(mzmine_things):
    filename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'MSMS.mgf'))
    with open(filename,'r') as fid:
        t = fid.read()
        mgf_text = make_new_mgf_text(t)
    new_filename = os.path.join(m['basedir'],'%s_%s'%(m['unique_basename'],'MSMS-centroid.mgf'))
    print(new_filename)
    with open(new_filename,'w') as fid:
        fid.write('%s'%mgf_text)

In [ ]:
%matplotlib inline
from metatlas.helpers import spectralprocessing as sp
peak_max,peak_min = sp.peakdet([s[1] for s in spectrum],10.0,[s[0] for s in spectrum])
fig,ax = plt.subplots()
ax.set_title(len(peak_max))
ax.vlines([x[0] for x in peak_max],[0.0 for x in peak_max],[x[1] for x in peak_max])

compare results to true positives table


In [ ]:
mzm = reload(mzm)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
df = df[df['done']==False]

mzmine_things = df.T.to_dict().values()
mz_tolerance = 0.01
rt_tolerance = 0.2

true_pos_filename = '/global/homes/b/bpb/Downloads/mana_bootcamp_true_positives.xlsx'

# base_path = '/project/projectdirs/metatlas/projects/jgi_projects/'
base_path = '/global/cscratch1/sd/bpb/mzmine_jobs/'

# project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D'
# project_path = '20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C'
# project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_positive_2800C348'
project_path = '20190401_JGI_Metabolomics_Bootcamp_ReversePhase_positive_4E39E174'
# project_path = '20190501_bootcamp_PeterAndeer'
# project_path = '20190501_bootcamp_BrianDeFelice'

feature_table_extension = 'height.csv' # for mzmine files
rt_column = 'row retention time'
mz_column = 'row m/z'
headerrows = 0
sep=','
n=4
# feature_table_extension = '.txt' # for mzmine files
# rt_column = 'Average Rt(min)'
# mz_column = 'Average Mz'
# headerrows = 3
# sep='\t'
data = []
for i,m in enumerate(mzmine_things):
    bd = m['basedir'].split('/')[-1]
    if 'HILIC' in bd.upper():
        chromatography = 'hilic'
    else:
        chromatography = 'reverse phase'
    istd,bio,total = mzm.summarize_results(n,true_pos_filename,base_path,bd,feature_table_extension,rt_column,mz_column,headerrows,sep,mz_tolerance,rt_tolerance)
    d = {'job':bd,'type':'istd','chromatography':chromatography,'job_type':'%s_%s'%(bd,'istd'),'total':total}
    for j,h in enumerate(istd):
        d['num_%d'%j] = h
    data.append(d)
    d = {'job':bd,'type':'bio','chromatography':chromatography,'job_type':'%s_%s'%(bd,'bio'),'total':total}
    for j,h in enumerate(bio):
        d['num_%d'%j] = h
    data.append(d)
df = pd.DataFrame(data)
df.set_index('job_type',drop=True,inplace=True)
df.head()
    
#     print(bio)

In [ ]:
df.index = [i[-30:-5] for i in df.index]

THE DEDUPLICATOR IS LIKELY REMOVING PEAKS THAT ARE REALLY DIFFERENT PEAKS


In [ ]:
%matplotlib inline
mzm = reload(mzm)
width = 0.4

fig,ax = plt.subplots(nrows=2,ncols=2,figsize=(21,15))
cols = [c for c in df.columns if 'num_' in c]
df_f = df[(df['type']=='bio') & (df['chromatography']=='reverse phase')]
df_f[cols].plot(kind='bar', sharex=False, sharey=False, stacked=True,ax=ax[0,0],width=width,position=0)
ax[0,0].set_yticks(np.linspace(0,1,11))
ax[0,0].grid(True)
ax[0,0].legend(loc='upper center',fontsize=20)
ax[0,0].set_title('Bio / Reverse-Phase')
ax[0,0].set_ylabel('Percent Correct')
ax2 = ax[0,0].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.legend(loc='lower center',fontsize=20)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])

# ax2.set_xticklabels([i for i in df_f.index])


df_f = df[(df['type']=='bio') & (df['chromatography']=='hilic')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[0,1], width=width,position=0)
ax[0,1].set_yticks(np.linspace(0,1,11))
ax[0,1].grid(True)
ax[0,1].get_legend().remove()
ax[0,1].set_title('Bio / HILIC')
ax2 = ax[0,1].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[0,1].legend(loc='upper center',fontsize=20)


df_f = df[(df['type']=='istd') & (df['chromatography']=='reverse phase')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[1,0], width=width,position=0)
ax[1,0].set_yticks(np.linspace(0,1,11))
ax[1,0].grid(True)
ax[1,0].get_legend().remove()
ax[1,0].set_title('ISTD / Reverse-Phase')
ax2 = ax[1,0].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[1,0].legend(loc='upper center',fontsize=20)



df_f = df[(df['type']=='istd') & (df['chromatography']=='hilic')]
df_f[cols].plot(kind='bar', stacked=True,ax=ax[1,1], width=width,position=0)
ax[1,1].set_yticks(np.linspace(0,1,11))
ax[1,1].grid(True)
ax[1,1].get_legend().remove()
ax[1,1].set_title('ISTD / HILIC')
ax2 = ax[1,1].twinx() # Create another axes that shares the same x-axis as ax.
df_f['total'].plot(kind='bar',ax=ax2, sharex=True, sharey=False, color='purple', width=width, position=1)
ax2.set_ylabel('Num Features')
ax2.set_xlim(-1,df_f.shape[0])
ax[1,1].legend(loc='upper center',fontsize=20)

plt.tight_layout()

Check out the results and see if they make sense


In [ ]:
mzm = reload(mzm)
df = dp.get_google_sheet(notebook_name='mzmine_jobs_20190625',sheet_name='JOBS',literal_cols=['file_filters','groups'])
# mzmine_things = df.T.to_dict().values()
job = df.loc[44,:]
job

In [ ]:
os.listdir(job['basedir'])

Make GNPS metadata


In [ ]:
new_basename = os.path.join(job['basedir'],job['unique_basename'])
peak_height_filename = '%s_%s.csv'%(new_basename,'peak-height')
metadata_filename = '%s_%s.tab'%(new_basename,'metadata')

df = pd.read_csv(peak_height_filename)
mzml_files = [c.replace(' Peak height','') for c in df.columns if 'Peak height' in c]
pd.DataFrame(data={'filename':mzml_files,'ATTRIBUTE_sampletype':[m.split('_')[12] for m in mzml_files]}).set_index('filename',drop=True).to_csv(metadata_filename,sep='\t')

set up some specific things for each set that are not defined in the mzmine params recipe

  1. polarity for filtering any fps files that might be loaded up
  2. file list for processing
  3. export file names

In [ ]:
# with open('/project/projectdirs/metatlas/projects/mzmine_parameters/batch_files/bootcamp_template_batch_file.xml','r') as fid:
#     xml_str = fid.read()
    
# d = mzm.xml_to_dict(xml_str)
# t = mzm.dict_to_etree(d)
# mzm.indent_tree(t)
# s1 = mzm.tree_to_xml(t)


# df = mzm.flatten(d,enumerate_types=(list,))
# #sorted(['.'.join(tuple(str(x) for x in tup)) for tup in df.keys()])
# df2 = mzm.unflatten(df)
# # t2 = mzm.dict_to_etree(xml_2)
# # mzm.indent_tree(t2)
# # s2 = mzm.tree_to_xml(t2)



# # t = dict_to_etree(new_d)
# #     indent_tree(t)
# #     xml_batch_str = tree_to_xml(t,filename=task.input_xml)
# #     job_runner = '%s %s'%(task.mzmine_launcher,task.input_xml)

In [ ]:
df = dp.get_google_sheet(notebook_name='ADAP Bootcamp MZMine Untargeted Job Submission',sheet_name='min_max')
s = df.iloc[0]

test_params = {y:x for x,y in s.to_dict().iteritems() if len(x)>0 }

for k,v in test_params.iteritems():
    v2 = literal_eval(v)
#     test_params[k] = {'min':v2[0],'max':v2[1],'n':v2[2],'vals':np.linspace(v2[0],v2[1],num=v2[2])}
    test_params[k] = np.linspace(v2[0],v2[1],num=v2[2])

df.columns = df.iloc[0]
df = df.iloc[1:]

# do ast.literal_eval on lists
literal_cols = ['file_filters','groups']
for col in literal_cols:
    df[col] = df[col].apply(literal_eval)

df['is_group'] = [bool(strtobool(a)) for a in df['is_group']]
df['remove_isotopes'] = [bool(strtobool(a)) for a in df['remove_isotopes']]
df['peak_with_msms'] = [bool(strtobool(a)) for a in df['peak_with_msms']]

numeric_columns = [u'min_rt', u'max_rt', u'min_peak_duration', u'max_peak_duration',
                   u'smoothing_scans', u'min_num_scans', u'group_intensity_threshold',
                   u'min_peak_height', u'mz_tolerance', u'peak_to_valley_ratio', u'rt_tol_multifile',
                   u'rt_tol_perfile', u'ms1_noise_level', u'ms2_noise_level', u'chromatographic_threshold',
                   u'search_for_minimum_rt_range', u'minimum_relative_height', u'mz_range_scan_pairing',
                   u'rt_range_scan_pairing', 'min_peaks_in_row', 'gapfill_intensity_tolerance',
                   'adap_sn_threshold', 'adap_peak_width_mult', 'adap_area_threshold', 'adap_rt_wavelet_min', 'adap_rt_wavelet_max']

df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce', axis=1)

mzmine_things = df.T.to_dict().values()
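
However the min_max sheet is laid out, each scanned parameter ends up in test_params as an np.linspace range built from a "(min, max, n)" literal. A minimal sketch with a hypothetical range:

In [ ]:
# hypothetical "(min, max, n)" entry for one scanned parameter
from ast import literal_eval
import numpy as np
v2 = literal_eval('(1e4, 1e6, 3)')
np.linspace(v2[0], v2[1], num=v2[2])   # -> array([  10000.,  505000., 1000000.])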

In [ ]:
# groups = ['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain']
mzm = reload(mzm)
current_month = datetime.now().strftime('%m') #// 02 //This is 0 padded
current_year = datetime.now().strftime('%Y')  #// 2018
year_month_dir = '%s%s'%(current_year,current_month)
for i,m in enumerate(mzmine_things):
    mzmine_things[i]['files'] = mzm.get_files(m['groups'],m['filename_substring'],m['file_filters'],m['is_group'])
    print(m['basename'],m['polarity'],len(mzmine_things[i]['files']))
    new_dir = os.path.join(mzm.DATA_PATH,year_month_dir)
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for j,f in enumerate(mzmine_things[i]['files']):    
        new_file_path = os.path.join(new_dir,os.path.basename(f))
        if not os.path.isfile(new_file_path):
            copyfile(f, new_file_path)
        mzmine_things[i]['files'][j] = new_file_path

In [ ]:
all_variables = sorted(test_params) 
combinations = list(product(*(test_params[var] for var in all_variables)))
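
For intuition, a minimal sketch (hypothetical parameter names and ranges; the real ones come from test_params above) of how the scan grid expands with itertools.product:

In [ ]:
# hypothetical two-parameter scan: 3 x 2 = 6 combinations, one mzmine job per combination
from itertools import product
import numpy as np
demo_params = {'min_peak_height': np.linspace(1e4, 1e6, 3),
               'mz_tolerance': np.linspace(0.002, 0.01, 2)}
demo_vars = sorted(demo_params)
demo_combos = list(product(*(demo_params[v] for v in demo_vars)))
print(len(demo_combos))   # 6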

In [ ]:
#For each mzmine thing, and for each parameter, make a new mzmine thing with that parameter
new_mzmine_things = []
for i,m in enumerate(mzmine_things): #iterate across hilic and rp
    counter = 0
    for values in combinations: #iterate through all the possible combinations
        new_mzmine_things.append(copy.deepcopy(m)) #append all the static info from an mzmine_thing
        new_mzmine_things[-1]['basename'] = '%d_%s'%(counter,m['basename']) #give the output files a unique filename
        new_mzmine_things[-1]['unique_basename'] = '%d_%s'%(counter,m['unique_basename']) #this might not be used. I can't remember.
        counter += 1
        for j,var in enumerate(all_variables): #iterate across the mzmine variables that need to be changed
            new_mzmine_things[-1][var] = values[j] #use the new value for a variable

In [ ]:
import glob as glob
status = []
for i,params in enumerate(new_mzmine_things):
    project_name = '%s_%s'%(params['unique_basename'],params['polarity'])
    results_dir = os.path.join(params['basedir'],project_name)
    peak_height_file = os.path.join(params['basedir'],'%s_%s_peak_height.csv'%(params['basename'],params['polarity']))
    # see if final mzmine feature table exists.
    job_done_1 = os.path.isfile(peak_height_file)
    # if mzmine workspace exists store this so it can be removed from job script for reruns
    new_mzmine_things[i]['mzmine_done'] = job_done_1
    
    # see if sheets/peak_height.tab exists
    job_done_2 = False
    identification_folder = os.path.join(results_dir,'identification')
    num_id_figures = 0
    if os.path.isfile(peak_height_file):
        with open(peak_height_file,'r') as fid:
            peaks = fid.read()
            num_peaks = len(peaks.split('\n'))-1
        num_id_figures = len(glob.glob(os.path.join(identification_folder,'*.pdf')))
        if num_peaks > 0:
            job_done_2 = True
    else:
        num_peaks = 0
    # if metatlas is done note this too, but these jobs shouldn't be rerun (since they are done)
    new_mzmine_things[i]['metatlas_done'] = job_done_2
    
    # see if the filtered mzmine workspace has been created
    job_done_3 = os.path.isfile(os.path.join(results_dir,'%s.mzmine'%project_name))
    # this won't change the job script but completes the book-keeping
    new_mzmine_things[i]['small_mzmine_done'] = job_done_3
    
    status.append({'0_basedir':params['basedir'].split('/')[-1],
                   '1_job':'%s_%s'%(params['unique_basename'],params['polarity']),
                   '2_features_done':job_done_1,
                  '3_num_features':num_peaks})
pd.DataFrame(status).to_csv('/global/homes/b/bpb/Downloads/mzmine_jobs_and_status.csv',index=False)    
# pd.DataFrame(status)

In [ ]:
# new_mzmine_things=[]
# print(len(mzmine_things))
# for i,params in enumerate(mzmine_things):
#     # see if final mzmine file exists.  this will happen even if peak_height.tab is missing
#     if (not params['metatlas_done']) or (not params['small_mzmine_done']) or (not params['mzmine_done']):
#         new_mzmine_things.append(params)
# mzmine_things = new_mzmine_things
# print(len(mzmine_things))

In [ ]:
# for m in pd.unique([m['basedir'] for m in new_mzmine_things]):
#     print('rm -r %s'%m)

In [ ]:
# for m in pd.unique([m['basedir'] for m in new_mzmine_things]):
#     print('chmod -R 770 %s'%m)
#     print('chgrp -R metatlas %s'%m)

In [ ]:


In [ ]:
mzm = reload(mzm)
def remove_workspace_from_job(job_filename):
    with open(job_filename,'r') as fid:
        s = fid.read()

    split_str = '</batchstep>\n'
    s = s.split(split_str)
    idx = [i for i,ss in enumerate(s) if 'peak_area' in ss][-1]
    s = s[:idx]
    s.append('</batch>\n')
    s = split_str.join(s)
    with open(job_filename,'w') as fid:
        fid.write('%s'%s)


with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','w') as bcf:
    for i,params in enumerate(new_mzmine_things):
        #setup all the job scripts when job actually runs it will copy params to params-used
        job_script = mzm.create_job_script(params)

        
#         wipe out the peak area, mgf, and workspace export
        with open(job_script,'r') as fid:
            s = fid.read()
        job_filename = s.split(' ')[-1].strip()
        remove_workspace_from_job(job_filename)
        
        #dump the parameters to a json file
        with open( os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity'])), 'w') as fid:
            fid.write(json.dumps(params))
        with open(job_script,'r') as fid:
            s = fid.read()
        bcf.write('%s\n'%s.split('\n')[-2])
        
    #     print('sbatch %s'%job_script)

In [ ]:
%system chgrp -R metatlas mzmine_jobs/ADAP_20190401_JGI_Metabolomics_Bootcamp_*

In [ ]:
24000*10/4/200/60

In [ ]:
import time
d = '/global/cscratch1/sd/jaws/prod/cromwell-executions/jgi_mzmine/d6a9b53b-fe5b-4782-b7c0-d019573eb9e7/call-run_mzmine'
for i in range(0,17):
    stat = os.stat('%s/shard-%d/execution/stderr.submit'%(d,i))
    print(time.ctime(stat.st_mtime))

In [ ]:
import json

In [ ]:
with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','r') as bcf:
    s = bcf.read()
job_list = s.split('/global/common/software/m2650/mzmine_parameters/MZmine/MZmine-2.39/startMZmine_NERSC_Headless_Cori.sh ')
job_list = job_list[1:]
job_list = [j.strip() for j in job_list]
n = 6000
 
# using list comprehension 
final = [job_list[i * n:(i + 1) * n] for i in range((len(job_list) + n - 1) // n )]  
# print (final) 
for i,small_list in enumerate(final):
    d = {'jgi_mzmine.inputs':small_list}
    with open('/global/homes/b/bpb/adap_job_array_%d.json'%i,'w') as fid:
        json.dump(d,fid)

In [ ]:
len(final)

In [ ]:
import glob
files = glob.glob('/global/cscratch1/sd/bpb/mzmine_jobs/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
files

In [ ]:


In [ ]:


In [ ]:

OLD STUFF BELOW HERE


In [ ]:
all_batch_files = glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
all_batch_files  = all_batch_files + glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D/*.csv')
all_batch_files = sorted(all_batch_files)
all_batch_files = [os.path.basename(a).replace('_peak_height.csv','.xml') for a in all_batch_files]
print('%s\n%s'%(all_jobs[0].split(' ')[-1],all_batch_files[0]))
os.path.basename(all_jobs[0].split(' ')[-1]) in all_batch_files

In [ ]:


In [ ]:


In [ ]:
len(all_completed_jobs)

In [ ]:
len(all_completed_jobs)

In [ ]:
# break the jobs into 10 smaller job sets

job_template = """#!/bin/bash
#SBATCH --qos=regular
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --constraint=haswell
#SBATCH --error="gp.err"
#SBATCH --output="gp.out"
export MALLOC_ARENA_MAX=1

source jobs.sh
"""

# job_template = """#!/bin/bash
# #SBATCH -t 00:15:00
# #SBATCH -C haswell
# #SBATCH -N 1
# #SBATCH -q realtime
# #SBATCH -A m1541
# #SBATCH --exclusive

# source jobs.sh
# """

# job_template="""#!/bin/bash
# #SBATCH -N 4
# #SBATCH -C haswell
# #SBATCH --exclusive
# #SBATCH -q realtime
# #SBATCH --error="gp.err"
# #SBATCH --output="gp.out"
# #SBATCH -A m1541
# #SBATCH -t 04:00:00
# #SBATCH -L project

# # cori specific tells it to not allocate memory on a per thread basis Java
# export MALLOC_ARENA_MAX=1

# module load taskfarmer
# export THREADS=4
# runcommands.sh jobs.sh
# """

# job_template = """#!/bin/bash
# #SBATCH -N 20 -c 64
# #SBATCH --error="gp.err"
# #SBATCH -L project 
# #SBATCH --output="gp.out"
# #SBATCH --qos=premium
# #SBATCH -A m1541
# #SBATCH --constraint=haswell
# #SBATCH -L project
# #SBATCH -t 24:00:00


# export HDF5_USE_FILE_LOCKING=FALSE

# module load taskfarmer
# export THREADS=4
# runcommands.sh jobs.sh"""

with open('/global/homes/b/bpb/bootcamp/bootcamp_jobs.sh','r') as bcf:
    s = bcf.read()

# remove jobs that have been done
all_jobs = s.split('\n')

all_completed_jobs = glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_HILIC_D34FE46C/*.csv')
all_completed_jobs  = all_completed_jobs + glob.glob('/project/projectdirs/metatlas/projects/jgi_projects/20190401_JGI_Metabolomics_Bootcamp_ReversePhase_18DCF58D/*.csv')
all_completed_jobs = sorted(all_completed_jobs)
all_completed_jobs = [os.path.basename(a).replace('_peak_height.csv','.xml') for a in all_completed_jobs]

filtered_jobs = [x for x in all_jobs if os.path.basename(x.split(' ')[-1]) not in all_completed_jobs]
len(all_jobs),len(filtered_jobs),len(all_completed_jobs)

# filtered_jobs = filtered_jobs[:500]

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]
        
chunked_jobs = list(chunks(filtered_jobs,120))
print('rm -r /global/homes/b/bpb/bootcamp/jobs/job_chunk_*')
with open('/global/homes/b/bpb/bootcamp/knl_job_file.sh','w') as knl_file:
    for i,cj in enumerate(chunked_jobs):
        d = '/global/homes/b/bpb/bootcamp/jobs/job_chunk_%d'%i
        if not os.path.isdir(d):
            os.mkdir(d)
        job_file = os.path.join(d,'jobs_%d.sh'%i)
        sbatch_file = os.path.join(d,'job-%d.sbatch'%i)
    #     job_string = '\n'.join(cj)
        small_chunks = list(chunks(cj,4)) #make make job chunks in 3 each
        all_small_jobs = []
        for sc in small_chunks:
            all_small_jobs.append(' & \n'.join(sc))
        job_string = ' &\nwait\n'.join(all_small_jobs)
        job_string = '%s &\nwait\n'%job_string
        with open(job_file,'w') as fid: # write all the jobs to the file
            fid.write('%s'%job_string)
        with open(sbatch_file,'w') as fid:
            fid.write('%s'%job_template.replace('jobs.sh',job_file).replace('gp.','log_%d_job.'%i))
        knl_file.write('sbatch %s\n'%sbatch_file)

In [ ]: