1. Import Python Packages


In [ ]:
%matplotlib notebook
import sys, os
import glob
import json
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')

# sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )

import metatlas.metatlas_objects as metob
from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import multiprocessing as mp

from ast import literal_eval
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
# pd.set_option("display.max_colwidth", 1000000)

In [ ]:
# groups = dp.select_groups_for_analysis(name = 'Scoelicolor%Day6',
#                                        most_recent = True,
#                                        remove_empty = True,
#                                        include_list = [], exclude_list = ['Ref','pellet','_Del_'])#['QC','Blank'])

# files = []
# for g in groups:
#     for f in g.items:
#         files.append(f)
# sorted(list(pd.unique([f.name for f in files])))

In [ ]:
# CHECK TO SEE IF THERE IS A NO MSMS FILTER.  ADD MSMS REQUIRED FLAG TO def pk_checker(metatlas_dataset,atlas_df,compound_idx,params):
# Use Katie Murphy's as first analysis to go onto portal.  Do raw data too.
# Add cameron curries project

In [ ]:
# Go from dataframe to list of dictionaries
#pd.DataFrame(mzmine_things).T.to_dict().values()
# pd.DataFrame(mzmine_things).to_csv('/global/homes/b/bpb/Downloads/mzmine_tasks.csv',index=False)
df = pd.read_csv('/global/homes/b/bpb/Downloads/mzmine_tasks.csv')

# These columns hold Python literals serialized as text in the csv;
# parse them back into Python objects.
literal_cols = ['blank_str','file_filters','groups']
for col in literal_cols:
    df[col] = df[col].apply(literal_eval)

# Keep only the rows flagged for processing.
df = df[df.process==True]

# One dict per remaining row.  Materialize as a real list: later cells index
# into it and assign to its elements (mzmine_things[i][...] = ...), which a
# dict-values view does not support on Python 3.
mzmine_things = list(df.T.to_dict().values())
df.tail()

Create job scripts in the output folder and print the sbatch commands used to submit them


In [ ]:
# df = pd.read_csv('/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley/20160824_and_20170808_C18_MP_Solar_negative.sbatch')
# for d in df['#!/bin/sh']:
#     print(d)

In [ ]:
# Check on disk how far each task has progressed, record the result on the
# task dict itself, and show a summary table.
status = []
for params in mzmine_things:
    project_name = '%s_%s'%(params['basename'],params['polarity'])
    results_dir = os.path.join(params['basedir'],project_name)

    # see if final mzmine feature table exists.
    job_done_1 = os.path.isfile(os.path.join(params['basedir'],'intermediate_results','%s.csv'%project_name))
    # if mzmine workspace exists store this so it can be removed from job script for reruns
    # (params is the task dict itself, so no positional indexing into mzmine_things is needed)
    params['mzmine_done'] = job_done_1

    # see if sheets/peak_height.tab exists
    job_done_2 = False
    num_peaks = 0
    num_id_figures = 0
    peak_height_file = os.path.join(results_dir,'sheets','peak_height.tab')
    identification_folder = os.path.join(results_dir,'identification')
    if os.path.isfile(peak_height_file):
        with open(peak_height_file,'r') as fid:
            # number of newline characters in the file
            num_peaks = fid.read().count('\n')
        num_id_figures = len(glob.glob(os.path.join(identification_folder,'*.pdf')))
        # the task only counts as done when more than one identification pdf exists
        job_done_2 = num_id_figures > 1
    # if metatlas is done note this too, but these jobs shouldn't be rerun (since they are done)
    params['metatlas_done'] = job_done_2

    # see if the filtered mzmine workspace has been created
    job_done_3 = os.path.isfile(os.path.join(results_dir,'%s.mzmine'%project_name))
    # this won't change the job script but completes the book-keeping
    params['small_mzmine_done'] = job_done_3

    status.append({'0_basedir':params['basedir'].split('/')[-1],
                   '1_job':project_name,
                   '2_features_done':job_done_1,
                   '3_sheets__and_msms_done':job_done_2,
                   '4_workspace_done':job_done_3,
                   '5_num_features':num_peaks,
                   '6_num_id_figures':num_id_figures})
pd.DataFrame(status)

In [ ]:
# Keep only the tasks that still have at least one unfinished stage.
# (The final mzmine file can exist even if peak_height.tab is missing.)
print(len(mzmine_things))
stage_flags = ('metatlas_done', 'small_mzmine_done', 'mzmine_done')
new_mzmine_things = [task for task in mzmine_things
                     if not all(task[flag] for flag in stage_flags)]
mzmine_things = new_mzmine_things
print(len(mzmine_things))

In [ ]:
# Print (but do not run) one removal command per distinct basedir among the remaining tasks.
unique_basedirs = pd.unique([task['basedir'] for task in mzmine_things])
for basedir in unique_basedirs:
    print('rm -r %s'%basedir)

In [ ]:
# groups = ['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain']
mzm = reload(mzm)
# Attach the list of input files to every task and report how many were found.
for task in mzmine_things:
    task_files = mzm.get_files(task['groups'],task['filename_substring'],task['file_filters'],task['is_group'])
    task['files'] = task_files
    print(task['basename'],task['polarity'],len(task_files))

In [ ]:
mzm = reload(mzm)
for params in mzmine_things:
    #setup all the job scripts when job actually runs it will copy params to params-used
    job_script = mzm.create_job_script(params)

    #dump the parameters to a json file
    params_file = os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity']))
    with open(params_file, 'w') as fid:
        fid.write(json.dumps(params))

    # Tasks with more than 500 files are submitted with the "_denovo" variant
    # of the script.  Build the final name BEFORE printing: calling .replace()
    # on the result of print(...) fails when print is a function.
    if len(params['files'])>500:
        submit_script = job_script.replace('.sbatch','_denovo.sbatch')
    else:
        submit_script = job_script
    print('sbatch %s'%submit_script)

In [ ]:


In [ ]:
mzm = reload(mzm)
# The loop header below had a terminal transcript pasted into the middle of
# "mzmine_things", which made the cell a syntax error; it is restored here.
for i,params in enumerate(mzmine_things):
    #setup all the job scripts when job actually runs it will copy params to params-used
    job_script = mzm.create_job_script(params)
    #dump the parameters to a json file
    with open( os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity'])), 'w') as fid:
        fid.write(json.dumps(params))
    # print a shell command that sources the script's non-comment lines
    print('source <(grep "^[^#;]" %s)'%job_script)

Gather files for a specific mzmine task


In [ ]:
tar_str = 'tar -czvf pactolus_files.tar.gz '
# Select tasks by their own basedir instead of by df index labels: df was
# filtered without resetting its index and mzmine_things has been filtered
# again since, so df labels are not valid positions in mzmine_things.
for params in mzmine_things:
    if 'erbe' not in params['basedir']:
        continue
    for f in params['files']:
        print('cp %s ~/pactolus_files'%f.replace('.mzML','.pactolus.gz'))
#         tar_str = '%s %s'%(tar_str,f.replace('.mzML','.pactolus.gz'))
#         print(os.path.isfile(f.replace('.mzML','.pactolus.gz')))
# print tar_str

In [ ]:
%system cat /project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive.sbatch

In [ ]:
# [bpb@cori06 ~ 02:14 PM] ~ > salloc -N 1 -q interactive -C haswell -t 4:00:00 -L project
# salloc: Pending job allocation 10461042
# salloc: job 10461042 queued and waiting for resources
# salloc: job 10461042 has been allocated resources
# salloc: Granted job allocation 10461042
# salloc: Waiting for resource configuration
# salloc: Nodes nid00119 are ready for job
# [bpb@nid00119 ~ 02:14 PM] ~ > echo $HOSTNAME
# cori06

In [ ]:
# Reload the helper module before calling into it.
mzm = reload(mzm)

# NOTE(review): despite the variable name, this path ends in "logs/" and so
# looks like a directory rather than a json file -- confirm what
# clean_and_filter_mzmine_output expects.
json_file = "/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar/logs/"

mzm.clean_and_filter_mzmine_output(json_file)

In [ ]:
def pk_checker(metatlas_dataset,atlas_df,compound_idx,params):
    """
    Decide whether a compound has a valid peak in at least one file.

    A file is valid for the compound only when ALL of the following hold in
    that same file:
      * msms data is present (a non-empty 'mz' list under
        ['data']['msms']['data']),
      * rt_checker passes (measured peak within rt_timespan of the atlas peak),
      * min_checker passes (peak above min_intensity with a low enough
        minimum on both sides).

    metatlas_dataset: sequence with one entry per file; each entry is indexed
        by compound_idx.
    atlas_df, params: passed through unchanged to rt_checker and min_checker.

    Returns True as soon as one file satisfies all three conditions,
    otherwise False (including for an empty dataset).
    """
    for met_data in metatlas_dataset:
        try:
            has_msms = len(met_data[compound_idx]['data']['msms']['data']['mz'])>0
        except Exception:
            # missing or malformed msms entry counts as "no msms"
            has_msms = False
        # The three checks must be combined per file.  Combining the per-file
        # lists with "and" (list and list and list) just yields the last list,
        # which silently ignores the msms and rt checks.
        if (has_msms
                and rt_checker(met_data,atlas_df,compound_idx,params)
                and min_checker(met_data,atlas_df,compound_idx,params)):
            return True
    return False

def rt_checker(met_data,atlas_df,compound_idx,params):
    """
    Check that the measured peak is within rt_timespan of the atlas peak.

    met_data: per-file data indexed by compound_idx; the measured retention
        time is read from ['data']['ms1_summary']['rt_peak'].
    atlas_df: dataframe with an 'rt_peak' column, looked up with
        .loc[compound_idx, 'rt_peak'].
    params: dict providing 'rt_timespan'.

    Returns True when |measured - atlas| < params['rt_timespan'], and False
    otherwise, including when any of the values is missing or not comparable.
    """
    try:
        measured_rt = met_data[compound_idx]['data']['ms1_summary']['rt_peak']
        expected_rt = atlas_df.loc[compound_idx,'rt_peak']
        # bool() so both the success and the failure path return a plain bool
        return bool(abs(measured_rt - expected_rt) < params['rt_timespan'])
    except Exception:
        # best effort: missing data means "not valid"; unlike a bare except
        # this lets KeyboardInterrupt / SystemExit propagate
        return False

def min_checker(met_data,atlas_df,compound_idx,params):
    """
    Check that the measured peak stands out from the signal on both sides.

    Looks forward and backward by params['rt_timespan'] from the measured
    rt_peak (both windows exclude the peak itself) and requires that the
    peak height be greater than the minimum intensity in each window times
    params['peak_to_valley_ratio'].

    met_data: per-file data indexed by compound_idx; reads
        ['data']['ms1_summary'] ('rt_peak', 'peak_height') and
        ['data']['eic'] ('rt', 'intensity').
    atlas_df: not used; kept so the signature matches rt_checker.
    params: dict providing 'min_intensity', 'rt_timespan' and
        'peak_to_valley_ratio'.

    Returns False when rt_peak is null, when peak_height is below
    params['min_intensity'], when either window contains no data points,
    or when any required value is missing; otherwise True only if both
    windows pass.
    """
    try:
        summary = met_data[compound_idx]['data']['ms1_summary']
        measured_rt_peak = summary['rt_peak']
        peak_height = summary['peak_height']
        if pd.isnull(measured_rt_peak) or peak_height < params['min_intensity']:
            return False

        eic = met_data[compound_idx]['data']['eic']
        rt = np.asarray(eic['rt'])
        intensity = np.asarray(eic['intensity'])
        timespan = params['rt_timespan']
        ratio = params['peak_to_valley_ratio']

        forward_idx = (rt > measured_rt_peak) & (rt < (measured_rt_peak + timespan))
        backward_idx = (rt < measured_rt_peak) & (rt > (measured_rt_peak - timespan))
        # a side with no data points cannot demonstrate a valley
        if not (forward_idx.any() and backward_idx.any()):
            return False

        forward_pass = peak_height > (intensity[forward_idx].min() * ratio)
        backward_pass = peak_height > (intensity[backward_idx].min() * ratio)
        return bool(forward_pass and backward_pass)
    except Exception:
        # best effort: missing or malformed data means "not valid"; unlike a
        # bare except this lets KeyboardInterrupt / SystemExit propagate
        return False


def peak_height_df(metatlas_dataset,attribute='peak_height',zero_nans=True):
    """
    Turn a list of lists in a metatlas dataset into a
    peak height dataframe where rows are features
    and columns are samples

    metatlas_dataset is indexed [file][feature]; the per-file rows collected
    here are transposed before returning.

    Valid attributes are:'mz_centroid','mz_peak',
    'num_ms1_datapoints','peak_area','peak_height',
    'rt_centroid','rt_peak'

    Entries whose ['data']['ms1_summary'][attribute] cannot be read are
    stored as 0.  When zero_nans is True (the default), infs, nans, and
    nulls are also converted to zero.
    """
    rows = []
    for file_data in metatlas_dataset: #iterate over files
        row = []
        for feature_data in file_data: #iterate over features
            try:
                row.append(feature_data['data']['ms1_summary'][attribute])
            except Exception:
                # missing summary for this feature in this file
                row.append(0)
        rows.append(row)
    df = pd.DataFrame(rows).T
    if zero_nans:
        df[pd.isnull(df)] = 0
        # pd.isnull does not flag +/-inf, so zero those explicitly
        df = df.replace([np.inf, -np.inf], 0)
    return df

def peak_in_top_n(metatlas_dataset,n_peaks=1000,prior_boolean=None):
    """
    Flag the features whose maximum peak height across files ranks within
    the top n_peaks (rank 1 = highest; ties share the lowest rank).

    metatlas_dataset: passed to peak_height_df to obtain the
        features x files peak-height table.
    n_peaks: number of top-ranked features to keep.
    prior_boolean: optional sequence of booleans, one per feature.  When
        given, only features where it is True are ranked; all other
        features are returned as False.

    Returns a list of booleans, one per feature.
    """
    heights = peak_height_df(metatlas_dataset)
    if prior_boolean is not None:
        # explicit boolean row mask: only pre-selected features compete
        candidates = heights.loc[np.asarray(prior_boolean, dtype=bool)]
    else:
        candidates = heights
    ranks = candidates.max(axis=1).rank(method='min',ascending=False)
    # Features that were not ranked are filled with False directly.  This
    # avoids a chained in-place fillna on a column selection, which does not
    # update the parent frame when pandas copy-on-write is active.
    in_top = (ranks <= n_peaks).reindex(heights.index, fill_value=False)
    return in_top.tolist()

In [ ]:
# List the contents of the identification folder inside one task's results directory.
os.listdir('/global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive/identification/')

I'm testing this one: /global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive


In [ ]:
# json_filename = "/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley/20160824_and_20170808_C18_MP_Solar_negative_params.json"

# Run the filtering / export pipeline for one task, driven by its params json:
#   1. load the mzmine feature table and drop features found in the blanks
#   2. build an atlas and a group from the remaining features and their files
#   3. extract data, keep valid peaks (pk_checker) that are also in the top n
#   4. re-extract data for the filtered atlas and write sheets and figures
json_filename = '/global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_negative_params.json'

# number of top-ranked valid peaks kept by peak_in_top_n
n_top_peaks = 10

with open(json_filename) as data_file:
    params = json.load(data_file)
file_to_convert = os.path.join(params['basedir'],'%s_%s.csv'%(params['basename'],params['polarity']))
print('# Working on %s %s'%(params['basename'],params['polarity']))
if os.path.isfile(file_to_convert):
    # take the comprehensive csv from mzmine and make a peak-height only version of it
    df,original_mzmine = mzm.metatlas_formatted_atlas_from_mzmine_output(file_to_convert,params['polarity'],
                                                                             make_atlas=False,min_rt=0.55,
                                                                            remove_fragments=False,
                                                                            remove_adducts=False,
                                                                            remove_clusters=False,)

    #Filter features found in the blank
    df = mzm.clean_up_mzmine_dataframe(df)
    # label every column 'b' (blank) or 's' (sample) and take the max per label for each feature
    column_labels = ['b' if any([s in g.lower() for s in params['blank_str']]) else 's' for g in df.columns]
    df_blank_compare = df.transpose().groupby(column_labels).max().transpose()
    if 'b' in df_blank_compare.columns:
        df_features_not_in_blank = df_blank_compare[df_blank_compare['s'] > (params['sample_to_blank'] * df_blank_compare['b'])]
        print('# %d features total'%(df.shape[0]))
        print('# %d features removed by blank\n'%(df_blank_compare.shape[0] - df_features_not_in_blank.shape[0]))
    else:
        print('# No files have "blank" or "injbl" in their names.')
        df_features_not_in_blank = df_blank_compare
    # rebind instead of resetting in place: the frame may be a slice of df_blank_compare
    df_features_not_in_blank = df_features_not_in_blank.reset_index()
    print('#There are now %d features not in blank'%df_features_not_in_blank.shape[0])


    #Make an Atlas
    cids = []
    for j,row in df_features_not_in_blank.iterrows():
        my_mz_ref = metob.MzReference(mz=row.mz,mz_tolerance=row.mz_tolerance,detected_polarity=params['polarity'],lcms_run=None)
        my_rt_ref = metob.RtReference(rt_peak=row.rt_peak,rt_min=row.rt_min,rt_max=row.rt_max,lcms_run=None)
        my_id = metob.CompoundIdentification(rt_references=[my_rt_ref],mz_references=[my_mz_ref],name=row.label)
        cids.append(my_id)
    my_atlas = metob.Atlas(name='untargeted atlas',compound_identifications=cids)
    atlas_df = ma_data.make_atlas_df(my_atlas)

    #Make Groups
    all_files = [f.replace('Peak height','').replace('filtered','').strip() for f in df.columns if '.mzML' in f]
    metatlas_files = []
    for file_name in all_files:
        # keep the name and the query result apart so a miss can be reported
        # by name; an empty result is treated as "not found" instead of
        # raising IndexError on [-1]
        found_runs = metob.retrieve('Lcmsruns',name=file_name,username='*')
        if len(found_runs) > 0 and isinstance(found_runs[-1],type(metob.LcmsRun())):
            metatlas_files.append(found_runs[-1])
        else:
            print('%s NOT FOUND'%file_name)
            break
    groups = metob.Group(name='untargeted group',items=metatlas_files)

    #Get Data
    all_files = []
    for my_file in groups.items:
        all_files.append((my_file,groups,atlas_df,my_atlas))
    pool = mp.Pool(processes=min(12, len(all_files)))
    try:
        metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
    finally:
        # release the worker processes even if the extraction fails
        pool.close()
        pool.terminate()

    # remove peaks that aren't valid.  See pk_checker for details on validity.
    num_features = len(metatlas_dataset[0])
    num_files = len(metatlas_dataset)

    valid_peaks = [pk_checker(metatlas_dataset,atlas_df,compound_idx,params) for compound_idx in range(num_features)]

    valid_peaks = peak_in_top_n(metatlas_dataset,n_peaks=n_top_peaks,prior_boolean=valid_peaks)

    df_filtered_peaks_atlas = atlas_df.loc[valid_peaks]
    df_filtered_peaks_atlas[['mz','rt_peak','label']].sort_values('rt_peak').to_csv(file_to_convert.replace('.csv','') + '_formatted_peakfiltered.csv',index=False)
    print('# There are now %d filtered peaks'%df_filtered_peaks_atlas.shape[0])

    # make a filtered atlas, get data, and make_plots
    filtered_ids = []
    for i,valid in enumerate(valid_peaks):
        if valid:
            filtered_ids.append(my_atlas.compound_identifications[i])
    my_atlas.compound_identifications = filtered_ids
    atlas_df = ma_data.make_atlas_df(my_atlas)
    #Get Data
    all_files = []
    for my_file in groups.items:
        all_files.append((my_file,groups,atlas_df,my_atlas))
    pool = mp.Pool(processes=min(12, len(all_files)))
    try:
        metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
    finally:
        pool.close()
        pool.terminate()

    output_dir = os.path.join(params['basedir'],'%s_%s'%(params['basename'],params['polarity']))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    atlas_identifications = dp.export_atlas_to_spreadsheet(my_atlas,os.path.join(output_dir,'atlas_export.csv'))
    # atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,'%s/sheets/%s.csv'%(plot_location_label,myAtlas.name))
    # NOTE(review): my_file is the last run left over from the loop above, not a
    # file name, and the peak_height call passes '' instead -- presumably
    # input_fname is ignored when input_dataset is given; confirm against dill2plots.
    peak_height = dp.make_output_dataframe(input_fname = '',input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_height' , output_loc=os.path.join(output_dir,'sheets'))
    peak_area = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_area' , output_loc=os.path.join(output_dir,'sheets'))
    mz_peak = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_peak' , output_loc=os.path.join(output_dir,'sheets'))
    rt_peak = dp.make_output_dataframe(input_fname = my_file, input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [],fieldname='rt_peak' , output_loc=os.path.join(output_dir,'sheets'))
    mz_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_centroid' , output_loc=os.path.join(output_dir,'sheets'))
    rt_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='rt_centroid' , output_loc=os.path.join(output_dir,'sheets'))
    # list.append returns None and mutates params['blank_str'] in place, so
    # build a new list to exclude the blanks plus the QC runs
    dp.make_identification_figure_v2(input_dataset = metatlas_dataset, input_fname = my_file, include_lcmsruns = [],exclude_lcmsruns = params['blank_str'] + ['QC'], output_loc=os.path.join(output_dir,'identification'))
else:
    # make the skip visible instead of silently doing nothing
    print('# %s not found; nothing to do'%file_to_convert)

In [ ]:
# Scratch cell: a trivial expression (presumably a quick check that the kernel responds).
1+1

In [ ]:
# mzmine_things = [
#     #potato
# #     raw data folder:
# #     /global/project/projectdirs/metatlas/raw_data/erbilgin/20161026_OE_Turnbaughswtpotato2
# #     filename structure:
# #     20161026_SK-OE_Turnbaugh_SwtPotato2_QE144_C18-USDAY32305_<pol>_<sample_type>_<??>_<??>_<??>
# #     sample_type codes:
# #     InjBl-MeOH = injection blank
# #     QC-SK160922-sopv2 = qc
# #     if it has "B" in it = extraction blank
# #     if it has a R or C = raw or cooked. The number is the bio replicate.
# #     Ignore all sample_type codes that have "H" in them; only keep the ones that have "S" in them
#                 {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20161026_OE_Turnbaughswtpotato2', #output location
#                   'basename':'20161026_OE_Turnbaughswtpotato2_raw-v-cooked', #output file prefix string
#                  'groups':['20161026_OE_Turnbaughswtpotato2'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['_b','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                 'min_peak_duration' : 0.025,
#                 'max_peak_duration' : 30.0,
#                 'rt_tol_perfile' : 0.015,
#                 'rt_tol_multifile' : 0.15,
#                 'min_peak_height': 1e6,
#                 'noise_floor': 3e4,
#                 'mz_tolerance': 10.0,
#                 'min_sn_ratio': 2.0,
#                 'min_rt':1.1,
#                 'max_rt':100,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG','_ITSD','_H_'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20161026_OE_Turnbaughswtpotato2', #output location
#                   'basename':'20161026_OE_Turnbaughswtpotato2_raw-v-cooked', #output file prefix string
#                  'groups':['20161026_OE_Turnbaughswtpotato2'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['_b','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                  'min_peak_duration' : 0.025,
#                 'max_peak_duration' : 30.0,
#                 'rt_tol_perfile' : 0.015,
#                 'rt_tol_multifile' : 0.15,
#                 'min_peak_height': 1e6,
#                 'noise_floor': 3e4,
#                 'mz_tolerance': 10.0,
#                 'min_sn_ratio': 2.0,
#                 'min_rt':1.1,
#                 'max_rt':100,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_POS','_ITSD','_H_'] #files containing these strings will be excluded
#                  },
    
# #     20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18
#                     {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_POS'] #files containing these strings will be excluded
#                  },


#                         {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_POS'] #files containing these strings will be excluded
#                  },
    
#                         {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
#                   'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL', #output file prefix string
#                  'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e5,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_POS'] #files containing these strings will be excluded
#                  },
#     #P simiae mutant lb dataset
# #     20170217_KBL_C18_RexMalm_Psimiae
#                 {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170217_KBL_C18_RexMalm_Psimiae', #output location
#                   'basename':'20170217_KBL_C18_RexMalm_Psimiae_supernatant', #output file prefix string
#                  'groups':['20170217_KBL_C18_RexMalm_Psimiae'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG','_ITSD','_pellet_'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170217_KBL_C18_RexMalm_Psimiae', #output location
#                   'basename':'20170217_KBL_C18_RexMalm_Psimiae_supernatant', #output file prefix string
#                  'groups':['20170217_KBL_C18_RexMalm_Psimiae'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_POS','_ITSD','_pellet_'] #files containing these strings will be excluded
#                  },
    
#                 #LeBoldus
#                 {'polarity':'positive', #polarity of this analysis
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar', #output location
#                   'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl', #output file prefix string
#                  'groups':['20170501_KBL_C18_15min_JL-KD_PopFun'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',#files must contain this substring % is the wildcard
#                   'file_filters':['_QC','_NEG','_ITSD'] #files containing these strings will be excluded
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
#                   'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
#                  'groups':['20170501_KBL_C18_15min_JL-KD_PopFun'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
#                   'basename':'20170503_KBL_ZHILIC_JL_PoplarFungus',
#                  'groups':['20170503_KBL_ZHILIC_JL_PoplarFungus'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
#                   'basename':'20170503_KBL_ZHILIC_JL_PoplarFungus',
#                  'groups':['20170503_KBL_ZHILIC_JL_PoplarFungus'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
    
#                 #BRAUNI
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
#                   'basename':'20170919_KBL_C18_TD-DB_BrauniiLipids',
#                  'groups':['20170919_KBL_C18_TD-DB_BrauniiLipids'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
#                   'basename':'20170919_KBL_C18_TD-DB_BrauniiLipids',
#                  'groups':['20170919_KBL_C18_TD-DB_BrauniiLipids'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
#                   'basename':'20170914_KBL_C18Lipid_TD-DB_Braunii',
#                  'groups':['20170914_KBL_C18Lipid_TD-DB_Braunii'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
#                   'basename':'20170914_KBL_C18Lipid_TD-DB_Braunii',
#                  'groups':['20170914_KBL_C18Lipid_TD-DB_Braunii'],
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
    
#                 #Zerbe-Murphy
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
#                   'basename':'20170728_KBL_C18_PZ-KM_RootFnl',
#                  'groups':['20170728_KBL_C18_PZ-KM_RootFnl'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
#                   'basename':'20170728_KBL_C18_PZ-KM_RootFnl',
#                  'groups':['20170728_KBL_C18_PZ-KM_RootFnl'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
#                   'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
#                  'groups':['20170731_KBL_C18LipidV7_PZ-KM_RootFnl'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
#                   'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
#                  'groups':['20170731_KBL_C18LipidV7_PZ-KM_RootFnl'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
    
#                 #RIPP project
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
#                   'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_media',
#                  'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD','pellet']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
#                   'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_media',
#                  'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD','pellet']
#                  },
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
#                   'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_pellet',
#                  'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD','media']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
#                   'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_pellet',
#                  'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD','media']
#                  },


#                 #Solar Panels Spain vs Berkeley
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
#                   'basename':'20160824_and_20170808_C18_MP_Solar',
#                  'groups':['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
#                   'basename':'20160824_and_20170808_C18_MP_Solar',
#                  'groups':['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
#                   'basename':'20160824_and_20170808_C18lipid_MP_Solar',
#                  'groups':['20160824_KBL_C18Lipid_SolarPanel_MP','20170808_KBL_C18lipid_MP_Solar_Spain'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
#                   'basename':'20160824_and_20170808_C18lipid_MP_Solar',
#                  'groups':['20160824_KBL_C18Lipid_SolarPanel_MP','20170808_KBL_C18lipid_MP_Solar_Spain'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },
    
#                 #P simiae plate flip corrected
#                 {'polarity':'positive',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/P_Simiae_plate_flip',
#                   'basename':'20170428_KBL_C18_Psimiae_DiffSubs',
#                  'groups':['20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_NEG','_ITSD']
#                  },
#                 {'polarity':'negative',
#                   'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/P_Simiae_plate_flip',
#                   'basename':'20170428_KBL_C18_Psimiae_DiffSubs',
#                  'groups':['20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'], #list of folders containg datafiles
#                  'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
#                  'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
#                  'sample_to_blank':3,#cutoff for excluding features in the blank
#                  'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
#                  'peak_to_valley_ratio':3,
#                  'min_intensity':1e6,
#                   'filename_substring':'%',
#                   'file_filters':['_QC','_POS','_ITSD']
#                  },

# ]

The job scripts above create an mzmine workspace and produce outputs that are filtered and have improved peak shapes.


In [ ]:
%%time
# Store my_atlas through metatlas_objects (metob.store); the %%time cell magic above reports the run time.
# NOTE(review): my_atlas is not assigned in the visible cells (the atlas built further down is
# named myAtlas) -- confirm which object is meant before running this on a fresh kernel.
metob.store([my_atlas])

In [ ]:
# Re-import the dill2plots helper module so that edits made to it on disk are picked up
# (reload is the Python 2 builtin).
dp = reload(dp)
%matplotlib notebook
# Call dp.adjust_rt_for_selected_compound on metatlas_dataset with compound_idx=213 and keep its
# return value in `a`.
# NOTE(review): presumably this opens the interactive RT-adjustment figure -- confirm in dill2plots.
a = dp.adjust_rt_for_selected_compound(metatlas_dataset,compound_idx=213,include_lcmsruns = [],alpha=0.75,width=10,height=6)

In [ ]:
# Output folder used by the following cell; it is created (including parents) only when nothing
# exists at that path yet.
output_dir = '/global/homes/b/bpb/Downloads/test_code_mzmine/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [ ]:
# Call dp.make_identification_figure_v2 on metatlas_dataset. No runs are explicitly included;
# the exclude list carries the blank / QC name fragments. Output goes under <output_dir>/identification.
dp.make_identification_figure_v2(
    input_dataset=metatlas_dataset,
    input_fname=my_file,
    include_lcmsruns=[],
    exclude_lcmsruns=['InjBl', 'QC', 'Blank', 'blank'],
    output_loc=os.path.join(output_dir, 'identification'),
)

In [ ]:
# Dataset dimensions: the outer list has one entry per file, and the first file's entry has one
# element per feature.
n_files = len(metatlas_dataset)
n_features = len(metatlas_dataset[0])

# One independent list of per-feature flags for each filter; every flag starts out False.
has_msms, not_in_blank, good_peak_shape, top_1000 = (
    [False] * n_features for _ in range(4)
)

In [ ]:
# Print the MS1 summary peak height stored for every entry of every file in the dataset.
for file_entries in metatlas_dataset:
    for feature_entry in file_entries:
        ms1_summary = feature_entry['data']['ms1_summary']
        print(ms1_summary['peak_height'])

In [ ]:
# Read unique_id from my_mz_ref.lcms_run; when the attribute is absent the fallback is the
# string 'None' (not the None object).
getattr(my_mz_ref.lcms_run,'unique_id','None')

In [ ]:
# Post-process the mzmine csv of every task in mzmine_things:
#   1. write a '<name>_formatted.csv' table,
#   2. drop features that are not clearly above the blanks,
#   3. keep at most `max_features` of the most intense remaining features,
#   4. write the result to '<name>_formatted_blankremoved.csv'.
# Tasks whose csv does not exist yet get their job command printed again instead.

# Column-name fragments (compared in lowercase) that mark an injection as a blank.
blank_markers = ('blank', 'injbl')
# A feature is kept only if its strongest sample exceeds this multiple of its strongest blank.
sample_to_blank_ratio = 3
# Upper bound on the number of features written to the blank-removed table.
max_features = 1000

for m in mzmine_things:
    file_to_convert = os.path.join(m['basedir'], '%s_%s.csv' % (m['basename'], m['polarity']))
    if not os.path.isfile(file_to_convert):
        # No csv at the expected path: call make_task_and_job and print the command it returns.
        print('#REDO:')
        job_cmd = make_task_and_job(m['basedir'], m['basename'], m['polarity'], m['files'])
        print('%s' % job_cmd)
        continue

    output_stem = file_to_convert.replace('.csv', '')

    # take the comprehensive csv from mzmine and make a peak-height only version of it
    df, original_mzmine = mzm.metatlas_formatted_atlas_from_mzmine_output(
        file_to_convert, m['polarity'],
        make_atlas=False, min_rt=0.55,
        remove_fragments=False,
        remove_adducts=False,
        remove_clusters=False)
    df.to_csv(output_stem + '_formatted.csv', index=True)

    # filter the peaks by those found in the blank at high intensity.
    df = clean_up_mzmine_dataframe(df)
    # Label each column 'b' (blank) or 's' (sample). Every marker is tested on its own: the
    # previous `(('blank') or ('injbl')) in name` reduced to `'blank' in name`, so columns that
    # only contained 'injbl' were counted as samples.
    column_kinds = ['b' if any(marker in g.lower() for marker in blank_markers) else 's'
                    for g in df.columns]
    # Per feature: the maximum over all blank columns and the maximum over all sample columns.
    df_blank_compare = df.transpose().groupby(column_kinds).max().transpose()
    print('%s_%s' % (m['basename'], m['polarity']))
    if 'b' in df_blank_compare.columns:
        above_blank = df_blank_compare['s'] > (sample_to_blank_ratio * df_blank_compare['b'])
        keep_features_not_in_blank = df_blank_compare[above_blank]
        print('%d features total' % (df.shape[0]))
        print('%d features removed by blank\n' % (df_blank_compare.shape[0] - keep_features_not_in_blank.shape[0]))
    else:
        print('No files have "blank" or "injbl" in their names.')
        keep_features_not_in_blank = df_blank_compare

    # Rank by sample intensity in DESCENDING order so that the cap keeps the most intense
    # features; the default ascending rank kept the weakest ones instead.
    # NOTE(review): assumes the intent is "top N by intensity" -- confirm.
    intensity_rank = keep_features_not_in_blank['s'].rank(method='max', ascending=False)
    keep_features_not_more_than_1000 = keep_features_not_in_blank[intensity_rank <= max_features]
    df_blank_filtered = df.loc[keep_features_not_more_than_1000.index]
    df_blank_filtered.to_csv(output_stem + '_formatted_blankremoved.csv', index=True)

In [ ]:


In [ ]:
# Preview of the blank-filtered table: take its first rows and turn the index back into columns.
# (An earlier variant additionally reindexed to the columns
#  ['mz','label','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity','adduct_assignments'].)
temp = df_blank_filtered.head()
temp = temp.reset_index()
temp.head()

In [ ]:
from shutil import copyfile

In [ ]:
# Create the destination folder for the renamed copies via the shell; this is the same path that
# the copy cell further down assigns to new_dir.
%system mkdir /global/project/projectdirs/metatlas/raw_data/bpb/20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected

In [ ]:
# Scratch arithmetic (162 * 2 = 324).
# NOTE(review): presumably the expected number of files for the directory listing in the next
# cell -- the notebook does not say; confirm or delete.
162*2

In [ ]:
# Count the lines that `ls -l` prints for new_dir.
# NOTE(review): new_dir is only assigned in a later cell, so this cell fails on a fresh kernel
# unless that cell has been run first.
%system ls -l $new_dir | wc -l

In [ ]:
# Copy every file listed in the plate-flip spreadsheet ('Old file name' column) into new_dir
# under the basename given in its 'New file name' column.
flip_file = '/global/homes/b/bpb/Downloads/20170428_KBL_RM_C18_Psimiae_PlateFlip_NewFileNames_2 (2).xlsx'
new_dir = '/global/project/projectdirs/metatlas/raw_data/bpb/20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'

# Make sure the destination exists so the copy does not depend on a separate mkdir cell.
if not os.path.exists(new_dir):
    os.makedirs(new_dir)

df = pd.read_excel(flip_file)
missing_sources = []
for i, row in df.iterrows():
    old_loc = row['Old file name']
    source_exists = os.path.isfile(old_loc)
    print(os.path.basename(row['New file name']), source_exists)
    if not source_exists:
        # copyfile would raise here and abort the remaining copies; record the file and go on.
        missing_sources.append(old_loc)
        continue
    new_loc = os.path.join(new_dir, os.path.basename(row['New file name']))
    copyfile(old_loc, new_loc)

# Report skipped files explicitly so that an incomplete copy cannot go unnoticed.
if missing_sources:
    print('%d source file(s) were missing and were NOT copied:' % len(missing_sources))
    for old_loc in missing_sources:
        print(old_loc)

In [ ]:
# df_super = df[[c for c in df.columns if not 'super' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'kana' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'max_intensity' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'Excontrol' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'adduct' in c]]

# df_super.set_index(['label','mz','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity'],inplace=True)
# df_super.columns = [''.join(c.split('_')[2:3]) if 'pellet' in c else c for c in df_super.columns ]

In [ ]:
# Build df_super from df:
#   * drop every column whose name contains one of the excluded fragments,
#   * recompute rt_min as rt_peak - 0.2,
#   * move the feature descriptors into the index,
#   * rename each remaining column containing 'super' to the third '_'-separated token of its name.
excluded_fragments = ('pellet', 'kana', 'max_intensity', 'Excontrol', 'adduct')
keep_column = [not any(fragment in c for fragment in excluded_fragments) for c in df.columns]

# A boolean mask selects each column exactly once, and .copy() gives an independent frame so the
# rt_min assignment below does not write into a slice of df (SettingWithCopyWarning).
df_super = df.loc[:, keep_column].copy()
df_super['rt_min'] = df_super['rt_peak'] - 0.2
df_super = df_super.set_index(
    ['label', 'mz', 'mz_tolerance', 'rt_peak', 'rt_min', 'rt_max', 'inchi_key', 'detected_polarity'])
df_super.columns = [''.join(c.split('_')[2:3]) if 'super' in c else c for c in df_super.columns]

In [ ]:
import numpy as np
from scipy.stats import ttest_ind

# Compare the columns labelled 'LB' (group1) with the columns labelled 'PS' (group2), feature by
# feature.
#
# The column labels are not unique (every column of a group carries the same label), so the groups
# are selected with boolean masks. Selecting with a list of k repeated labels returns each of the
# k matching columns k times, which inflates the sample size seen by the t-test.
group1 = df_super.loc[:, np.asarray(df_super.columns == 'LB')]
group2 = df_super.loc[:, np.asarray(df_super.columns == 'PS')]

# Transposed so that the test runs along the first axis (one column per feature after .T).
t, p = ttest_ind(group1.T, group2.T)

stats_df = pd.DataFrame(index=df_super.index)
stats_df['p_value'] = p
stats_df['t_score'] = t
# Fold change of the lowest group2 value over the highest group1 value, on log2(x + 1) values.
stats_df['log2_fold_change'] = np.log2(group2.min(axis=1) + 1) - np.log2(group1.max(axis=1) + 1)
stats_df['in_control'] = group1.median(axis=1) > 1e5
stats_df['median_control'] = group1.median(axis=1)
stats_df['median_treatment'] = group2.median(axis=1)
stats_df.head()

In [ ]:
# Histogram (100 bins) of the log2 fold changes, with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(8, 7))
stats_df['log2_fold_change'].hist(bins=100, ax=ax)
ax.set_yscale('log')
ax.set_ylabel('#features')
ax.set_xlabel('LOG2 Fold Change')
plt.show()

In [ ]:
# p-value against log2 fold change, one dot per feature, with a logarithmic p-value axis.
fig, ax = plt.subplots(figsize=(8, 7))
fold_changes = stats_df['log2_fold_change']
p_values = stats_df['p_value']
ax.plot(fold_changes, p_values, '.')
ax.set_yscale('log')
ax.set_ylabel('p_value')
ax.set_xlabel('LOG2 fold change')
plt.show()

In [ ]:
# Keep features with log2 fold change > 7 and a group2 median above 1e5, then draw them in the
# mz / rt plane coloured by fold change.
strong_increase = stats_df['log2_fold_change'] > 7
abundant_in_treatment = stats_df['median_treatment'] > 1e5
filtered_df = stats_df[strong_increase & abundant_in_treatment]

fig, ax = plt.subplots(figsize=(8, 7))
feature_mz = filtered_df.index.get_level_values('mz')
feature_rt = filtered_df.index.get_level_values('rt_peak')
sc = ax.scatter(feature_mz, feature_rt, c=filtered_df.log2_fold_change)
plt.colorbar(sc)
ax.set_xlabel('mz')
ax.set_ylabel('rt')

In [ ]:
# Same fold-change filter as the scatter cell, but with a stricter group2 median cut-off (3e5).
# An additional condition that was tried and switched off:  (~stats_df['in_control']) &
filtered_df = stats_df[(stats_df['log2_fold_change'] > 7) & (stats_df['median_treatment'] > 3e5)]

# print(...) with a single argument prints the same text on Python 2 and also parses on Python 3,
# matching the call form already used by other cells in this notebook.
print(filtered_df.shape)

# Show the 20 features with the largest fold change.
filtered_df.sort_values('log2_fold_change', ascending=False).head(20)

In [ ]:
# Show the df_super rows whose 'label' index level equals '754.3994@3.75'.
label_matches = df_super.index.get_level_values('label') == '754.3994@3.75'
df_super.iloc[label_matches]

In [ ]:
# Display the first rows of the filtered statistics table.
filtered_df.head()

In [ ]:
# Turn the MultiIndex of filtered_df into a plain DataFrame: one row per selected feature, one
# column per index level (named after filtered_df.index.names).
#
# The frame is built in a single call from the index tuples instead of being filled one cell at a
# time through df.loc[i, c], which re-grows the frame on every new row and relied on
# Index.get_values() (removed from pandas in 1.0).
df = pd.DataFrame(list(filtered_df.index), columns=filtered_df.index.names)

In [ ]:


In [ ]:
# Create an atlas named '20170406_ions_made_psimiae_in_LB_pos_c18' from the dataframe `df`
# (store=True is passed to the helper), then derive its dataframe representation.
myAtlas = dp.make_atlas_from_spreadsheet(
    df,
    '20170406_ions_made_psimiae_in_LB_pos_c18',
    filetype='dataframe',
    sheetname='',
    polarity='positive',
    store=True,
    mz_tolerance=10,
)
atlas_df = ma_data.make_atlas_df(myAtlas)

In [ ]:
# Look up the groups to analyse by name pattern through dp.select_groups_for_analysis; both the
# include and the exclude list are empty here.
# Values left commented out in the original call: 'Strain=SB214' and ['QC','Blank'].
groups = dp.select_groups_for_analysis(
    name='%rexmalm_pos_super%',
    most_recent=True,
    remove_empty=True,
    include_list=[],
    exclude_list=[],
)

In [ ]:
# List the name of every item in every selected group.
# print(...) with a single argument prints the same text on Python 2 and also parses on Python 3.
for g in groups:
    for i in g.items:
        print(i.name)

In [ ]:
# Build one work item per (file, group) pair; each item also carries the atlas dataframe and the
# atlas object, because pool.map hands exactly one argument to the worker function.
# (Explicit loops are kept on purpose: my_file / my_group stay defined afterwards.)
all_files = []
for my_group in groups:
    for my_file in my_group.items:
        all_files.append((my_file, my_group, atlas_df, myAtlas))

# At most 10 workers, never more than there are items, and never fewer than 1
# (mp.Pool raises ValueError for processes=0, i.e. when no files were selected).
pool = mp.Pool(processes=max(1, min(10, len(all_files))))
try:
    metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
    pool.close()
finally:
    # Runs on success and on failure alike, so a crash inside map no longer leaves worker
    # processes behind. terminate() after a completed map only stops idle workers.
    pool.terminate()
    pool.join()

In [ ]:
# Create the output folder if needed, export the atlas next to it, and build
# the peak_height table from the extracted dataset.
output_dir = '/global/homes/b/bpb/Downloads/psim_c18_important_pos_super/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

atlas_export_path = os.path.join(output_dir, 'atlas_export.csv')
atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas, atlas_export_path)
# dp = reload(dp)
# atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,'%s/sheets/%s.csv'%(plot_location_label,myAtlas.name))

sheets_dir = os.path.join(output_dir, 'sheets')
peak_height = dp.make_output_dataframe(
    input_fname='',
    input_dataset=metatlas_dataset,
    include_lcmsruns=[],
    exclude_lcmsruns=[],
    fieldname='peak_height',
    output_loc=sheets_dir,
)
# peak_area = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_area' , output_loc=os.path.join(output_dir,'sheets'))
# mz_peak = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_peak' , output_loc=os.path.join(output_dir,'sheets'))
# rt_peak = dp.make_output_dataframe(input_fname = my_file, input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [],fieldname='rt_peak' , output_loc=os.path.join(output_dir,'sheets'))
# mz_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_centroid' , output_loc=os.path.join(output_dir,'sheets'))
# rt_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='rt_centroid' , output_loc=os.path.join(output_dir,'sheets'))

In [ ]:
# Preview the first rows of the peak-height table.
peak_height.head()

In [ ]:
# Dimensions (rows, columns) of the peak-height table.
peak_height.shape

In [ ]:
# from scipy import cluster
# mat = peak_height.as_matrix()
# mat[np.isnan(mat)] = np.nanmin(mat)
# labels = cluster.hierarchy.fclusterdata(mat,0.2,criterion='distance',method='average',metric='correlation')
# results = pd.DataFrame(data=labels, columns=['cluster'], index=peak_height.index)

In [ ]:
# Shorten every row label to its first two '_'-separated fields joined by '@',
# with each 'p' turned into '.'.
# NOTE(review): presumably the fields are m/z and retention time written with
# 'p' as the decimal point - confirm against the exported label format.
short_labels = []
for c in peak_height.index:
    leading_fields = c.split('_')[:2]
    short_labels.append('@'.join(leading_fields).replace('p', '.'))
peak_height.index = short_labels

In [ ]:
# Discard the outermost level of the column index.
# NOTE(review): not idempotent - re-running this cell drops another level.
peak_height.columns = peak_height.columns.droplevel(level=0)

In [ ]:
# Shorten every column name to its '_'-separated fields 14 through 17.
# NOTE(review): the 14:18 slice assumes a fixed file-naming layout, and two
# files that share these four fields end up with the same column name -
# confirm the resulting names are unique.
short_names = []
for c in peak_height.columns:
    name_fields = c.split('_')
    short_names.append('_'.join(name_fields[14:18]))
peak_height.columns = short_names

In [ ]:
# Replace missing peak heights with 0.
peak_height = peak_height.fillna(value=0)

In [ ]:
# norm_peak_height = peak_height.copy()
# norm_peak_height = norm_peak_height.apply(lambda x: x / x.max())
# norm_peak_height.head()

In [ ]:
# Dimensions (rows, columns) of the peak-height table at this point.
peak_height.shape

In [ ]:
# norm_peak_height.columns.sort_values().shape

In [ ]:
# c

In [ ]:
# Scaled copy of peak_height: columns in sorted order, every row divided by
# its own maximum so that each row peaks at 1.
# Done with whole-frame operations instead of copying column by column and
# re-assigning row by row through .loc.
sf = peak_height.sort_index(axis=1)
row_max = sf.max(axis=1)
# A row whose maximum is 0 would become 0/0 = NaN; divide such rows by 1
# instead so they stay unchanged rather than turning into NaN.
safe_row_max = row_max.where(row_max != 0, 1)
sf = sf.div(safe_row_max, axis=0)

In [ ]:
# Preview the first rows of the row-scaled table.
sf.head()

In [ ]:
# Print every colormap name returned by plt.colormaps() next to its position
# in that list.
cm = plt.colormaps()

for i, c in enumerate(cm):
    print('%s %s' % (i, c))

In [ ]:
# Print the maximum of every row of the scaled table.
for i, r in sf.iterrows():
    row_peak = r.max()
    print(row_peak)

In [ ]:
# The returned object has a savefig method that should be used if you want to save the figure object without clipping the dendrograms.

# To access the reordered row indices, use: clustergrid.dendrogram_row.reordered_ind

# Column indices, use: clustergrid.dendrogram_col.reordered_ind

g.dendrogram_row.reordered_ind

In [ ]:
import seaborn as sns
cmap = cm[148]
# cmap = sns.light_palette(as_cmap=True)
# cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
# g  =sns.clustermap(peak_height.apply(lambda x: (x+1)**0.25),metric='correlation',figsize=(25,25),cmap=cmap)
# g  =sns.clustermap(peak_height.apply(lambda x: np.log10(x+1)),metric='correlation',figsize=(25,25))
g  =sns.clustermap(sf,metric='euclidean',figsize=(30,18),cmap=cmap,col_cluster=True)
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # For y axis
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
plt.show()
# # http://seaborn.pydata.org/generated/seaborn.clustermap.html

In [ ]:
# from scipy import stats
# zscore = lambda x: (x - stats.nanmean(x)) / stats.nanstd(x)

In [ ]:
# fig, ax = plt.subplots(2,3,figsize=(20,30))
# for i in range(6):
#     plt.subplot(2,3,i+1)
#     row = peak_height.iloc[[i]]
#     c = 0
#     vals = []
#     for k,v in row.to_dict().items():
#         vals.append(v.items()[0][-1])
#     v = np.asarray(vals)
# #     v = v - np.nanmin(v)
# #     v = v / np.nanmax(v)
#     v[np.isnan(v)] = np.nanmin(v)
#     plt.plot(v,'.-')
#     plt.title(myAtlas.compound_identifications[i].name)
#     plt.gca().set_yscale('log')
# #     ax.set_yscale('log')
# plt.tight_layout()
# # for i,row in peak_height.iterrows():

    
# # peak_height.iloc[0,]

In [ ]: