In [ ]:
%matplotlib notebook
import sys, os
import glob
import json
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')
# sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
import metatlas.metatlas_objects as metob
from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import multiprocessing as mp
from ast import literal_eval
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
# pd.set_option("display.max_colwidth", 1000000)
In [ ]:
# groups = dp.select_groups_for_analysis(name = 'Scoelicolor%Day6',
# most_recent = True,
# remove_empty = True,
# include_list = [], exclude_list = ['Ref','pellet','_Del_'])#['QC','Blank'])
# files = []
# for g in groups:
# for f in g.items:
# files.append(f)
# sorted(list(pd.unique([f.name for f in files])))
In [ ]:
# CHECK TO SEE IF THERE IS A NO-MSMS FILTER. ADD AN MSMS-REQUIRED FLAG TO def pk_checker(metatlas_dataset,atlas_df,compound_idx,params): (a hypothetical sketch of such a flag follows this cell)
# Use Katie Murphy's as the first analysis to go onto the portal. Do raw data too.
# Add Cameron Currie's project
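One possible shape for that flag, sketched here as an assumption only: an 'msms_required' entry in params (hypothetical key, not part of the current pk_checker) that decides whether the has_msms check is enforced when the per-file checks are combined.
In [ ]:
# hypothetical sketch of an msms-required flag; the flag name and this helper are assumptions
def combine_checks(has_msms, rt_valid, minmax_valid, msms_required=True):
    """Combine the per-file checks; all must hold together in at least one file."""
    if msms_required:
        per_file_ok = [h and r and m for h, r, m in zip(has_msms, rt_valid, minmax_valid)]
    else:
        per_file_ok = [r and m for r, m in zip(rt_valid, minmax_valid)]
    return any(per_file_ok)

# toy values: msms is missing in every file, so the feature only passes when the flag is off
print(combine_checks([False, False], [True, True], [True, False], msms_required=True))   # False
print(combine_checks([False, False], [True, True], [True, False], msms_required=False))  # True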
In [ ]:
# Go from dataframe to list of dictionaries
#pd.DataFrame(mzmine_things).T.to_dict().values()
# pd.DataFrame(mzmine_things).to_csv('/global/homes/b/bpb/Downloads/mzmine_tasks.csv',index=False)
df = pd.read_csv('/global/homes/b/bpb/Downloads/mzmine_tasks.csv')
literal_cols = ['blank_str','file_filters','groups']
for col in literal_cols:
df[col] = df[col].apply(literal_eval)
df = df[df.process==True]
mzmine_things = df.T.to_dict().values()
df.tail()
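A minimal round-trip sketch (throwaway file name and made-up values) of why literal_eval is needed above: list-valued columns such as groups come back from read_csv as their string repr rather than as Python lists.
In [ ]:
# toy illustration of the CSV round trip used above; example_tasks.csv is a scratch file
toy_task = pd.DataFrame([{'basename': 'example_project', 'polarity': 'positive',
                          'groups': ['example_group'], 'process': True}])
toy_task.to_csv('example_tasks.csv', index=False)
back = pd.read_csv('example_tasks.csv')
print(type(back.loc[0, 'groups']))                        # str, e.g. "['example_group']"
back['groups'] = back['groups'].apply(literal_eval)
print(type(back.loc[0, 'groups']))                        # list again
print(back[back.process == True].T.to_dict().values())    # list of task dicts, like mzmine_things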
In [ ]:
# df = pd.read_csv('/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley/20160824_and_20170808_C18_MP_Solar_negative.sbatch')
# for d in df['#!/bin/sh']:
# print(d)
In [ ]:
import glob as glob
status = []
for i,params in enumerate(mzmine_things):
project_name = '%s_%s'%(params['basename'],params['polarity'])
results_dir = os.path.join(params['basedir'],project_name)
# see if final mzmine feature table exists.
job_done_1 = os.path.isfile(os.path.join(params['basedir'],'intermediate_results','%s_%s.csv'%(params['basename'],params['polarity'])),)
# if mzmine workspace exists store this so it can be removed from job script for reruns
mzmine_things[i]['mzmine_done'] = job_done_1
# see if sheets/peak_height.tab exists
job_done_2 = False
peak_height_file = os.path.join(results_dir,'sheets','peak_height.tab')
identification_folder = os.path.join(results_dir,'identification')
num_id_figures = 0
if os.path.isfile(peak_height_file):
with open(peak_height_file,'r') as fid:
peaks = fid.read()
num_peaks = len(peaks.split('\n'))-1
num_id_figures = len(glob.glob(os.path.join(identification_folder,'*.pdf')))
if num_id_figures > 1:
job_done_2 = True
else:
num_peaks = 0
# if metatlas is done note this too, but these jobs shouldn't be rerun (since they are done)
mzmine_things[i]['metatlas_done'] = job_done_2
# see if the filtered mzmine workspace has been created
job_done_3 = os.path.isfile(os.path.join(results_dir,'%s.mzmine'%project_name))
# this won't change the job script but completes the book-keeping
mzmine_things[i]['small_mzmine_done'] = job_done_3
status.append({'0_basedir':params['basedir'].split('/')[-1],'1_job':'%s_%s'%(params['basename'],params['polarity']),
'2_features_done':job_done_1,
'3_sheets__and_msms_done':job_done_2,
'4_workspace_done':job_done_3,
'5_num_features':num_peaks,
'6_num_id_figures':num_id_figures})
pd.DataFrame(status)
In [ ]:
new_mzmine_things=[]
print(len(mzmine_things))
for i,params in enumerate(mzmine_things):
# see if final mzmine file exists. this will happen even if peak_height.tab is missing
if (not params['metatlas_done']) or (not params['small_mzmine_done']) or (not params['mzmine_done']):
new_mzmine_things.append(params)
mzmine_things = new_mzmine_things
print(len(mzmine_things))
In [ ]:
for m in pd.unique([m['basedir'] for m in mzmine_things]):
print('rm -r %s'%m)
In [ ]:
# groups = ['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain']
mzm = reload(mzm)
for i,m in enumerate(mzmine_things):
mzmine_things[i]['files'] = mzm.get_files(m['groups'],m['filename_substring'],m['file_filters'],m['is_group'])
print(m['basename'],m['polarity'],len(mzmine_things[i]['files']))
In [ ]:
mzm = reload(mzm)
for i,params in enumerate(mzmine_things):
#setup all the job scripts when job actually runs it will copy params to params-used
job_script = mzm.create_job_script(params)
#dump the parameters to a json file
with open( os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity'])), 'w') as fid:
fid.write(json.dumps(params))
if len(params['files'])>500:
print(('sbatch %s'%job_script).replace('.sbatch','_denovo.sbatch'))
else:
print('sbatch %s'%job_script)
In [ ]:
In [ ]:
mzm = reload(mzm)
for i,params in enumerate(mzmine_things):
#setup all the job scripts when job actually runs it will copy params to params-used
job_script = mzm.create_job_script(params)
#dump the parameters to a json file
with open( os.path.join(params['basedir'],'logs','%s_%s_params.json'%(params['basename'],params['polarity'])), 'w') as fid:
fid.write(json.dumps(params))
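# the printed command runs only the non-comment lines of the job script in the current shell (handy on an interactive node)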
print('source <(grep "^[^#;]" %s)'%job_script)
In [ ]:
tar_str = 'tar -czvf pactolus_files.tar.gz '
for i in df[df.basedir.str.contains('erbe')].index:
for f in mzmine_things[i]['files']:
print('cp %s ~/pactolus_files'%f.replace('.mzML','.pactolus.gz'))
# tar_str = '%s %s'%(tar_str,f.replace('.mzML','.pactolus.gz'))
# print(os.path.isfile(f.replace('.mzML','.pactolus.gz')))
# print tar_str
In [ ]:
%system cat /project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive.sbatch
In [ ]:
# [bpb@cori06 ~ 02:14 PM] ~ > salloc -N 1 -q interactive -C haswell -t 4:00:00 -L project
# salloc: Pending job allocation 10461042
# salloc: job 10461042 queued and waiting for resources
# salloc: job 10461042 has been allocated resources
# salloc: Granted job allocation 10461042
# salloc: Waiting for resource configuration
# salloc: Nodes nid00119 are ready for job
# [bpb@nid00119 ~ 02:14 PM] ~ > echo $HOSTNAME
# cori06
In [ ]:
mzm = reload(mzm)
json_file = "/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar/logs/"
mzm.clean_and_filter_mzmine_output(json_file)
In [ ]:
def pk_checker(metatlas_dataset,atlas_df,compound_idx,params):
# across all files for a compound, make sure that:
#   the peak height is above threshold,
#   the peak is within rt-span tolerance,
#   there is a minimal value on both sides of the peak, and
#   there is msms between rt_min and rt_max.
# all of these conditions only need to hold together in at least one file
has_msms = []
for met_data in metatlas_dataset:
try:
has_msms.append(len(met_data[compound_idx]['data']['msms']['data']['mz'])>0)
except:
has_msms.append(False)
rt_valid = [rt_checker(met_data,atlas_df,compound_idx,params) for met_data in metatlas_dataset]
minmax_valid = [min_checker(met_data,atlas_df,compound_idx,params) for met_data in metatlas_dataset]
# combine the checks element-wise per file; at least one file must pass all of them
is_valid = any([h and r and m for h, r, m in zip(has_msms, rt_valid, minmax_valid)])
return is_valid
def rt_checker(met_data,atlas_df,compound_idx,params):
"""
simply checks whether the measured peak is within rt_timespan of the stated peak
"""
try:
valid = abs(met_data[compound_idx]['data']['ms1_summary']['rt_peak'] - atlas_df.loc[compound_idx,'rt_peak']) < params['rt_timespan']
except:
valid = False
return valid
def min_checker(met_data,atlas_df,compound_idx,params):
"""
looks forward and backward by rt_timespan and requires that the measured peak height
exceed the local minimum on each side times peak_to_valley_ratio.
"""
try:
measured_rt_peak = met_data[compound_idx]['data']['ms1_summary']['rt_peak']
peak_height = met_data[compound_idx]['data']['ms1_summary']['peak_height']
if pd.isnull(measured_rt_peak) or peak_height < params['min_intensity']:
return False
else:
eic = met_data[compound_idx]['data']['eic']
condition_1 = np.asarray(eic['rt']) > measured_rt_peak
condition_2 = np.asarray(eic['rt']) < (measured_rt_peak + params['rt_timespan'])
condition_3 = np.asarray(eic['rt']) < measured_rt_peak
condition_4 = np.asarray(eic['rt']) > (measured_rt_peak - params['rt_timespan'])
intensity = np.asarray(eic['intensity'])
forward_idx = (condition_1) & (condition_2)
if any(forward_idx):
forward_pass = peak_height > (intensity[forward_idx].min() * params['peak_to_valley_ratio'])
else:
forward_pass = False
backward_idx = (condition_3) & (condition_4)
if any(backward_idx):
backward_pass = peak_height > (intensity[backward_idx].min() * params['peak_to_valley_ratio'])
else:
backward_pass = False
return ((forward_pass) and (backward_pass))
except:
return False
def peak_height_df(metatlas_dataset,attribute='peak_height',zero_nans=True):
"""
Turn a list of lists in a metatlas dataset into a
peak height dataframe where rows are features
and columns are samples
Valid attributes are:'mz_centroid','mz_peak',
'num_ms1_datapoints','peak_area','peak_height',
'rt_centroid','rt_peak'
infs, nans, and nulls are converted to zero by default.
"""
d = []
for m in metatlas_dataset: #iterate over files (samples)
row = []
for mm in m: #iterate over features (compounds)
try:
row.append(mm['data']['ms1_summary'][attribute])
except:
row.append(0)
d.append(row)
df = pd.DataFrame(d).T
if zero_nans:
df[pd.isnull(df)] = 0
return df
def peak_in_top_n(metatlas_dataset,n_peaks=1000,prior_boolean=None):
"""
Return a boolean list (one entry per feature) that is True when the feature's
maximum peak height across files ranks within the top n_peaks. If prior_boolean
is given, only features already marked True there are considered for ranking.
A toy example follows this cell.
"""
df = peak_height_df(metatlas_dataset)
if prior_boolean is not None: #make dataframe
top_peaks = df[prior_boolean].max(axis=1).rank(method='min',ascending=False)<=n_peaks
df.loc[top_peaks.index,'top_peaks'] = top_peaks
df['top_peaks'].fillna(False,inplace=True)
else:
top_peaks = df.max(axis=1).rank(method='min',ascending=False)<=n_peaks
df['top_peaks'] = top_peaks
return df['top_peaks'].tolist()
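A toy example (fabricated intensities) of the nested structure these helpers expect, metatlas_dataset[file_index][compound_index]['data']['ms1_summary'][attribute], and of what peak_height_df and peak_in_top_n return for it.
In [ ]:
# two files x three features with made-up peak heights; the third entry in file 0 is missing on purpose
mock_dataset = [
    [  # file 0
        {'data': {'ms1_summary': {'peak_height': 1.0e6}}},
        {'data': {'ms1_summary': {'peak_height': 2.0e4}}},
        {'data': {'ms1_summary': {}}},  # missing attribute is converted to 0
    ],
    [  # file 1
        {'data': {'ms1_summary': {'peak_height': 5.0e5}}},
        {'data': {'ms1_summary': {'peak_height': 3.0e6}}},
        {'data': {'ms1_summary': {'peak_height': 1.0e3}}},
    ],
]
print(peak_height_df(mock_dataset))             # 3 features (rows) x 2 files (columns)
print(peak_in_top_n(mock_dataset, n_peaks=2))   # [True, True, False]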
In [ ]:
os.listdir('/global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive/identification/')
# I'm testing this one: /global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_positive
In [ ]:
# json_filename = "/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley/20160824_and_20170808_C18_MP_Solar_negative_params.json"
json_filename = '/global/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy/20170728_KBL_C18_PZ-KM_RootFnl_negative_params.json'
with open(json_filename) as data_file:
params = json.load(data_file)
file_to_convert = os.path.join(params['basedir'],'%s_%s.csv'%(params['basename'],params['polarity']))
print('# Working on %s %s'%(params['basename'],params['polarity']))
if os.path.isfile(file_to_convert):
# take the comprehensive csv from mzmine and make a peak-height only version of it
df,original_mzmine = mzm.metatlas_formatted_atlas_from_mzmine_output(file_to_convert,params['polarity'],
make_atlas=False,min_rt=0.55,
remove_fragments=False,
remove_adducts=False,
remove_clusters=False,)
#Filter features found in the blank (a standalone toy sketch of this blank-vs-sample comparison follows this cell)
df = mzm.clean_up_mzmine_dataframe(df)
df_blank_compare = df.transpose().groupby(['b' if any([s in g.lower() for s in params['blank_str']]) else 's' for g in df.columns]).max().transpose()
if 'b' in df_blank_compare.columns:
df_features_not_in_blank = df_blank_compare[df_blank_compare['s'] > (params['sample_to_blank'] * df_blank_compare['b'])]
print('# %d features total'%(df.shape[0]))
print('# %d features removed by blank\n'%(df_blank_compare.shape[0] - df_features_not_in_blank.shape[0]))
else:
print('# No file names match any of the blank_str patterns; skipping the blank filter.')
df_features_not_in_blank = df_blank_compare
df_features_not_in_blank.reset_index(inplace=True)
print('#There are now %d features not in blank'%df_features_not_in_blank.shape[0])
#Make an Atlas
cids = []
for j,row in df_features_not_in_blank.iterrows():
my_mz_ref = metob.MzReference(mz=row.mz,mz_tolerance=row.mz_tolerance,detected_polarity=params['polarity'],lcms_run=None)
my_rt_ref = metob.RtReference(rt_peak=row.rt_peak,rt_min=row.rt_min,rt_max=row.rt_max,lcms_run=None)
my_id = metob.CompoundIdentification(rt_references=[my_rt_ref],mz_references=[my_mz_ref],name=row.label)
cids.append(my_id)
my_atlas = metob.Atlas(name='untargeted atlas',compound_identifications=cids)
atlas_df = ma_data.make_atlas_df(my_atlas)
#Make Groups
all_files = [f.replace('Peak height','').replace('filtered','').strip() for f in df.columns if '.mzML' in f]
metatlas_files = []
for f in all_files:
f = metob.retrieve('Lcmsruns',name=f,username='*')[-1]
if isinstance(f,type(metob.LcmsRun())):
metatlas_files.append(f)
else:
print('%s NOT FOUND'%f)
break
groups = metob.Group(name='untargeted group',items=metatlas_files)
#Get Data
all_files = []
for my_file in groups.items:
all_files.append((my_file,groups,atlas_df,my_atlas))
pool = mp.Pool(processes=min(12, len(all_files)))
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
# remove peaks that aren't valid. See pk_checker for details on validity.
num_features = len(metatlas_dataset[0])
num_files = len(metatlas_dataset)
valid_peaks = [pk_checker(metatlas_dataset,atlas_df,compound_idx,params) for compound_idx in range(num_features)]
valid_peaks = peak_in_top_n(metatlas_dataset,n_peaks=10,prior_boolean=valid_peaks)
df_filtered_peaks_atlas = atlas_df.loc[valid_peaks]
file_to_convert = os.path.join(params['basedir'],'%s_%s.csv'%(params['basename'],params['polarity']))
df_filtered_peaks_atlas[['mz','rt_peak','label']].sort_values('rt_peak').to_csv(file_to_convert.replace('.csv','') + '_formatted_peakfiltered.csv',index=False)
print('# There are now %d filtered peaks'%df_filtered_peaks_atlas.shape[0])
# make a filtered atlas, get data, and make_plots
filtered_ids = []
for i,valid in enumerate(valid_peaks):
if valid:
filtered_ids.append(my_atlas.compound_identifications[i])
my_atlas.compound_identifications = filtered_ids
atlas_df = ma_data.make_atlas_df(my_atlas)
#Get Data
all_files = []
for my_file in groups.items:
all_files.append((my_file,groups,atlas_df,my_atlas))
pool = mp.Pool(processes=min(12, len(all_files)))
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
output_dir = os.path.join(params['basedir'],'%s_%s'%(params['basename'],params['polarity']))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
atlas_identifications = dp.export_atlas_to_spreadsheet(my_atlas,os.path.join(output_dir,'atlas_export.csv'))
# atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,'%s/sheets/%s.csv'%(plot_location_label,myAtlas.name))
peak_height = dp.make_output_dataframe(input_fname = '',input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_height' , output_loc=os.path.join(output_dir,'sheets'))
peak_area = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_area' , output_loc=os.path.join(output_dir,'sheets'))
mz_peak = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_peak' , output_loc=os.path.join(output_dir,'sheets'))
rt_peak = dp.make_output_dataframe(input_fname = my_file, input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [],fieldname='rt_peak' , output_loc=os.path.join(output_dir,'sheets'))
mz_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_centroid' , output_loc=os.path.join(output_dir,'sheets'))
rt_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='rt_centroid' , output_loc=os.path.join(output_dir,'sheets'))
# exclude blanks and QC runs (list concatenation; .append would return None)
dp.make_identification_figure_v2(input_dataset = metatlas_dataset, input_fname = my_file, include_lcmsruns = [],exclude_lcmsruns = params['blank_str'] + ['QC'], output_loc=os.path.join(output_dir,'identification'))
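As referenced above, a standalone toy sketch (fabricated column names and intensities) of the blank-vs-sample comparison: columns are labeled 'b' or 's' by matching the blank_str patterns, the per-group maximum is taken for each feature, and a feature is kept only when its best sample signal exceeds sample_to_blank times its best blank signal.
In [ ]:
# toy version of the blank filter; values and file names are made up
toy = pd.DataFrame({'Sample_A.mzML': [1.0e6, 2.0e4],
                    'Sample_B.mzML': [8.0e5, 1.0e4],
                    'InjBl_MeOH.mzML': [1.0e5, 9.0e3]},
                   index=['feature_1', 'feature_2'])
toy_blank_str = ['blank', 'injbl']
toy_sample_to_blank = 3
labels = ['b' if any(s in c.lower() for s in toy_blank_str) else 's' for c in toy.columns]
compare = toy.transpose().groupby(labels).max().transpose()
kept = compare[compare['s'] > (toy_sample_to_blank * compare['b'])]
print(kept)  # feature_1 survives (1e6 > 3*1e5); feature_2 does not (2e4 < 3*9e3)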
In [ ]:
1+1
In [ ]:
# mzmine_things = [
# #potato
# # raw data folder:
# # /global/project/projectdirs/metatlas/raw_data/erbilgin/20161026_OE_Turnbaughswtpotato2
# # filename structure:
# # 20161026_SK-OE_Turnbaugh_SwtPotato2_QE144_C18-USDAY32305_<pol>_<sample_type>_<??>_<??>_<??>
# # sample_type codes:
# # InjBl-MeOH = injection blank
# # QC-SK160922-sopv2 = qc
# # if it has "B" in it = extraction blank
# # if it has a R or C = raw or cooked. The number is the bio replicate.
# # Ignore all sample_type codes that have "H" in them; only keep the ones that have "S" in them
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20161026_OE_Turnbaughswtpotato2', #output location
# 'basename':'20161026_OE_Turnbaughswtpotato2_raw-v-cooked', #output file prefix string
# 'groups':['20161026_OE_Turnbaughswtpotato2'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['_b','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'min_peak_duration' : 0.025,
# 'max_peak_duration' : 30.0,
# 'rt_tol_perfile' : 0.015,
# 'rt_tol_multifile' : 0.15,
# 'min_peak_height': 1e6,
# 'noise_floor': 3e4,
# 'mz_tolerance': 10.0,
# 'min_sn_ratio': 2.0,
# 'min_rt':1.1,
# 'max_rt':100,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG','_ITSD','_H_'] #files containing these strings will be excluded
# },
# {'polarity':'negative', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20161026_OE_Turnbaughswtpotato2', #output location
# 'basename':'20161026_OE_Turnbaughswtpotato2_raw-v-cooked', #output file prefix string
# 'groups':['20161026_OE_Turnbaughswtpotato2'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['_b','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'min_peak_duration' : 0.025,
# 'max_peak_duration' : 30.0,
# 'rt_tol_perfile' : 0.015,
# 'rt_tol_multifile' : 0.15,
# 'min_peak_height': 1e6,
# 'noise_floor': 3e4,
# 'mz_tolerance': 10.0,
# 'min_sn_ratio': 2.0,
# 'min_rt':1.1,
# 'max_rt':100,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_POS','_ITSD','_H_'] #files containing these strings will be excluded
# },
# # 20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
# },
# {'polarity':'negative', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_POS'] #files containing these strings will be excluded
# },
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
# },
# {'polarity':'negative', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_C18'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_POS'] #files containing these strings will be excluded
# },
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG'] #files containing these strings will be excluded
# },
# {'polarity':'negative', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20171116_SK_BRB_Bochner_PetriCondSpring2017_Aq_C18', #output location
# 'basename':'20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL', #output file prefix string
# 'groups':['20171116_SK_BRB_Bochner_PetriCondSpring2017_MeOH_HIL'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['waterctrl','blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e5,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_POS'] #files containing these strings will be excluded
# },
# #P simiae mutant lb dataset
# # 20170217_KBL_C18_RexMalm_Psimiae
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170217_KBL_C18_RexMalm_Psimiae', #output location
# 'basename':'20170217_KBL_C18_RexMalm_Psimiae_supernatant', #output file prefix string
# 'groups':['20170217_KBL_C18_RexMalm_Psimiae'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG','_ITSD','_pellet_'] #files containing these strings will be excluded
# },
# {'polarity':'negative', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170217_KBL_C18_RexMalm_Psimiae', #output location
# 'basename':'20170217_KBL_C18_RexMalm_Psimiae_supernatant', #output file prefix string
# 'groups':['20170217_KBL_C18_RexMalm_Psimiae'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl','excontrol'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_POS','_ITSD','_pellet_'] #files containing these strings will be excluded
# },
# #LeBoldus
# {'polarity':'positive', #polarity of this analysis
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar', #output location
# 'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl', #output file prefix string
# 'groups':['20170501_KBL_C18_15min_JL-KD_PopFun'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',#files must contain this substring % is the wildcard
# 'file_filters':['_QC','_NEG','_ITSD'] #files containing these strings will be excluded
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
# 'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
# 'groups':['20170501_KBL_C18_15min_JL-KD_PopFun'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
# 'basename':'20170503_KBL_ZHILIC_JL_PoplarFungus',
# 'groups':['20170503_KBL_ZHILIC_JL_PoplarFungus'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/LeBoldus_innoc_and_noninnoc_poplar',
# 'basename':'20170503_KBL_ZHILIC_JL_PoplarFungus',
# 'groups':['20170503_KBL_ZHILIC_JL_PoplarFungus'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# #BRAUNI
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
# 'basename':'20170919_KBL_C18_TD-DB_BrauniiLipids',
# 'groups':['20170919_KBL_C18_TD-DB_BrauniiLipids'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
# 'basename':'20170919_KBL_C18_TD-DB_BrauniiLipids',
# 'groups':['20170919_KBL_C18_TD-DB_BrauniiLipids'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
# 'basename':'20170914_KBL_C18Lipid_TD-DB_Braunii',
# 'groups':['20170914_KBL_C18Lipid_TD-DB_Braunii'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170919_KBL_C18_TD-DB_Brauni',
# 'basename':'20170914_KBL_C18Lipid_TD-DB_Braunii',
# 'groups':['20170914_KBL_C18Lipid_TD-DB_Braunii'],
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# #Zerbe-Murphy
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
# 'basename':'20170728_KBL_C18_PZ-KM_RootFnl',
# 'groups':['20170728_KBL_C18_PZ-KM_RootFnl'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
# 'basename':'20170728_KBL_C18_PZ-KM_RootFnl',
# 'groups':['20170728_KBL_C18_PZ-KM_RootFnl'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
# 'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
# 'groups':['20170731_KBL_C18LipidV7_PZ-KM_RootFnl'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/20170728_KBL_ZerbeMurphy',
# 'basename':'20170731_KBL_C18LipidV7_PZ-KM_RootFnl',
# 'groups':['20170731_KBL_C18LipidV7_PZ-KM_RootFnl'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# #RIPP project
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
# 'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_media',
# 'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD','pellet']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
# 'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_media',
# 'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD','pellet']
# },
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
# 'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_pellet',
# 'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD','media']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/RITT_Project_20170901_KBL_C18_DP-PB_PoplarRhizo',
# 'basename':'20170901_KBL_C18_DP-PB_PoplarRhizo_pellet',
# 'groups':['20170901_KBL_C18_DP-PB_PoplarRhizo'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD','media']
# },
# #Solar Panels Spain vs Berkeley
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
# 'basename':'20160824_and_20170808_C18_MP_Solar',
# 'groups':['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
# 'basename':'20160824_and_20170808_C18_MP_Solar',
# 'groups':['20160825_KBL_C18_MP_Solar_Berk','20170728_KBL_C18_MP_Solar_Spain'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
# 'basename':'20160824_and_20170808_C18lipid_MP_Solar',
# 'groups':['20160824_KBL_C18Lipid_SolarPanel_MP','20170808_KBL_C18lipid_MP_Solar_Spain'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel_Spain_vs_Berkeley',
# 'basename':'20160824_and_20170808_C18lipid_MP_Solar',
# 'groups':['20160824_KBL_C18Lipid_SolarPanel_MP','20170808_KBL_C18lipid_MP_Solar_Spain'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# #P simiae plate flip corrected
# {'polarity':'positive',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/P_Simiae_plate_flip',
# 'basename':'20170428_KBL_C18_Psimiae_DiffSubs',
# 'groups':['20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_NEG','_ITSD']
# },
# {'polarity':'negative',
# 'basedir':'/project/projectdirs/metatlas/projects/jgi_projects/P_Simiae_plate_flip',
# 'basename':'20170428_KBL_C18_Psimiae_DiffSubs',
# 'groups':['20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'], #list of folders containg datafiles
# 'metatlas_path':'export PYTHONPATH="/global/homes/b/bpb/repos/metatlas:${PYTHONPATH}"',
# 'blank_str':['blank','injbl','meohbl'], # lowercase strings that will identify blanks
# 'sample_to_blank':3,#cutoff for excluding features in the blank
# 'rt_timespan':0.5, #rt_peak must be within this amount from mzmine's rt and local minima must be +/- this amount
# 'peak_to_valley_ratio':3,
# 'min_intensity':1e6,
# 'filename_substring':'%',
# 'file_filters':['_QC','_POS','_ITSD']
# },
# ]
In [ ]:
%%time
metob.store([my_atlas])
In [ ]:
dp = reload(dp)
%matplotlib notebook
a = dp.adjust_rt_for_selected_compound(metatlas_dataset,compound_idx=213,include_lcmsruns = [],alpha=0.75,width=10,height=6)
In [ ]:
output_dir = '/global/homes/b/bpb/Downloads/test_code_mzmine/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
In [ ]:
dp.make_identification_figure_v2(input_dataset = metatlas_dataset, input_fname = my_file, include_lcmsruns = [],exclude_lcmsruns = ['InjBl','QC','Blank','blank'], output_loc=os.path.join(output_dir,'identification'))
In [ ]:
n_files = len(metatlas_dataset)
n_features = len(metatlas_dataset[0])
has_msms = [False]*n_features #initialize all features to false
not_in_blank = [False]*n_features #initialize all features to false
good_peak_shape = [False]*n_features #initialize all features to false
top_1000 = [False]*n_features #initialize all features to false
In [ ]:
for m in metatlas_dataset:
for n in m:
print(n['data']['ms1_summary']['peak_height'])
In [ ]:
getattr(my_mz_ref.lcms_run,'unique_id','None')
In [ ]:
for i,m in enumerate(mzmine_things):
file_to_convert = os.path.join(m['basedir'],'%s_%s.csv'%(m['basename'],m['polarity']))
# file_info = os.stat(file_to_convert)
# print(file_info[-1])
if os.path.isfile(file_to_convert):
# take the comprehensive csv from mzmine and make a peak-height only version of it
df,original_mzmine = mzm.metatlas_formatted_atlas_from_mzmine_output(file_to_convert,m['polarity'],
make_atlas=False,min_rt=0.55,
remove_fragments=False,
remove_adducts=False,
remove_clusters=False)
df.to_csv(file_to_convert.replace('.csv','') + '_formatted.csv',index=True)
# filter the peaks by those found in the blank at high intensity.
df = mzm.clean_up_mzmine_dataframe(df)
df_blank_compare = df.transpose().groupby(['b' if ('blank' in g.lower() or 'injbl' in g.lower()) else 's' for g in df.columns]).max().transpose()
print('%s_%s'%(m['basename'],m['polarity']))
if 'b' in df_blank_compare.columns:
keep_features_not_in_blank = df_blank_compare[df_blank_compare['s'] > (3 * df_blank_compare['b'])]
print('%d features total'%(df.shape[0]))
print('%d features removed by blank\n'%(df_blank_compare.shape[0] - keep_features_not_in_blank.shape[0]))
else:
print('No files have "blank" or "injbl" in their names.')
keep_features_not_in_blank = df_blank_compare
# keep at most the 1000 most intense features (rank descending by the sample max)
keep_features_not_more_than_1000 = keep_features_not_in_blank[keep_features_not_in_blank['s'].rank(method='max',ascending=False) <= 1000]
df_blank_filtered = df.loc[keep_features_not_more_than_1000.index]
df_blank_filtered.to_csv(file_to_convert.replace('.csv','') + '_formatted_blankremoved.csv',index=True)
else:
print('#REDO:')
job_cmd = make_task_and_job(m['basedir'],m['basename'],m['polarity'],m['files'])
print('%s'%job_cmd)
In [ ]:
In [ ]:
temp = df_blank_filtered.head().reset_index()
# .reindex(columns=['mz','label','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity','adduct_assignments'])
temp.head()
In [ ]:
from shutil import copyfile
In [ ]:
%system mkdir /global/project/projectdirs/metatlas/raw_data/bpb/20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected
In [ ]:
162*2
In [ ]:
%system ls -l $new_dir | wc -l
In [ ]:
flip_file = '/global/homes/b/bpb/Downloads/20170428_KBL_RM_C18_Psimiae_PlateFlip_NewFileNames_2 (2).xlsx'
new_dir = '/global/project/projectdirs/metatlas/raw_data/bpb/20170428_KBL_C18_Psimiae_DiffSubs_FlipCorrected'
df = pd.read_excel(flip_file)
for i,row in df.iterrows():
print(os.path.basename(row['New file name']),os.path.isfile(row['Old file name']))
new_loc = os.path.join(new_dir,os.path.basename(row['New file name']))
copyfile(row['Old file name'],new_loc)
In [ ]:
# df_super = df[[c for c in df.columns if not 'super' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'kana' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'max_intensity' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'Excontrol' in c]]
# df_super = df_super[[c for c in df_super.columns if not 'adduct' in c]]
# df_super.set_index(['label','mz','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity'],inplace=True)
# df_super.columns = [''.join(c.split('_')[2:3]) if 'pellet' in c else c for c in df_super.columns ]
In [ ]:
df_super = df[[c for c in df.columns if not 'pellet' in c]]
df_super = df_super[[c for c in df_super.columns if not 'kana' in c]]
df_super = df_super[[c for c in df_super.columns if not 'max_intensity' in c]]
df_super = df_super[[c for c in df_super.columns if not 'Excontrol' in c]]
df_super = df_super[[c for c in df_super.columns if not 'adduct' in c]]
df_super['rt_min'] = df_super['rt_peak'] - 0.2
df_super.set_index(['label','mz','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity'],inplace=True)
df_super.columns = [''.join(c.split('_')[2:3]) if 'super' in c else c for c in df_super.columns ]
In [ ]:
import numpy as np
from scipy.stats import ttest_ind
group1 = df_super[[c for c in df_super.columns if c == 'LB']]
group2 = df_super[[c for c in df_super.columns if c == 'PS']]
# group2 = df[df['group'] == 'GROUP2']['data'].astype(float)
t, p = ttest_ind(group1.T, group2.T)
stats_df = pd.DataFrame(index=df_super.index)
stats_df['p_value'] = p
stats_df['t_score'] = t
stats_df['log2_fold_change'] = group2.min(axis=1).apply(lambda x: np.log2(x+1)) - \
group1.max(axis=1).apply(lambda x: np.log2(x+1))
stats_df['in_control'] = group1.median(axis=1) > 1e5
stats_df['median_control'] = group1.median(axis=1)
stats_df['median_treatment'] = group2.median(axis=1)
stats_df.head()
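Note that log2_fold_change above is conservative: it compares the treatment group's minimum against the control group's maximum, so a large value means every treatment sample exceeded every control sample. A quick worked number with toy intensities:
In [ ]:
# toy intensities only: control max = 1e3, treatment min = 1e6
control_max, treatment_min = 1.0e3, 1.0e6
log2_fc = np.log2(treatment_min + 1) - np.log2(control_max + 1)
print(log2_fc)  # ~9.96, i.e. roughly a thousand-fold conservative increase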
In [ ]:
fig, ax = plt.subplots(figsize=(8,7))
stats_df['log2_fold_change'].hist(bins=100)
ax.set_yscale('log')
ax.set_xlabel('LOG2 Fold Change')
ax.set_ylabel('#features')
plt.show()
In [ ]:
fig, ax = plt.subplots(figsize=(8,7))
ax.plot(stats_df['log2_fold_change'],stats_df['p_value'],'.')
# stats_df['fold_change'].apply(lambda x: np.log2(x+1)).hist(bins=100)
ax.set_yscale('log')
ax.set_xlabel('LOG2 fold change')
ax.set_ylabel('p_value')
plt.show()
In [ ]:
filtered_df = stats_df[(stats_df['log2_fold_change']>7) & (stats_df['median_treatment']>1e5)]
fig, ax = plt.subplots(figsize=(8,7))
# color_map = {filtered_df.log2_fold_change.min():"r", filtered_df.log2_fold_change.max():"g"}
# z_as_colors = map(color_map.get, filtered_df.log2_fold_change)
sc = ax.scatter(filtered_df.index.get_level_values('mz'),
filtered_df.index.get_level_values('rt_peak'),
c = filtered_df.log2_fold_change)
# stats_df['fold_change'].apply(lambda x: np.log2(x+1)).hist(bins=100)
# ax.set_yscale('log')
plt.colorbar(sc)
ax.set_xlabel('mz')
ax.set_ylabel('rt')
In [ ]:
# (~stats_df['in_control']) &
filtered_df = stats_df[(stats_df['log2_fold_change']>7) & (stats_df['median_treatment']>3e5)]
print(filtered_df.shape)
# pd.options.display.precision = 5
# pd.set_option('display.max',200)
filtered_df.sort_values('log2_fold_change', ascending=False).head(20)
In [ ]:
df_super.iloc[df_super.index.get_level_values('label') == '754.3994@3.75']
In [ ]:
filtered_df.head()
In [ ]:
# filtered_df.index.names
df = pd.DataFrame(columns=filtered_df.index.names)
for i,row in enumerate(filtered_df.index.get_values()):
for j,c in enumerate(df.columns):
df.loc[i,c] = row[j]
# df.columns =
# filtered_df.index.get_values()
# #.reset_index(level=[0,1,2])
# atlas_df = atlas_df[atlas_df.columns[:6]]
# # atlas_df.reset_index(inplace=True)
# # atlas_df = atlas_df[[]]
# # atlas_df.reset_index(inplace=True)
# atlas_df.fillna('')
# atlas_df
In [ ]:
In [ ]:
myAtlas = dp.make_atlas_from_spreadsheet(df,'20170406_ions_made_psimiae_in_LB_pos_c18',filetype='dataframe',
sheetname='',
polarity = 'positive',
store=True,
mz_tolerance = 10)
atlas_df = ma_data.make_atlas_df(myAtlas)
In [ ]:
groups = dp.select_groups_for_analysis(name = '%rexmalm_pos_super%',
most_recent = True,
remove_empty = True,#'Strain=SB214'
include_list = [], exclude_list = [],)#['QC','Blank'])
In [ ]:
for g in groups:
for i in g.items:
print(i.name)
In [ ]:
all_files = []
for my_group in groups:
for my_file in my_group.items:
all_files.append((my_file,my_group,atlas_df,myAtlas))
pool = mp.Pool(processes=min(10, len(all_files)))
# from metatlas.helpers.metatlas_get_data_helper_fun import get_data_for_atlas_df_and_file
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
#If your code crashes here, make sure to terminate any processes left open.
In [ ]:
output_dir = '/global/homes/b/bpb/Downloads/psim_c18_important_pos_super/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,os.path.join(output_dir,'atlas_export.csv'))
# dp = reload(dp)
# atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,'%s/sheets/%s.csv'%(plot_location_label,myAtlas.name))
peak_height = dp.make_output_dataframe(input_fname = '',input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_height' , output_loc=os.path.join(output_dir,'sheets'))
# peak_area = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_area' , output_loc=os.path.join(output_dir,'sheets'))
# mz_peak = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_peak' , output_loc=os.path.join(output_dir,'sheets'))
# rt_peak = dp.make_output_dataframe(input_fname = my_file, input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [],fieldname='rt_peak' , output_loc=os.path.join(output_dir,'sheets'))
# mz_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='mz_centroid' , output_loc=os.path.join(output_dir,'sheets'))
# rt_centroid = dp.make_output_dataframe(input_fname = my_file,input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='rt_centroid' , output_loc=os.path.join(output_dir,'sheets'))
In [ ]:
peak_height.head()
In [ ]:
peak_height.shape
In [ ]:
# from scipy import cluster
# mat = peak_height.as_matrix()
# mat[np.isnan(mat)] = np.nanmin(mat)
# labels = cluster.hierarchy.fclusterdata(mat,0.2,criterion='distance',method='average',metric='correlation')
# results = pd.DataFrame(data=labels, columns=['cluster'], index=peak_height.index)
In [ ]:
peak_height.index = ['@'.join(c.split('_')[:2]).replace('p','.') for c in peak_height.index]
In [ ]:
peak_height.columns = peak_height.columns.droplevel(0)
In [ ]:
peak_height.columns = ['_'.join(c.split('_')[14:18]) for c in peak_height.columns]
In [ ]:
peak_height = peak_height.fillna(0)
In [ ]:
# norm_peak_height = peak_height.copy()
# norm_peak_height = norm_peak_height.apply(lambda x: x / x.max())
# norm_peak_height.head()
In [ ]:
peak_height.shape
In [ ]:
# norm_peak_height.columns.sort_values().shape
In [ ]:
# c
In [ ]:
sf = pd.DataFrame()
for c in peak_height.columns.sort_values():
sf[c] = peak_height[c]
for i,row in sf.iterrows():
sf.loc[i,:] = row/row.max()
In [ ]:
sf.head()
In [ ]:
cm = plt.colormaps()
for i,c in enumerate(cm):
print('%d %s' % (i, c))
In [ ]:
for i,r in sf.iterrows():
print(r.max())
In [ ]:
# The returned object has a savefig method that should be used if you want to save the figure object without clipping the dendrograms.
# To access the reordered row indices, use: clustergrid.dendrogram_row.reordered_ind
# Column indices, use: clustergrid.dendrogram_col.reordered_ind
g.dendrogram_row.reordered_ind
In [ ]:
import seaborn as sns
cmap = cm[148]
# cmap = sns.light_palette(as_cmap=True)
# cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
# g =sns.clustermap(peak_height.apply(lambda x: (x+1)**0.25),metric='correlation',figsize=(25,25),cmap=cmap)
# g =sns.clustermap(peak_height.apply(lambda x: np.log10(x+1)),metric='correlation',figsize=(25,25))
g =sns.clustermap(sf,metric='euclidean',figsize=(30,18),cmap=cmap,col_cluster=True)
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0) # For y axis
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
plt.show()
# # http://seaborn.pydata.org/generated/seaborn.clustermap.html
In [ ]:
# from scipy import stats
# zscore = lambda x: (x - stats.nanmean(x)) / stats.nanstd(x)
In [ ]:
# fig, ax = plt.subplots(2,3,figsize=(20,30))
# for i in range(6):
# plt.subplot(2,3,i+1)
# row = peak_height.iloc[[i]]
# c = 0
# vals = []
# for k,v in row.to_dict().items():
# vals.append(v.items()[0][-1])
# v = np.asarray(vals)
# # v = v - np.nanmin(v)
# # v = v / np.nanmax(v)
# v[np.isnan(v)] = np.nanmin(v)
# plt.plot(v,'.-')
# plt.title(myAtlas.compound_identifications[i].name)
# plt.gca().set_yscale('log')
# # ax.set_yscale('log')
# plt.tight_layout()
# # for i,row in peak_height.iterrows():
# # peak_height.iloc[0,]
In [ ]: