In [1]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas/')
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import metatlas.metatlas_objects as metob
from matplotlib import pyplot as plt
import glob
%matplotlib notebook
from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import pactolus_tools as pt
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)
In [3]:
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/20160824_C18_LIPID___POS_mzmine_output.csv'
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Scoelicolor_media_WT_mzmine_output.csv'
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Psim_super_C18_pos_mzmine_output.csv'
# peak_file = '/global/homes/b/bpb/Psim_super_C18_pos_mzmine_output_456peaks.csv'
# Psim_pellet_C18_neg
# hedlund_jad2_and_media
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Psim_super_C18_neg_mzmine_output.csv'
# pactolus_results = '/scratch2/scratchdirs/bpb/pactolus_runs/hedlund_jad2_and_media/'
# pactolus_results = '/global/cscratch1/sd/bpb/pactolus_runs/rexmalm_pos_super/'
# pactolus_results = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/'
pactolus_results = '/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel/Pactolus_Results_20160824_KBL_SolarPanel_MP/'
# pactolus_results = '/project/projectdirs/metatlas/projects/pactolus_runs/20170317_SK_Arkin_PseudoAbxCsource/'
# my_polarity = 'negative'
trees = glob.glob('/global/cscratch1/sd/bpb/level_3_trees/*.h5')
ref_df = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/unique_compounds.pkl')
In [5]:
output_path = '/global/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel/Pactolus_Results_20160824_KBL_SolarPanel_MP/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
output_pickle = os.path.join(output_path,'pactolus_hits.pkl')
output_csv = os.path.join(output_path,'pactolus_hits.csv')
In [6]:
pactolus_files = [f for f in glob.glob(os.path.join(pactolus_results,'*.h5')) if 'pactolus_results' in os.path.basename(f.lower())]
print len(pactolus_files)
# pactolus_files = [f for f in glob.glob(pactolus_results+'*.h5') if 'pactolus_results_' in f.lower()]
In [7]:
# pactolus_results = '/global/cscratch1/sd/bpb/pactolus_runs/Cori_20161209_Manuel_SolarPanel'
max_processes = 64
pt = reload(pt)
pool = mp.Pool(processes=min(max_processes, len(pactolus_files)))
output = pool.map(pt.read_pacolus_results, pactolus_files)
pool.close()
pool.join()  # map() is synchronous, so just wait for the workers to exit cleanly
for i,f in enumerate(pactolus_files):
    fname = os.path.basename(f).split('.')[0]
    output[i][0]['filename'] = fname
#output has scan_df,tree_df
In [8]:
# important to note that compound==0 is a spectrum with no pactolus hits
msms_df = pd.concat([df[0] for df in output], axis=0)
msms_df.rename(columns={'precursor mz':'precursor_mz','retention time':'retention_time','index':'spectrum_index'},inplace=True)
msms_df['compound'] = msms_df['compound'].fillna(0.0).astype(int)
msms_df['compound_index'] = msms_df['compound']
msms_df.reset_index(inplace=True)
msms_df = msms_df.drop(['index'],1)
msms_df['score'].fillna(0,inplace=True)
msms_df.head()
Out[8]:
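A quick hedged aside (my addition, not in the original run): compound==0 above doubles as the no-hit placeholder, so a genuine-hit subset is best taken on score, which is strictly positive for real Pactolus matches (the scorer keeps hits where score > min_score with min_score=0, and no-hit rows were filled with 0).
In [ ]:
# spectra with no hits were filled with score 0 above; real hits have score > 0
print (msms_df['score'] == 0).sum(), 'no-hit rows;', (msms_df['score'] > 0).sum(), 'candidate hits'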
In [8]:
# pactolus_path = '/project/projectdirs/metatlas/projects/pactolus_runs/'
# pactolus_file = 'pactolus_results_20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5'
# import h5py
# min_score=0
# with h5py.File('/project/projectdirs/metatlas/projects/magi_paper/pactolus_results/pactolus_results_20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5','r') as fid:
#     # read score_matrix and convert the all-by-all matrix to lists of scores
#     idx = range(fid['score_matrix'].shape[0])
#     d = {'retention time': fid['scan_metadata']['peak_rt'][idx],
#          'precursor intensity': fid['scan_metadata']['peak_intensity'][idx],
#          'precursor mz': fid['scan_metadata']['peak_mz'][idx],
#          'polarity': fid['scan_metadata']['polarity'][idx],
#          'index': idx}
#     scan_df = pd.DataFrame(d)
#     scan_df['filename'] = pactolus_file
#     m = fid['score_matrix'][:]
#     hits = []
#     for mm in m:
#         idx = np.where(mm>min_score)[0]
#         hits.append(sorted([(mm[i],i) for i in idx])[::-1])
#     df = pd.DataFrame({'scores':hits})
#     b_flat = pd.DataFrame([[i, x[0], x[1]]
#                            for i, y in df.scores.apply(list).iteritems()
#                            for x in y], columns=['index','score','compound']).set_index('index')
#     scan_df = scan_df.merge(b_flat, how='outer', left_index=True, right_index=True)
#     # get a list of True/False flags for whether a compound has any hits:
#     f = np.any(m.T>min_score, axis=1)
#     # only do this for the compounds that get a hit
#     idx = np.where(f)[0]  # range(fid['score_matrix'].shape[1])
#     # lookup = fid['tree_file_lookup_table'][:]
#     d = {'filename': fid['tree_file_lookup_table']['filename'][idx],
#          'ms1_mass': fid['tree_file_lookup_table']['ms1_mass'][idx],
#          'inchi': fid['tree_file_lookup_table']['inchi'][idx],
#          'permanent_charge': fid['tree_file_lookup_table']['permanent_charge'][idx],
#          'index': idx}
#     # get the inchikey like this:
#     d['inchi_key'] = [os.path.basename(a).split('.')[0].split('_')[-1] for a in fid['tree_file_lookup_table']['filename'][idx]]
#     tree_df = pd.DataFrame(d)
#     # tree_df.set_index('index', drop=True, inplace=True)
# # return scan_df, tree_df
In [9]:
compound_df = pd.concat([df[1] for df in output], axis=0)
compound_df.drop_duplicates(inplace=True)
compound_df.set_index('index',drop=True,inplace=True)
In [10]:
cols_to_use = list(set(ref_df.columns.tolist()) - set(compound_df.columns.tolist()))
cols_to_use.append('inchi_key')
compound_df2 = compound_df.reset_index().merge(ref_df[cols_to_use],on='inchi_key',how='inner').reset_index().set_index('index').drop(['level_0'],1)
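One hedged sanity check (my addition, not in the original run): an inner merge on a duplicated inchi_key would silently multiply compound rows, so it is worth confirming the key is unique in ref_df and that the merged index stayed unique.
In [ ]:
# assumption: inchi_key identifies one reference compound
print ref_df.inchi_key.is_unique, compound_df2.index.is_unique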
In [11]:
compound_df2.head()
Out[11]:
In [12]:
temp = msms_df.merge(compound_df2,how='left',left_on='compound',right_index=True)
In [13]:
temp.inchi_key.fillna('',inplace=True)
In [16]:
# temp.to_pickle('/global/homes/b/bpb/Downloads/pseudo5_pactolus.pkl')
In [14]:
def calculate_ppm(theoretical_mass, measured_mz, adduct_masses):
    """
    Given a theoretical mass vector, a measured m/z vector, and an adduct mass
    vector, return the minimum ppm difference across the adducts for each feature.
    #### TODO: add in the adduct names and, with an argmin, list the adduct name for each pactolus hit ####
    Note:
        theoretical_mass and measured_mz must be [N,1] where N is the number
        of features.
        adduct_masses must be [1,M] where M is the number of adducts.
        Typical adduct masses for positive mode are [0, 1.007276];
        typical adduct masses for negative mode are [-1.007276].
    """
    measured_mz = np.outer(measured_mz, np.ones(adduct_masses.shape))
    theoretical_mass = np.outer(theoretical_mass, np.ones(adduct_masses.shape))
    mz_neutralizations = np.outer(np.ones(measured_mz.shape[0]), adduct_masses)
    mass_difference = abs(measured_mz - mz_neutralizations - theoretical_mass)
    ppm_difference = np.divide(mass_difference, theoretical_mass) * 1e6
    return ppm_difference.min(axis=1)
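A hedged sketch of the docstring's TODO (best_adduct and adduct_names are my own, hypothetical names): the same broadcasted difference matrix, with argmin used to report which adduct best explains each feature.
In [ ]:
# sketch under the same shape assumptions as calculate_ppm above
def best_adduct(theoretical_mass, measured_mz, adduct_masses, adduct_names):
    mass_difference = abs(np.subtract.outer(measured_mz, adduct_masses) -
                          theoretical_mass[:, None])
    ppm = mass_difference / theoretical_mass[:, None] * 1e6
    best = ppm.argmin(axis=1)
    return ppm.min(axis=1), [adduct_names[i] for i in best]
# tiny made-up check: a glucose-like neutral mass observed as [M+H]+
print best_adduct(np.array([180.06339]), np.array([181.07067]),
                  np.array([0.0, 1.007276]), ['[M]+', '[M+H]+'])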
In [15]:
idx = temp['polarity'] == 1
temp['detected_polarity'] = ''
temp.loc[idx,'detected_polarity'] = 'positive'  # this sets things up for metatlas
adduct_masses = np.asarray([1.007276])  # assume [M+H]+ is the adduct
temp.loc[idx,'ppm'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                    temp[idx]['precursor_mz'].values,
                                    adduct_masses)
idx = temp['polarity'] == 0
temp.loc[idx,'detected_polarity'] = 'negative'  # this sets things up for metatlas
adduct_masses = np.asarray([-1.007276])  # assume [M-H]- is the adduct
temp.loc[idx,'ppm'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                    temp[idx]['precursor_mz'].values,
                                    adduct_masses)
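To also accept permanently charged [M]+ species in positive mode (the mass shift 0 case mentioned in the calculate_ppm docstring), a hedged variant passes both neutralizations and writes the per-feature minimum to a separate, hypothetical column:
In [ ]:
# optional: allow both [M]+ (shift 0) and [M+H]+; 'ppm_with_M_plus' is my own column name
idx = temp['polarity'] == 1
temp.loc[idx,'ppm_with_M_plus'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                                temp[idx]['precursor_mz'].values,
                                                np.asarray([0.0, 1.007276]))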
In [16]:
temp.ppm.hist(bins=1000)
Out[16]:
In [27]:
temp_filtered = temp[temp.ppm < 25].copy()
fig = plt.figure()
temp_filtered.ppm.hist(bins=100)
plt.show()
In [18]:
temp.shape
Out[18]:
In [28]:
temp_filtered.shape
Out[28]:
In [29]:
temp_filtered.rename(columns={'inchi_key':'original_compound','score':'compound_score'},inplace=True)
In [33]:
len(temp_filtered.original_compound.unique())
Out[33]:
In [30]:
temp_filtered.head()
Out[30]:
In [31]:
temp_filtered.to_pickle(output_pickle)
In [32]:
temp_filtered.to_csv(output_csv)
In [ ]:
# pactolus_df.fillna(0,inplace=True)
# peak_df.fillna(0,inplace=True)
In [ ]:
ema_df_pos = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_pos_hilic_atlas_50447.pkl')
ema_df_neg = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_neg_hilic_atlas_50447.pkl')
inchikeys = list(set(ema_df_pos.inchi_key.tolist() + ema_df_neg.inchi_key.tolist()))
len(inchikeys)
In [ ]:
pactolus_inchikeys = temp_filtered.original_compound.tolist()  # inchi_key was renamed to original_compound above
In [ ]:
len(list(set(inchikeys) - set(pactolus_inchikeys)))
In [ ]:
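# compare only the first InChIKey block (the 2-D skeleton), ignoring stereochemistry and charge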
len(list(set([i.split('-')[0] for i in inchikeys]) - set([i.split('-')[0] for i in pactolus_inchikeys])))
In [ ]:
temp_filtered[(abs(temp_filtered.precursor_mz - 222.097213)<0.001)].sort_values('compound_score',ascending=False)
In [ ]:
selected = temp_filtered[(temp_filtered.filename_x.str.contains('20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-NEG_MSMLS-P1-RH_IR2_22_22')) &
                         (abs(temp_filtered.precursor_mz - 220.0827)<0.01) &
                         (temp_filtered.detected_polarity == 'negative') &
                         (abs(temp_filtered.retention_time - 5.5)<200.5)].sort_values('compound_score',ascending=False)
selected[['original_compound','ppm','compound_score','name','retention_time']]
In [ ]:
ema_df = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_posneg_monoisotopicweights.pkl')
msms = []
for i,row in ema_df.iterrows():
    spectrum_df = pd.DataFrame(row.msms)  # loop-local name so the temp DataFrame above is not clobbered
    spectrum_df['key'] = i
    msms.append(spectrum_df)
msms = pd.concat(msms)
msms_ema_df = pd.merge(ema_df[[c for c in ema_df.columns if not 'msms' in c]],msms,how='inner',left_index=True,right_on='key')
msms_ema_df.to_csv('/global/u2/b/bpb/Downloads/ema_pos_export.csv')
In [ ]:
for i,row in ema_df.iterrows():
    matching = temp_filtered[(temp_filtered.original_compound.str.contains(row.inchi_key.split('-')[0])) &
                             # (temp_filtered.precursor_mz > (row.mz - row.mz*25/1e6)) &
                             # (temp_filtered.precursor_mz < (row.mz + row.mz*25/1e6)) &
                             (temp_filtered.retention_time > row.rt_min-0.2) &
                             (temp_filtered.retention_time < row.rt_max+0.2)]
    if (matching.shape[0] == 0) & (row.msms != []):
        print 'msms=',row.msms != [], row.adduct, '%.4f'%row.mz,'%.2f'%row.rt_peak,row.label,row.inchi_key,
        print [os.path.basename(f) for f in row.filenames]
        print ''
In [ ]:
ema_df[ema_df.label=='BETAINE']
In [ ]:
temp[temp.inchi_key.str.contains('OEYIOHPDSNJKLS')]
In [ ]:
# bpb@edison09:/project/projectdirs/metatlas/projects/pactolus_trees>
# cp /project/projectdirs/openmsi/projects/level_3_trees/*.h5 .
# cp /project/projectdirs/openmsi/projects/ben_trees/*.h5 .
In [ ]:
%system ls -f /project/projectdirs/metatlas/projects/clean_pactolus_trees/ | grep OVRNDRQMDRJTHS-ZTVVOAFPSA-N
In [ ]:
%system ls -f /project/projectdirs/openmsi/projects/level_3_trees/ | grep OVRNDRQMDRJTHS-ZTVVOAFPSA-N
In [ ]:
import h5py
with h5py.File('/project/projectdirs/openmsi/projects/level_3_trees/FragTreeLibrary_test_hdf5_3_OVRNDRQMDRJTHS-ZTVVOAFPSA-N.h5') as fid:
    k = fid.keys()
    print k
    k2 = fid[k[0]].keys()
    print k2
    k3 = fid[k[0]][k2[0]]
    print k3
with h5py.File('/project/projectdirs/metatlas/projects/clean_pactolus_trees/FragTreeLibrary_test_hdf5_5_OVRNDRQMDRJTHS-ZTVVOAFPSA-N.h5') as fid:
    k = fid.keys()
    print k
    k2 = fid[k[0]].keys()
    print k2
    k3 = fid[k[0]][k2[0]]
    print k3
In [ ]:
# 25 ppm window around m/z 104.10699, with the RT bounds from the last row of the loop above
mz = 104.10699
mz_tol = mz*25/1e6
temp_filtered[(temp_filtered.precursor_mz > mz - mz_tol) &
              (temp_filtered.precursor_mz < mz + mz_tol) &
              (temp_filtered.retention_time > row.rt_min-0.2) &
              (temp_filtered.retention_time < row.rt_max+0.2)]
In [ ]:
t = np.load('/project/projectdirs/openmsi/projects/level_3_trees/tree_lookup.npy')
In [ ]:
t.shape
In [ ]:
%system ls -l /project/projectdirs/openmsi/projects/ben_trees/*.npy
In [ ]:
# df = pd.read_pickle('/project/projectdirs/metatlas/projects/mzmine_parameters/actinorhodin_dataset.pkl')
# df.to_csv('example.csv')
In [ ]:
# temp[temp.synonyms.fillna('').str.contains('prodigiosin')]
In [ ]:
# temp_filtered.head(100)
In [ ]:
# def take_best_hit(df_in, compound_df, mz_tolerance, mz, polarity):
#     """
#     given a dataframe, take the hits and filenames and return sorted scores with only the highest score remaining
#     """
#     all_hits = [hh + (f, mz, compound_df.loc[hh[1],'ms1_mass'],) for (h,f) in zip(df_in['hits'],df_in['filename']) for hh in h]
#     all_hits = sorted(all_hits)[::-1]
#     if polarity == 'positive':
#         multiplier = 1
#     else:
#         multiplier = -1
#     # remove hits that have a ppm difference greater than a threshold
#     all_hits = [h for h in all_hits if abs(h[3] - h[4] - (multiplier * 1.007276))/h[4]*1e6 < mz_tolerance]
#     df = pd.DataFrame(all_hits, columns=['score','compound_idx','filename','mz','mass'])
#     df = df[df['score'] == df.groupby(['compound_idx'])['score'].transform(max)]  # keep only the highest pactolus score
#     return [tuple(r) for r in df.values]
# peak_df['compound_indices'] = hits  # This references the indices in compound_df
In [ ]:
# peak_has_msms = [False if h=='no msms' else True for h in hits]
# peak_df = peak_df[peak_has_msms]
In [ ]:
# peak_df.reset_index(inplace=True)
In [ ]:
# items_as_cols = df.apply(lambda x: pd.Series(x['samples']), axis=1)
# # Keep original df index as a column so it's retained after melt
# items_as_cols['orig_index'] = items_as_cols.index
# melted_items = pd.melt(items_as_cols, id_vars='orig_index',
# var_name='sample_num', value_name='sample')
# melted_items.set_index('orig_index', inplace=True)
# df.merge(melted_items, left_index=True, right_index=True)
In [ ]:
# Number of hits
# Score of best hit
# Avg Score
# Median Score
# Number of hits within 50% of best hit
# List of compound names from top hits (<N)
# Show the full info for the best hit
# export hit tables for each feature
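A hedged sketch (my own column choices) of these summaries, computed from the msms_df built earlier by grouping hits per spectrum:
In [ ]:
# per-feature Pactolus summary; assumes ('filename','spectrum_index') identifies a feature
grouped = msms_df[msms_df['score'] > 0].groupby(['filename','spectrum_index'])['score']
summary = grouped.agg(['size','max','mean','median'])
summary.columns = ['num_hits','best_score','avg_score','median_score']
# number of hits within 50% of the best hit for that feature
summary['num_within_50pct'] = grouped.apply(lambda s: (s >= 0.5*s.max()).sum())
summary.head()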
In [ ]:
# peak_df['num_pactolus_hits'] = peak_df.compound_indices.apply(lambda x: len(x))
# def pactolus_stats(x):
#     best_score = None
#     worst_score = None
#     avg_score = None
#     median_score = None
#     best_score_file = None
#     best_score_precursor_mz = None
#     best_score_precursor_mass = None
#     best_score_compound_idx = None
#     if x:
#         best_score = x[0][0]
#         worst_score = x[-1][0]
#         avg_score = np.mean([xx[0] for xx in x])
#         median_score = np.median([xx[0] for xx in x])
#         best_score_file = x[0][2]
#         best_score_precursor_mass = x[0][4]
#         best_score_precursor_mz = x[0][3]
#         best_score_compound_idx = int(x[0][1])
#     return pd.Series({'best_score': best_score,
#                       'worst_score': worst_score,
#                       'avg_score': avg_score,
#                       'median_score': median_score,
#                       'best_score_file': best_score_file,
#                       'best_score_precursor_mz': best_score_precursor_mz,
#                       'best_score_precursor_mass': best_score_precursor_mass,
#                       'best_score_compound_idx': best_score_compound_idx})
# temp = peak_df.compound_indices.apply(pactolus_stats)
# peak_df[temp.columns] = temp
In [ ]:
# peak_df = peak_df.merge(compound_df2, how='inner',left_on='best_score_compound_idx', right_index=True)
In [ ]:
# peak_df.head()
In [ ]:
# peak_df.drop('compound_indices',1).to_csv('~/Downloads/peak_output.csv')
In [ ]:
# # %%time
# N=100
# items_as_cols = peak_df.head(N).apply(lambda x: pd.Series(x['compound_indices']), axis=1)
# # Keep original df index as a column so it's retained after melt
# items_as_cols['orig_index'] = items_as_cols.index
# melted_items = pd.melt(items_as_cols, id_vars='orig_index', var_name='sample_num', value_name='compound_indices')['compound_indices']
# tuple_cols = ['pactolus_score','compound_index','pactolus_file','precursor_mass','precursor_mz']
# temp = pd.DataFrame(melted_items.head(),columns=tuple_cols)
# # melted_items[]
# # temp = melted_items['compound_indices'].apply(pd.Series)
# # melted_items.set_index('orig_index', inplace=True)
# # df2 = df.head(N).merge(melted_items, left_index=True, right_index=True)
In [ ]:
# items_as_cols = peak_df.head(N).apply(lambda x: pd.Series(x['compound_indices']), axis=1)
# temp = pd.melt(items_as_cols).value.apply(pd.Series)
# temp.columns = tuple_cols
# temp.head()
In [ ]:
# temp = melted_items.to_frame()['compound_indices'].apply(pd.Series)
# temp.columns = tuple_cols
# temp.head()
In [ ]:
# temp
In [ ]:
# compound_df2.ix[[h[1] for h in peak_df.loc[5,'compound_indices']]]
export hit tables for each feature
delete hit table column from peak_df
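A hedged sketch of the export step using the tables that exist in this session rather than the commented-out peak_df (feature_hit_tables is a hypothetical subfolder; filename_x is the scan-side filename after the merge):
In [ ]:
# write one ranked hit table per feature
feature_dir = os.path.join(output_path,'feature_hit_tables')
if not os.path.isdir(feature_dir):
    os.mkdir(feature_dir)
for (fname, spec_idx), grp in temp_filtered.groupby(['filename_x','spectrum_index']):
    out_file = os.path.join(feature_dir,'%s_feature_%d_hits.csv' % (fname, spec_idx))
    grp.sort_values('compound_score', ascending=False).to_csv(out_file)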
In [ ]:
# best_hit_df = pd.DataFrame(columns=compound_df2.columns)
# empty_df = pd.DataFrame(columns=compound_df2.columns)
# for i in range(10):
#     print i
#     idx = peak_df.loc[i,'compound_indices']
#     # if idx:
#     best_hit_df = best_hit_df.append(compound_df2.ix[[idx[2]]])  # append returns a new frame, so assign it
#     # else:
#     #     best_hit_df = best_hit_df.append(empty_df)
In [ ]:
# best_hit_df
In [ ]:
# # import numpy as np
# # import pandas as pd
# from textwrap import wrap
# HEADER = '''
# <link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.13/css/jquery.dataTables.css">
# <style>
# </style>
# <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js" type="text/javascript"></script>
# <script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.13/js/jquery.dataTables.js"></script>
# <html>
# <head>
# <script >
# $(document).ready( function () { $('#table_id').DataTable() } );
# </script>
# </head>
# <body>
# '''
# FOOTER = '''
# </body>
# </html>
# <script type="text/javascript" >
# $(document).ready( function () {
# $('#table_id').DataTable();
# } );
# </script>
# '''
# # title = ax.set_title("\n".join(wrap("Some really really long long long title I really really need - and just can't - just can't - make it any - simply any - shorter - at all.", 60)))
# # df.rename(columns=lambda x: x[1:], inplace=True)
# with open('test.html', 'w') as f:
#     f.write(HEADER)
#     f.write(compound_df2.ix[[h[1] for h in peak_df.loc[1,'compound_indices']]].to_html(classes='df').replace('class="dataframe df"','id="table_id" class="hover"').replace('border="1"','td {word-wrap: break-word}'))
#     # f.write(peak_df.rename(columns=lambda x: "\n".join(wrap(x,25))).head(100).to_html(classes='df').replace('class="dataframe df"','id="table_id" class="hover"').replace('border="1"','td {word-wrap: break-word}'))
#     f.write(FOOTER)