In [4]:
import os

import numpy as np
import pandas as pd
# Destination CSV for the peak table once it has been joined to the MS/MS hits.
output_filename = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/mzmine_pos_with_pactolus.csv'
# Pickled hits table that is loaded below.
output_pickle = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/pactolus_hits.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
# reset_index() moves the stored index into ordinary column(s) and leaves a 0..n-1 index.
pactolus_df = pd.read_pickle(output_pickle).reset_index()

# Peak table; missing values are replaced with empty strings.
peak_df = pd.read_csv(
    '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/output_with_ttest_pos.csv',
    sep=',',
).fillna('')
peak_df.head()
Out[4]:
In [5]:
# Discard every hit whose source filename contains the substring 'ontrol'
# (presumably chosen to match both 'Control' and 'control' -- confirm).
is_control_file = pactolus_df.filename_x.str.contains('ontrol')
# reset_index() without drop=True keeps the previous index as a column, so
# running this cell twice in one kernel adds/collides on that column.
pactolus_df = pactolus_df[~is_control_file].reset_index()
In [6]:
def get_matching_msms(row,msms_df=None,mz_tolerance=10,rt_tolerance=None,mz_field = 'm/z',polarity='positive'):
    """
    Locate the rows of ``msms_df`` that match a single peak.

    A row of ``msms_df`` matches when all three conditions hold:

    * its ``precursor_mz`` is within ``mz_tolerance`` parts-per-million of
      ``row[mz_field]``;
    * its ``retention_time`` is within ``rt_tolerance`` of ``row['rt_min']``
      when ``rt_tolerance`` is given (and non-zero), otherwise strictly
      between ``row['rt_min']`` and ``row['rt_max']``;
    * its ``detected_polarity`` equals ``polarity``.

    Parameters
    ----------
    row : mapping or pandas.Series
        The peak. Must provide ``mz_field`` and ``'rt_min'``; ``'rt_max'`` is
        only read when ``rt_tolerance`` is not used.
    msms_df : pandas.DataFrame
        Table with ``precursor_mz``, ``retention_time`` and
        ``detected_polarity`` columns. Required despite the ``None`` default.
    mz_tolerance : float
        Maximum relative m/z error, in ppm (exclusive bound).
    rt_tolerance : float or None
        If truthy, maximum absolute distance from ``row['rt_min']``
        (exclusive bound). ``None`` or ``0`` selects the rt_min/rt_max window.
    mz_field : str
        Key of ``row`` holding the peak m/z.
    polarity : str
        Value that ``detected_polarity`` must equal.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``numpy.where``: a 1-tuple whose only element holds the
        positional indices of the matching rows of ``msms_df``.
    """
    precursor_mz = msms_df['precursor_mz'].values
    retention_time = msms_df['retention_time'].values
    # Always read the peak m/z through mz_field; the tolerance branch used to
    # hard-code row['m/z'] and ignore the caller's choice of column.
    peak_mz = row[mz_field]

    mz_match = abs(precursor_mz - peak_mz) / peak_mz * 1.0e6 < mz_tolerance
    polarity_match = np.asarray(msms_df['detected_polarity'] == polarity)

    if rt_tolerance:
        # Build the comparison as its own mask: `&` binds tighter than `<`, so
        # an unparenthesised `a & b < tol & c` does not mean what it reads as.
        rt_match = abs(retention_time - row['rt_min']) < rt_tolerance
    else:
        rt_match = (retention_time > row['rt_min']) & (retention_time < row['rt_max'])

    return np.where(mz_match & rt_match & polarity_match)
# Match every peak (one row of peak_df) against the MS/MS table. Each result
# is the tuple returned by np.where, so keep only its first element: the array
# of matching row positions in pactolus_df.
hits = peak_df.apply(get_matching_msms,msms_df=pactolus_df,mz_field='mz',polarity='positive',axis=1)
hits = hits.apply(lambda x: x[0])
df = pd.DataFrame({'scores':hits})

# Long-format link table: one (peak_index, msms_index) row per match.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
peak_to_msms_df = pd.DataFrame(
    [[peak_index, msms_index]
     for peak_index, msms_indices in df.scores.apply(list).items()
     for msms_index in msms_indices],
    columns=['peak_index','msms_index'])

# Outer merge keeps peaks that have no matching spectrum (msms_index is NaN
# for them); the left merge then attaches the columns of the matched
# pactolus_df row, looked up by index.
labeled_df = peak_df.merge(peak_to_msms_df,how='outer',left_index=True,right_on='peak_index')
temp = labeled_df.merge(pactolus_df,how='left',left_on='msms_index',right_index=True)
temp.to_csv(output_filename)

# Highest score first (ties broken by label), then keep at most five rows per label.
temp = temp.sort_values(by=['score','label'],ascending=[False,True])
temp_top5 = temp.groupby('label').head(5)

# Split off the extension with os.path.splitext rather than str.split('.'),
# which would cut at the first dot anywhere in the path (e.g. a dotted
# directory name). `first` / `last` keep the values they had before.
first, extension = os.path.splitext(output_filename)
last = extension[1:]
temp_top5.to_csv(first+'_top5'+extension)
In [4]:
In [5]:
# Spot check on the top-5 table: rows whose label starts with the given prefix
# (presumably an "m/z@retention time" feature label -- confirm).
top5_label_match = temp_top5.label.str.startswith('331.1894@5.92')
small_slice = temp_top5[top5_label_match]
# Show only the label and score columns, first 10 rows.
small_slice[['label','score']].head(10)
Out[5]:
In [6]:
# Same spot check against the full (not top-5) table, for comparison:
# rows whose label starts with the given prefix.
all_label_match = temp.label.str.startswith('331.1894@5.92')
small_slice = temp[all_label_match]
# Show only the label and score columns, first 100 rows.
small_slice[['label','score']].head(100)
Out[6]:
In [ ]: