In [4]:
import os

import numpy as np
import pandas as pd
# Directory holding every file this notebook reads or writes. It is defined
# once so the three paths below cannot drift apart.
results_dir = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2'
output_filename = os.path.join(results_dir, 'mzmine_pos_with_pactolus.csv')
# Despite its name this pickle is read, not written, by this notebook.
output_pickle = os.path.join(results_dir, 'pactolus_hits.pkl')
peak_table_filename = os.path.join(results_dir, 'output_with_ttest_pos.csv')

# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# come from a trusted source.
# reset_index() moves the pickled index into a column and leaves a 0..n-1 index.
pactolus_df = pd.read_pickle(output_pickle).reset_index()

# Replace every missing value in the peak table with an empty string.
peak_df = pd.read_csv(peak_table_filename, sep=',').fillna('')
peak_df.head()


Out[4]:
Unnamed: 0 index Unnamed: 0.1 label adduct_assignments mz mz_tolerance rt_peak rt_min rt_max ... 20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r4_IR1_97_97.mzML Peak height 20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r5_IR1_130_130.mzML Peak height 20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r2_IR1_64_64.mzML Peak height max_intensity p_value t_score log2_fold_change median_control median_treatment max_of_medians
0 31 31 31 331.1894@5.92 331.189422 8 5.920321 5.820321 6.277215 ... 4.955479e+08 1.243879e+08 3.951449e+08 4.955479e+08 0.049405 -2.455725 5.894109 6.609905e+05 1.292694e+08 1.292694e+08
1 163 163 163 331.1896@6.34 331.189609 8 6.338763 6.238763 6.718822 ... 9.404695e+07 4.335052e+07 9.175287e+07 9.404695e+07 0.034651 -2.719825 2.970632 9.860107e+05 3.907345e+07 3.907345e+07
2 172 172 172 287.1636@6.67 287.163611 8 6.670176 6.570176 6.989447 ... 9.178425e+07 5.226471e+07 1.214487e+08 1.214487e+08 0.046355 -2.502760 3.080708 5.756308e+04 3.804697e+07 3.804697e+07
3 198 199 199 301.1793@4.80 301.179310 8 4.796742 4.696742 4.925015 ... 9.292590e+07 2.593150e+07 6.043557e+07 9.292590e+07 0.021410 -3.089192 6.406881 1.615787e+05 3.992997e+07 3.992997e+07
4 434 424 424 315.1948@7.45 315.194752 8 7.452940 7.352940 7.662237 ... 8.258138e+06 5.770330e+06 5.606400e+06 1.250258e+07 0.024043 -2.998892 1.833565 1.244034e+06 4.836730e+06 4.836730e+06

5 rows × 61 columns


In [5]:
# Drop every row whose filename_x contains the substring 'ontrol'
# (presumably catches both "Control" and "control" files -- confirm).
is_control_file = pactolus_df['filename_x'].str.contains('ontrol')
# reset_index() keeps the previous index as a column and renumbers rows 0..n-1.
pactolus_df = pactolus_df.loc[~is_control_file].reset_index()

In [6]:
def get_matching_msms(row,msms_df=None,mz_tolerance=10,rt_tolerance=None,mz_field = 'm/z',polarity='positive'):
    """
    Find the rows of ``msms_df`` that match a single peak.

    A row of ``msms_df`` matches when all of the following hold:

    * its ``precursor_mz`` is within ``mz_tolerance`` ppm of ``row[mz_field]``;
    * its ``detected_polarity`` equals ``polarity``;
    * its ``retention_time`` is either strictly between ``row['rt_min']`` and
      ``row['rt_max']`` (when ``rt_tolerance`` is None or 0), or less than
      ``rt_tolerance`` away from ``row['rt_min']`` (when ``rt_tolerance`` is
      given).

    Parameters
    ----------
    row : mapping / pandas.Series
        One peak; must provide ``mz_field``, ``'rt_min'`` and, when
        ``rt_tolerance`` is not used, ``'rt_max'``.
    msms_df : pandas.DataFrame
        Must provide the columns ``'precursor_mz'``, ``'retention_time'`` and
        ``'detected_polarity'``.
    mz_tolerance : number
        Maximum m/z error in parts per million (exclusive).
    rt_tolerance : number or None
        Optional retention-time tolerance; a falsy value selects the
        ``rt_min``/``rt_max`` window instead.
    mz_field : str
        Key of ``row`` that holds the peak m/z.
    polarity : str
        Required value of ``detected_polarity``.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: a 1-tuple holding the *positional* indices
        of the matching rows of ``msms_df``.
    """
    precursor_mz = msms_df['precursor_mz'].values
    retention_time = msms_df['retention_time'].values
    # Always read the peak m/z through mz_field; the rt_tolerance branch used
    # to hard-code row['m/z'] and failed for any other field name.
    peak_mz = row[mz_field]

    ppm_error = np.abs(precursor_mz - peak_mz) / peak_mz * 1.0e6
    matches = (ppm_error < mz_tolerance) & (msms_df['detected_polarity'] == polarity).values

    if rt_tolerance:
        # Each comparison is parenthesised: '&' binds tighter than '<', so the
        # unparenthesised form does not compute the intended mask.
        # NOTE(review): the tolerance is measured from rt_min, as in the
        # original code -- confirm that rt_min (not a peak RT) is intended.
        matches = matches & (np.abs(retention_time - row['rt_min']) < rt_tolerance)
    else:
        matches = matches & (retention_time > row['rt_min']) & (retention_time < row['rt_max'])
    return np.where(matches)

# For every peak, find the positional indices of the matching rows of
# pactolus_df. These positions are later joined against pactolus_df's index,
# which only lines up because pactolus_df was given a fresh 0..n-1 index.
hits = peak_df.apply(get_matching_msms,msms_df=pactolus_df,mz_field='mz',polarity='positive',axis=1)
# get_matching_msms returns np.where's 1-tuple; keep only the index array.
hits = hits.apply(lambda x: x[0])
df = pd.DataFrame({'scores':hits})

# Long-format link table: one (peak_index, msms_index) row per match.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
peak_to_msms_df = pd.DataFrame(
    [[peak_index, msms_index]
     for peak_index, msms_indices in df.scores.apply(list).items()
     for msms_index in msms_indices],
    columns=['peak_index','msms_index'])

# The outer merge keeps peaks without any match (their msms_index is NaN);
# the left merge then attaches the pactolus_df columns for each match.
labeled_df = peak_df.merge(peak_to_msms_df,how='outer',left_index=True,right_on='peak_index')
temp = labeled_df.merge(pactolus_df,how='left',left_on='msms_index',right_index=True)
temp.to_csv(output_filename)

# Highest score first, ties ordered by label, then keep five rows per label.
temp = temp.sort_values(by=['score','label'],ascending=[False,True])
temp_top5 = temp.groupby('label').head(5)

# splitext splits on the final extension only, so a dot elsewhere in the path
# cannot corrupt the derived file name.
top5_root, top5_ext = os.path.splitext(output_filename)
temp_top5.to_csv(top5_root + '_top5' + top5_ext)

In [4]:


In [5]:
# Show label and score for the top-5 rows whose label starts with one peak id.
label_matches = temp_top5['label'].str.startswith('331.1894@5.92')
small_slice = temp_top5[label_matches]
small_slice[['label', 'score']].head(10)


Out[5]:
label score

In [6]:
# Same inspection against the full merged table instead of the top-5 subset.
label_matches = temp['label'].str.startswith('331.1894@5.92')
small_slice = temp[label_matches]
small_slice[['label', 'score']].head(100)


Out[6]:
label score

In [ ]: