notebook.community

Edit and run



In [4]:

    
import os

import numpy as np
import pandas as pd
# Destination CSV for the peak table annotated with MS/MS hits.
output_filename = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/mzmine_pos_with_pactolus.csv'
# Pickled hit table that is read below (despite the "output_" prefix it is an input here).
output_pickle = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/pactolus_hits.pkl'

# NOTE(review): read_pickle executes arbitrary code on load; only use on trusted files.
# reset_index() moves the stored index into a regular column.
pactolus_df = pd.read_pickle(output_pickle).reset_index()

# Peak table; missing values are replaced by empty strings.
peak_df = pd.read_csv(
    '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/output_with_ttest_pos.csv',
    sep=',',
).fillna('')
peak_df.head()









    Out[4]:







  
    
      
      Unnamed: 0
      index
      Unnamed: 0.1
      label
      adduct_assignments
      mz
      mz_tolerance
      rt_peak
      rt_min
      rt_max
      ...
      20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r4_IR1_97_97.mzML Peak height
      20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r5_IR1_130_130.mzML Peak height
      20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r2_IR1_64_64.mzML Peak height
      max_intensity
      p_value
      t_score
      log2_fold_change
      median_control
      median_treatment
      max_of_medians
    
  
  
    
      0
      31
      31
      31
      331.1894@5.92
      
      331.189422
      8
      5.920321
      5.820321
      6.277215
      ...
      4.955479e+08
      1.243879e+08
      3.951449e+08
      4.955479e+08
      0.049405
      -2.455725
      5.894109
      6.609905e+05
      1.292694e+08
      1.292694e+08
    
    
      1
      163
      163
      163
      331.1896@6.34
      
      331.189609
      8
      6.338763
      6.238763
      6.718822
      ...
      9.404695e+07
      4.335052e+07
      9.175287e+07
      9.404695e+07
      0.034651
      -2.719825
      2.970632
      9.860107e+05
      3.907345e+07
      3.907345e+07
    
    
      2
      172
      172
      172
      287.1636@6.67
      
      287.163611
      8
      6.670176
      6.570176
      6.989447
      ...
      9.178425e+07
      5.226471e+07
      1.214487e+08
      1.214487e+08
      0.046355
      -2.502760
      3.080708
      5.756308e+04
      3.804697e+07
      3.804697e+07
    
    
      3
      198
      199
      199
      301.1793@4.80
      
      301.179310
      8
      4.796742
      4.696742
      4.925015
      ...
      9.292590e+07
      2.593150e+07
      6.043557e+07
      9.292590e+07
      0.021410
      -3.089192
      6.406881
      1.615787e+05
      3.992997e+07
      3.992997e+07
    
    
      4
      434
      424
      424
      315.1948@7.45
      
      315.194752
      8
      7.452940
      7.352940
      7.662237
      ...
      8.258138e+06
      5.770330e+06
      5.606400e+06
      1.250258e+07
      0.024043
      -2.998892
      1.833565
      1.244034e+06
      4.836730e+06
      4.836730e+06
    
  

5 rows × 61 columns



In [5]:

    
# Drop hits whose source filename contains 'ontrol' (matches both "Control" and "control").
# na=False treats a missing filename as "not a control" so the boolean inversion
# cannot fail on NaN entries.
is_control = pactolus_df.filename_x.str.contains('ontrol', na=False)
pactolus_df = pactolus_df[~is_control]
# Renumber rows 0..n-1; the previous index is kept as a column.
# NOTE: this cell is not idempotent - every run adds another saved-index column,
# so re-run the loading cell before re-running this one.
pactolus_df.reset_index(inplace=True)



In [6]:

    
def get_matching_msms(row,msms_df=None,mz_tolerance=10,rt_tolerance=None,mz_field = 'm/z',polarity='positive'):
    """
    Locate the rows of ``msms_df`` that match a single peak.

    A row of ``msms_df`` matches when all of the following hold:

    * its ``precursor_mz`` differs from ``row[mz_field]`` by less than
      ``mz_tolerance`` parts per million (relative to ``row[mz_field]``);
    * its ``detected_polarity`` equals ``polarity``;
    * its ``retention_time`` is within ``rt_tolerance`` of ``row['rt_min']``
      when ``rt_tolerance`` is truthy, otherwise strictly between
      ``row['rt_min']`` and ``row['rt_max']``.

    Parameters
    ----------
    row : mapping
        One peak; must provide ``mz_field``, ``'rt_min'`` and (when
        ``rt_tolerance`` is not used) ``'rt_max'``.
    msms_df : pandas.DataFrame
        Table with ``precursor_mz``, ``retention_time`` and
        ``detected_polarity`` columns.
    mz_tolerance : number
        Maximum m/z error in ppm (exclusive).
    rt_tolerance : number or None
        If truthy, maximum absolute retention-time distance (exclusive).
    mz_field : str
        Key of the peak m/z value in ``row``.
    polarity : str
        Required value of ``detected_polarity``.

    Returns
    -------
    tuple
        The result of ``numpy.where``: a 1-tuple whose only element is the
        array of positional row indices of ``msms_df`` that match.
    """
    precursor_mz = msms_df['precursor_mz'].values
    retention_time = msms_df['retention_time'].values
    peak_mz = row[mz_field]

    ppm_error = abs(precursor_mz - peak_mz) / peak_mz * 1.0e6
    mz_ok = ppm_error < mz_tolerance
    polarity_ok = (msms_df['detected_polarity'] == polarity).values

    if rt_tolerance:
        # NOTE(review): the distance is measured from rt_min, as in the original
        # code; confirm whether the peak apex (rt_peak) was intended instead.
        rt_ok = abs(retention_time - row['rt_min']) < rt_tolerance
    else:
        rt_ok = (retention_time > row['rt_min']) & (retention_time < row['rt_max'])

    # Each comparison is evaluated separately before combining: '&' binds
    # tighter than '<', so mixing them without parentheses is an error.
    return np.where(mz_ok & rt_ok & polarity_ok)

# For every peak, find the positions of the matching rows in pactolus_df.
hits = peak_df.apply(get_matching_msms,msms_df=pactolus_df,mz_field='mz',polarity='positive',axis=1)
# get_matching_msms returns the np.where tuple; keep only its index array.
hits = hits.apply(lambda x: x[0])
df = pd.DataFrame({'scores':hits})

# Long-format link table: one row per (peak, matching MS/MS row) pair.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
peak_to_msms_df = pd.DataFrame(
    [[peak_index, msms_index]
     for peak_index, msms_indices in df.scores.apply(list).items()
     for msms_index in msms_indices],
    columns=['peak_index','msms_index'])

# Outer merge keeps peaks that have no MS/MS match at all.
labeled_df = peak_df.merge(peak_to_msms_df,how='outer',left_index=True,right_on='peak_index')
# msms_index holds positions from np.where, joined here against pactolus_df's
# index - this relies on pactolus_df carrying a fresh 0..n-1 index.
temp = labeled_df.merge(pactolus_df,how='left',left_on='msms_index',right_index=True)
temp.to_csv(output_filename)

# Highest score first (ties broken by label), then keep the first five rows per peak label.
temp.sort_values(by=['score','label'],ascending=[False,True],inplace=True)
temp_top5 = temp.groupby('label').head(5)

# splitext only splits at the final extension, so dots elsewhere in the path are safe.
stem, extension = os.path.splitext(output_filename)
temp_top5.to_csv(stem + '_top5' + extension)



In [4]:



In [5]:

    
# Spot check: top-5 rows whose peak label starts with the label of interest.
small_slice = temp_top5.loc[temp_top5['label'].str.startswith('331.1894@5.92')]
small_slice.loc[:, ['label','score']].head(10)









    Out[5]:






  
    
      
      label
      score



In [6]:

    
# Same spot check against the full (untruncated) merged table.
small_slice = temp.loc[temp['label'].str.startswith('331.1894@5.92')]
small_slice.loc[:, ['label','score']].head(100)









    Out[6]:






  
    
      
      label
      score



In [ ]:

	Unnamed: 0	index	Unnamed: 0.1	label	mz	mz_tolerance	rt_peak	rt_min	rt_max	...	20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r4_IR1_97_97.mzML Peak height	20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E22-r5_IR1_130_130.mzML Peak height	20161027_SK-MR_SupprSoils_EthylAcetate2_QE144_EPC18-USDAY32305_POS_E23-r2_IR1_64_64.mzML Peak height	max_intensity	p_value	t_score	log2_fold_change	median_control	median_treatment	max_of_medians
0	31	31	31	331.1894@5.92	331.189422	8	5.920321	5.820321	6.277215	...	4.955479e+08	1.243879e+08	3.951449e+08	4.955479e+08	0.049405	-2.455725	5.894109	6.609905e+05	1.292694e+08	1.292694e+08
1	163	163	163	331.1896@6.34	331.189609	8	6.338763	6.238763	6.718822	...	9.404695e+07	4.335052e+07	9.175287e+07	9.404695e+07	0.034651	-2.719825	2.970632	9.860107e+05	3.907345e+07	3.907345e+07
2	172	172	172	287.1636@6.67	287.163611	8	6.670176	6.570176	6.989447	...	9.178425e+07	5.226471e+07	1.214487e+08	1.214487e+08	0.046355	-2.502760	3.080708	5.756308e+04	3.804697e+07	3.804697e+07
3	198	199	199	301.1793@4.80	301.179310	8	4.796742	4.696742	4.925015	...	9.292590e+07	2.593150e+07	6.043557e+07	9.292590e+07	0.021410	-3.089192	6.406881	1.615787e+05	3.992997e+07	3.992997e+07
4	434	424	424	315.1948@7.45	315.194752	8	7.452940	7.352940	7.662237	...	8.258138e+06	5.770330e+06	5.606400e+06	1.250258e+07	0.024043	-2.998892	1.833565	1.244034e+06	4.836730e+06	4.836730e+06