In [1]:
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas/')
sys.path.insert(1,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import metatlas.metatlas_objects as metob
from matplotlib import pyplot as plt
import glob
%matplotlib notebook
from metatlas.helpers import mzmine_helpers as mzm
from metatlas.helpers import pactolus_tools as pt
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)
In [3]:
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/20160824_C18_LIPID___POS_mzmine_output.csv'
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Scoelicolor_media_WT_mzmine_output.csv'
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Psim_super_C18_pos_mzmine_output.csv'
# peak_file = '/global/homes/b/bpb/Psim_super_C18_pos_mzmine_output_456peaks.csv'
# Psim_pellet_C18_neg
# hedlund_jad2_and_media
# peak_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/Psim_super_C18_neg_mzmine_output.csv'
# pactolus_results = '/scratch2/scratchdirs/bpb/pactolus_runs/hedlund_jad2_and_media/'
# pactolus_results = '/global/cscratch1/sd/bpb/pactolus_runs/rexmalm_pos_super/'
# pactolus_results = '/global/project/projectdirs/metatlas/projects/jgi_projects/Pactolus_Results_20170512_SK_-MR_SupprSoils_EthylAc2/'
pactolus_results = '/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel/Pactolus_Results_20160824_KBL_SolarPanel_MP/'
# pactolus_results = '/project/projectdirs/metatlas/projects/pactolus_runs/20170317_SK_Arkin_PseudoAbxCsource/'
# my_polarity = 'negative'
trees = glob.glob('/global/cscratch1/sd/bpb/level_3_trees/*.h5')
ref_df = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/unique_compounds.pkl')
In [5]:
output_path = '/global/project/projectdirs/metatlas/projects/jgi_projects/SolarPanel/Pactolus_Results_20160824_KBL_SolarPanel_MP/'
if not os.path.isdir(output_path):
    os.mkdir(output_path)
output_pickle = os.path.join(output_path,'pactolus_hits.pkl')
output_csv = os.path.join(output_path,'pactolus_hits.csv')
In [6]:
pactolus_files = [f for f in glob.glob(os.path.join(pactolus_results,'*.h5')) if 'pactolus_results' in os.path.basename(f.lower())]
print len(pactolus_files)
# pactolus_files = [f for f in glob.glob(pactolus_results+'*.h5') if 'pactolus_results_' in f.lower()]
In [7]:
# pactolus_results = '/global/cscratch1/sd/bpb/pactolus_runs/Cori_20161209_Manuel_SolarPanel'
max_processes = 64
pt = reload(pt)
pool = mp.Pool(processes=min(max_processes, len(pactolus_files)))
output = pool.map(pt.read_pacolus_results, pactolus_files)
pool.close()
pool.join()  # map() is synchronous, so just wait for the workers to exit cleanly
for i,f in enumerate(pactolus_files):
    fname = os.path.basename(f).split('.')[0]
    output[i][0]['filename'] = fname
#output has scan_df,tree_df
In [8]:
# important to note that compound==0 is a spectrum with no pactolus hits
msms_df = pd.concat([df[0] for df in output], axis=0)
msms_df.rename(columns={'precursor mz':'precursor_mz','retention time':'retention_time','index':'spectrum_index'},inplace=True)
msms_df['compound'] = msms_df['compound'].fillna(0.0).astype(int)
msms_df['compound_index'] = msms_df['compound']
msms_df.reset_index(inplace=True)
msms_df = msms_df.drop(['index'],1)
msms_df['score'].fillna(0,inplace=True)
msms_df.head()
Out[8]:
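A quick hedged aside (my addition, not in the original run): compound==0 above doubles as the no-hit placeholder, so a genuine-hit subset is best taken on score, which is strictly positive for real Pactolus matches (the scorer keeps hits where score > min_score with min_score=0, and no-hit rows were filled with 0).
In [ ]:
# spectra with no hits were filled with score 0 above; real hits have score > 0
print (msms_df['score'] == 0).sum(), 'no-hit rows;', (msms_df['score'] > 0).sum(), 'candidate hits'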
In [8]:
# pactolus_path = '/project/projectdirs/metatlas/projects/pactolus_runs/'
# pactolus_file = 'pactolus_results_20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5'
# import h5py
# min_score=0
# with h5py.File('/project/projectdirs/metatlas/projects/magi_paper/pactolus_results/pactolus_results_20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-POS_MSMLS-PKZ-R9_IR1_148_148.h5','r') as fid:
#     # read score_matrix and convert the all-by-all matrix to lists of scores
#     idx = range(fid['score_matrix'].shape[0])
#     d = {'retention time': fid['scan_metadata']['peak_rt'][idx],
#          'precursor intensity': fid['scan_metadata']['peak_intensity'][idx],
#          'precursor mz': fid['scan_metadata']['peak_mz'][idx],
#          'polarity': fid['scan_metadata']['polarity'][idx],
#          'index': idx}
#     scan_df = pd.DataFrame(d)
#     scan_df['filename'] = pactolus_file
#     m = fid['score_matrix'][:]
#     hits = []
#     for mm in m:
#         idx = np.where(mm>min_score)[0]
#         hits.append(sorted([(mm[i],i) for i in idx])[::-1])
#     df = pd.DataFrame({'scores':hits})
#     b_flat = pd.DataFrame([[i, x[0], x[1]]
#                            for i, y in df.scores.apply(list).iteritems()
#                            for x in y], columns=['index','score','compound']).set_index('index')
#     scan_df = scan_df.merge(b_flat, how='outer', left_index=True, right_index=True)
#     # get a list of True/False flags for whether a compound has any hits:
#     f = np.any(m.T>min_score, axis=1)
#     # only do this for the compounds that get a hit
#     idx = np.where(f)[0]  # range(fid['score_matrix'].shape[1])
#     # lookup = fid['tree_file_lookup_table'][:]
#     d = {'filename': fid['tree_file_lookup_table']['filename'][idx],
#          'ms1_mass': fid['tree_file_lookup_table']['ms1_mass'][idx],
#          'inchi': fid['tree_file_lookup_table']['inchi'][idx],
#          'permanent_charge': fid['tree_file_lookup_table']['permanent_charge'][idx],
#          'index': idx}
#     # get the inchikey like this:
#     d['inchi_key'] = [os.path.basename(a).split('.')[0].split('_')[-1] for a in fid['tree_file_lookup_table']['filename'][idx]]
#     tree_df = pd.DataFrame(d)
#     # tree_df.set_index('index', drop=True, inplace=True)
# # return scan_df, tree_df
In [9]:
compound_df = pd.concat([df[1] for df in output], axis=0)
compound_df.drop_duplicates(inplace=True)
compound_df.set_index('index',drop=True,inplace=True)
In [10]:
cols_to_use = list(set(ref_df.columns.tolist()) - set(compound_df.columns.tolist()))
cols_to_use.append('inchi_key')
compound_df2 = compound_df.reset_index().merge(ref_df[cols_to_use],on='inchi_key',how='inner').reset_index().set_index('index').drop(['level_0'],1)
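One hedged sanity check (my addition, not in the original run): an inner merge on a duplicated inchi_key would silently multiply compound rows, so it is worth confirming the key is unique in ref_df and that the merged index stayed unique.
In [ ]:
# assumption: inchi_key identifies one reference compound
print ref_df.inchi_key.is_unique, compound_df2.index.is_unique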
In [11]:
compound_df2.head()
Out[11]:
In [12]:
temp = msms_df.merge(compound_df2,how='left',left_on='compound',right_index=True)
In [13]:
temp.inchi_key.fillna('',inplace=True)
In [16]:
# temp.to_pickle('/global/homes/b/bpb/Downloads/pseudo5_pactolus.pkl')
In [14]:
def calculate_ppm(theoretical_mass, measured_mz, adduct_masses):
    """
    Given a theoretical mass vector, a measured m/z vector, and an adduct mass
    vector, return the minimum ppm difference across the adducts for each feature.
    #### TODO: add in the adduct names and, with an argmin, list the adduct name for each pactolus hit ####
    Note:
        theoretical_mass and measured_mz must be [N,1] where N is the number
        of features.
        adduct_masses must be [1,M] where M is the number of adducts.
        Typical adduct masses for positive mode are [0, 1.007276];
        typical adduct masses for negative mode are [-1.007276].
    """
    measured_mz = np.outer(measured_mz, np.ones(adduct_masses.shape))
    theoretical_mass = np.outer(theoretical_mass, np.ones(adduct_masses.shape))
    mz_neutralizations = np.outer(np.ones(measured_mz.shape[0]), adduct_masses)
    mass_difference = abs(measured_mz - mz_neutralizations - theoretical_mass)
    ppm_difference = np.divide(mass_difference, theoretical_mass) * 1e6
    return ppm_difference.min(axis=1)
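A hedged sketch of the docstring's TODO (best_adduct and adduct_names are my own, hypothetical names): the same broadcasted difference matrix, with argmin used to report which adduct best explains each feature.
In [ ]:
# sketch under the same shape assumptions as calculate_ppm above
def best_adduct(theoretical_mass, measured_mz, adduct_masses, adduct_names):
    mass_difference = abs(np.subtract.outer(measured_mz, adduct_masses) -
                          theoretical_mass[:, None])
    ppm = mass_difference / theoretical_mass[:, None] * 1e6
    best = ppm.argmin(axis=1)
    return ppm.min(axis=1), [adduct_names[i] for i in best]
# tiny made-up check: a glucose-like neutral mass observed as [M+H]+
print best_adduct(np.array([180.06339]), np.array([181.07067]),
                  np.array([0.0, 1.007276]), ['[M]+', '[M+H]+'])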
In [15]:
idx = temp['polarity'] == 1
temp['detected_polarity'] = ''
temp.loc[idx,'detected_polarity'] = 'positive'  # this sets things up for metatlas
adduct_masses = np.asarray([1.007276])  # assume [M+H]+ is the adduct
temp.loc[idx,'ppm'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                    temp[idx]['precursor_mz'].values,
                                    adduct_masses)
idx = temp['polarity'] == 0
temp.loc[idx,'detected_polarity'] = 'negative'  # this sets things up for metatlas
adduct_masses = np.asarray([-1.007276])  # assume [M-H]- is the adduct
temp.loc[idx,'ppm'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                    temp[idx]['precursor_mz'].values,
                                    adduct_masses)
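To also accept permanently charged [M]+ species in positive mode (the mass shift 0 case mentioned in the calculate_ppm docstring), a hedged variant passes both neutralizations and writes the per-feature minimum to a separate, hypothetical column:
In [ ]:
# optional: allow both [M]+ (shift 0) and [M+H]+; 'ppm_with_M_plus' is my own column name
idx = temp['polarity'] == 1
temp.loc[idx,'ppm_with_M_plus'] = calculate_ppm(temp[idx]['mono_isotopic_molecular_weight'].values,
                                                temp[idx]['precursor_mz'].values,
                                                np.asarray([0.0, 1.007276]))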
In [16]:
temp.ppm.hist(bins=1000)
Out[16]:
In [27]:
temp_filtered = temp[temp.ppm < 25].copy()
fig = plt.figure()
temp_filtered.ppm.hist(bins=100)
plt.show()
In [18]:
temp.shape
Out[18]:
In [28]:
temp_filtered.shape
Out[28]:
In [29]:
temp_filtered.rename(columns={'inchi_key':'original_compound','score':'compound_score'},inplace=True)
In [33]:
len(temp_filtered.original_compound.unique())
Out[33]:
In [30]:
temp_filtered.head()
Out[30]:
In [31]:
temp_filtered.to_pickle(output_pickle)
In [32]:
temp_filtered.to_csv(output_csv)
In [ ]:
# pactolus_df.fillna(0,inplace=True)
# peak_df.fillna(0,inplace=True)
In [ ]:
ema_df_pos = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_pos_hilic_atlas_50447.pkl')
ema_df_neg = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_neg_hilic_atlas_50447.pkl')
inchikeys = list(set(ema_df_pos.inchi_key.tolist() + ema_df_neg.inchi_key.tolist()))
len(inchikeys)
In [ ]:
pactolus_inchikeys = temp_filtered.original_compound.tolist()  # inchi_key was renamed to original_compound above
In [ ]:
len(list(set(inchikeys) - set(pactolus_inchikeys)))
In [ ]:
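# compare only the first InChIKey block (the 2-D skeleton), ignoring stereochemistry and charge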
len(list(set([i.split('-')[0] for i in inchikeys]) - set([i.split('-')[0] for i in pactolus_inchikeys])))
In [ ]:
temp_filtered[(abs(temp_filtered.precursor_mz - 222.097213)<0.001)].sort_values('compound_score',ascending=False)
In [ ]:
selected = temp_filtered[(temp_filtered.filename_x.str.contains('20161209_SK_Standards_MSMLS_QE144_50447-638867_MS1_MSMS-NEG_MSMLS-P1-RH_IR2_22_22')) &
                         (abs(temp_filtered.precursor_mz - 220.0827)<0.01) &
                         (temp_filtered.detected_polarity == 'negative') &
                         (abs(temp_filtered.retention_time - 5.5)<200.5)].sort_values('compound_score',ascending=False)
selected[['original_compound','ppm','compound_score','name','retention_time']]
In [ ]:
ema_df = pd.read_pickle('/project/projectdirs/metatlas/projects/magi_paper/ema_posneg_monoisotopicweights.pkl')
msms = []
for i,row in ema_df.iterrows():
    spectrum_df = pd.DataFrame(row.msms)  # loop-local name so the temp DataFrame above is not clobbered
    spectrum_df['key'] = i
    msms.append(spectrum_df)
msms = pd.concat(msms)
msms_ema_df = pd.merge(ema_df[[c for c in ema_df.columns if not 'msms' in c]],msms,how='inner',left_index=True,right_on='key')
msms_ema_df.to_csv('/global/u2/b/bpb/Downloads/ema_pos_export.csv')
In [ ]:
for i,row in ema_df.iterrows():
    matching = temp_filtered[(temp_filtered.original_compound.str.contains(row.inchi_key.split('-')[0])) &
                             # (temp_filtered.precursor_mz > (row.mz - row.mz*25/1e6)) &
                             # (temp_filtered.precursor_mz < (row.mz + row.mz*25/1e6)) &
                             (temp_filtered.retention_time > row.rt_min-0.2) &
                             (temp_filtered.retention_time < row.rt_max+0.2)]
    if (matching.shape[0] == 0) & (row.msms != []):
        print 'msms=',row.msms != [], row.adduct, '%.4f'%row.mz,'%.2f'%row.rt_peak,row.label,row.inchi_key,
        print [os.path.basename(f) for f in row.filenames]
        print ''
In [ ]:
ema_df[ema_df.label=='BETAINE']
In [ ]:
temp[temp.inchi_key.str.contains('OEYIOHPDSNJKLS')]
In [ ]:
# bpb@edison09:/project/projectdirs/metatlas/projects/pactolus_trees>
# cp /project/projectdirs/openmsi/projects/level_3_trees/*.h5 .
# cp /project/projectdirs/openmsi/projects/ben_trees/*.h5 .
In [ ]:
%system ls -f /project/projectdirs/metatlas/projects/clean_pactolus_trees/ | grep OVRNDRQMDRJTHS-ZTVVOAFPSA-N
In [ ]:
%system ls -f /project/projectdirs/openmsi/projects/level_3_trees/ | grep OVRNDRQMDRJTHS-ZTVVOAFPSA-N
In [ ]:
import h5py
with h5py.File('/project/projectdirs/openmsi/projects/level_3_trees/FragTreeLibrary_test_hdf5_3_OVRNDRQMDRJTHS-ZTVVOAFPSA-N.h5') as fid:
    k = fid.keys()
    print k
    k2 = fid[k[0]].keys()
    print k2
    k3 = fid[k[0]][k2[0]]
    print k3
with h5py.File('/project/projectdirs/metatlas/projects/clean_pactolus_trees/FragTreeLibrary_test_hdf5_5_OVRNDRQMDRJTHS-ZTVVOAFPSA-N.h5') as fid:
    k = fid.keys()
    print k
    k2 = fid[k[0]].keys()
    print k2
    k3 = fid[k[0]][k2[0]]
    print k3
In [ ]:
# 25 ppm window around m/z 104.10699, with the RT bounds from the last row of the loop above
mz = 104.10699
mz_tol = mz*25/1e6
temp_filtered[(temp_filtered.precursor_mz > mz - mz_tol) &
              (temp_filtered.precursor_mz < mz + mz_tol) &
              (temp_filtered.retention_time > row.rt_min-0.2) &
              (temp_filtered.retention_time < row.rt_max+0.2)]
In [ ]:
t = np.load('/project/projectdirs/openmsi/projects/level_3_trees/tree_lookup.npy')
In [ ]:
t.shape
In [ ]:
%system ls -l /project/projectdirs/openmsi/projects/ben_trees/*.npy
In [ ]:
# df = pd.read_pickle('/project/projectdirs/metatlas/projects/mzmine_parameters/actinorhodin_dataset.pkl')
# df.to_csv('example.csv')
In [ ]:
# temp[temp.synonyms.fillna('').str.contains('prodigiosin')]
In [ ]:
# temp_filtered.head(100)
In [ ]:
# def take_best_hit(df_in, compound_df, mz_tolerance, mz, polarity):
#     """
#     given a dataframe, take the hits and filenames and return sorted scores with only the highest score remaining
#     """
#     all_hits = [hh + (f, mz, compound_df.loc[hh[1],'ms1_mass'],) for (h,f) in zip(df_in['hits'],df_in['filename']) for hh in h]
#     all_hits = sorted(all_hits)[::-1]
#     if polarity == 'positive':
#         multiplier = 1
#     else:
#         multiplier = -1
#     # remove hits that have a ppm difference greater than a threshold
#     all_hits = [h for h in all_hits if abs(h[3] - h[4] - (multiplier * 1.007276))/h[4]*1e6 < mz_tolerance]
#     df = pd.DataFrame(all_hits, columns=['score','compound_idx','filename','mz','mass'])
#     df = df[df['score'] == df.groupby(['compound_idx'])['score'].transform(max)]  # keep only the highest pactolus score
#     return [tuple(r) for r in df.values]
# peak_df['compound_indices'] = hits  # This references the indices in compound_df
In [ ]:
# peak_has_msms = [False if h=='no msms' else True for h in hits]
# peak_df = peak_df[peak_has_msms]
In [ ]:
# peak_df.reset_index(inplace=True)
In [ ]:
# items_as_cols = df.apply(lambda x: pd.Series(x['samples']), axis=1)
# # Keep original df index as a column so it's retained after melt
# items_as_cols['orig_index'] = items_as_cols.index
# melted_items = pd.melt(items_as_cols, id_vars='orig_index',
# var_name='sample_num', value_name='sample')
# melted_items.set_index('orig_index', inplace=True)
# df.merge(melted_items, left_index=True, right_index=True)
In [ ]:
# Number of hits
# Score of best hit
# Avg Score
# Median Score
# Number of hits within 50% of best hit
# List of compound names from top hits (<N)
# Show the full info for the best hit
# export hit tables for each feature
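A hedged sketch (my own column choices) of these summaries, computed from the msms_df built earlier by grouping hits per spectrum:
In [ ]:
# per-feature Pactolus summary; assumes ('filename','spectrum_index') identifies a feature
grouped = msms_df[msms_df['score'] > 0].groupby(['filename','spectrum_index'])['score']
summary = grouped.agg(['size','max','mean','median'])
summary.columns = ['num_hits','best_score','avg_score','median_score']
# number of hits within 50% of the best hit for that feature
summary['num_within_50pct'] = grouped.apply(lambda s: (s >= 0.5*s.max()).sum())
summary.head()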
In [ ]:
# peak_df['num_pactolus_hits'] = peak_df.compound_indices.apply(lambda x: len(x))
# def pactolus_stats(x):
#     best_score = None
#     worst_score = None
#     avg_score = None
#     median_score = None
#     best_score_file = None
#     best_score_precursor_mz = None
#     best_score_precursor_mass = None
#     best_score_compound_idx = None
#     if x:
#         best_score = x[0][0]
#         worst_score = x[-1][0]
#         avg_score = np.mean([xx[0] for xx in x])
#         median_score = np.median([xx[0] for xx in x])
#         best_score_file = x[0][2]
#         best_score_precursor_mass = x[0][4]
#         best_score_precursor_mz = x[0][3]
#         best_score_compound_idx = int(x[0][1])
#     return pd.Series({'best_score': best_score,
#                       'worst_score': worst_score,
#                       'avg_score': avg_score,
#                       'median_score': median_score,
#                       'best_score_file': best_score_file,
#                       'best_score_precursor_mz': best_score_precursor_mz,
#                       'best_score_precursor_mass': best_score_precursor_mass,
#                       'best_score_compound_idx': best_score_compound_idx})
# temp = peak_df.compound_indices.apply(pactolus_stats)
# peak_df[temp.columns] = temp
In [ ]:
# peak_df = peak_df.merge(compound_df2, how='inner',left_on='best_score_compound_idx', right_index=True)
In [ ]:
# peak_df.head()
In [ ]:
# peak_df.drop('compound_indices',1).to_csv('~/Downloads/peak_output.csv')
In [ ]:
# # %%time
# N=100
# items_as_cols = peak_df.head(N).apply(lambda x: pd.Series(x['compound_indices']), axis=1)
# # Keep original df index as a column so it's retained after melt
# items_as_cols['orig_index'] = items_as_cols.index
# melted_items = pd.melt(items_as_cols, id_vars='orig_index', var_name='sample_num', value_name='compound_indices')['compound_indices']
# tuple_cols = ['pactolus_score','compound_index','pactolus_file','precursor_mass','precursor_mz']
# temp = pd.DataFrame(melted_items.head(),columns=tuple_cols)
# # melted_items[]
# # temp = melted_items['compound_indices'].apply(pd.Series)
# # melted_items.set_index('orig_index', inplace=True)
# # df2 = df.head(N).merge(melted_items, left_index=True, right_index=True)
In [ ]:
# items_as_cols = peak_df.head(N).apply(lambda x: pd.Series(x['compound_indices']), axis=1)
# temp = pd.melt(items_as_cols).value.apply(pd.Series)
# temp.columns = tuple_cols
# temp.head()
In [ ]:
# temp = melted_items.to_frame()['compound_indices'].apply(pd.Series)
# temp.columns = tuple_cols
# temp.head()
In [ ]:
# temp
In [ ]:
# compound_df2.ix[[h[1] for h in peak_df.loc[5,'compound_indices']]]
export hit tables for each feature
delete hit table column from peak_df
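A hedged sketch of the export step using the tables that exist in this session rather than the commented-out peak_df (feature_hit_tables is a hypothetical subfolder; filename_x is the scan-side filename after the merge):
In [ ]:
# write one ranked hit table per feature
feature_dir = os.path.join(output_path,'feature_hit_tables')
if not os.path.isdir(feature_dir):
    os.mkdir(feature_dir)
for (fname, spec_idx), grp in temp_filtered.groupby(['filename_x','spectrum_index']):
    out_file = os.path.join(feature_dir,'%s_feature_%d_hits.csv' % (fname, spec_idx))
    grp.sort_values('compound_score', ascending=False).to_csv(out_file)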
In [ ]:
# best_hit_df = pd.DataFrame(columns=compound_df2.columns)
# empty_df = pd.DataFrame(columns=compound_df2.columns)
# for i in range(10):
#     print i
#     idx = peak_df.loc[i,'compound_indices']
#     # if idx:
#     best_hit_df = best_hit_df.append(compound_df2.ix[[idx[2]]])  # append returns a new frame, so assign it
#     # else:
#     #     best_hit_df = best_hit_df.append(empty_df)
In [ ]:
# best_hit_df
In [ ]:
# # import numpy as np
# # import pandas as pd
# from textwrap import wrap
# HEADER = '''
# <link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.13/css/jquery.dataTables.css">
# <style>
# </style>
# <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js" type="text/javascript"></script>
# <script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.13/js/jquery.dataTables.js"></script>
# <html>
# <head>
# <script >
# $(document).ready( function () { $('#table_id').DataTable() } );
# </script>
# </head>
# <body>
# '''
# FOOTER = '''
# </body>
# </html>
# <script type="text/javascript" >
# $(document).ready( function () {
# $('#table_id').DataTable();
# } );
# </script>
# '''
# # title = ax.set_title("\n".join(wrap("Some really really long long long title I really really need - and just can't - just can't - make it any - simply any - shorter - at all.", 60)))
# # df.rename(columns=lambda x: x[1:], inplace=True)
# with open('test.html', 'w') as f:
#     f.write(HEADER)
#     f.write(compound_df2.ix[[h[1] for h in peak_df.loc[1,'compound_indices']]].to_html(classes='df').replace('class="dataframe df"','id="table_id" class="hover"').replace('border="1"','td {word-wrap: break-word}'))
#     # f.write(peak_df.rename(columns=lambda x: "\n".join(wrap(x,25))).head(100).to_html(classes='df').replace('class="dataframe df"','id="table_id" class="hover"').replace('border="1"','td {word-wrap: break-word}'))
#     f.write(FOOTER)