In [2]:
# --- Notebook setup: plotting backend, NERSC library paths, metatlas imports ---
%matplotlib notebook
# %matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pylab as plt
import sys
import glob, os
# import glob
# metob.load_lcms_files(glob.glob('/project/projectdirs/metatlas/data_for_metatlas_2/20150504_LPSilva_Actino_HILIC_POS_51isolates/*.*'))
# Prepend the boost / openmsi shared-library dirs so compiled deps resolve.
curr_ld_lib_path = ''
os.environ['LD_LIBRARY_PATH'] = curr_ld_lib_path + ':/project/projectdirs/openmsi/jupyterhub_libs/boost_1_55_0/lib' + ':/project/projectdirs/openmsi/jupyterhub_libs/lib'
# NOTE(review): duplicate "import sys" (already imported above).
import sys
# sys.path.remove('/anaconda/lib/python2.7/site-packages')
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')
sys.path.insert(0,'/project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus' )
# %system ls /project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus/
# NOTE(review): star import pollutes the namespace; presumably this is also
# what provides "np" used by later cells (numpy is never imported directly
# before its first use) -- confirm, otherwise those cells NameError.
from generate_frag_dag import *
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
from metatlas import h5_query as h5q
from metatlas import mzml_to_hdf
import tables
import pickle
In [ ]:
import peakdetect
In [ ]:
# import glob
# metob.load_lcms_files(glob.glob('/project/projectdirs/metatlas/raw_data/kblouie/TagOP_Trial_1_DP_NHSCA_NHSPU/*.*'))
In [15]:
# Fetch all LCMS run records owned by user 'pandeer' from the metatlas DB.
# myRun = metob.queryDatabase('lcmsrun',mzml_file='%actino%',created_by='lpsilva',description='%C18_POS%')
# myRun = metob.queryDatabase('lcmsrun',mzml_file='%pandeer%')
myRun = metob.queryDatabase('lcmsrun',created_by='pandeer')
# myExperiment = metob.queryDatabase('Group')#,mzml_file='%global/homes/b/bpb/Actino_C18_Pos%')
In [13]:
# NOTE(review): myExperiment is not defined by any earlier cell (the query
# above is commented out; it is only created much further down this file),
# so this line raises NameError on a fresh Restart-and-Run-All.
myExperiment[0].items
In [16]:
# Report how many runs matched the query, then display each run record.
print(len(myRun))
for run in myRun:
    print(run)
In [6]:
# Open an interactive editor widget for the first run's metadata.
myRun[0].edit()
In [6]:
# myfile = '/global/homes/b/bpb/ExoMetabolomic_Example_Data/MEDIA-1.h5'
# Open the converted HDF5 file for the first run.
# NOTE(review): this handle is never explicitly closed in the notebook.
h5file = tables.open_file(myRun[0].hdf5_file)
# h5file
In [13]:
# --- m/z and RT binning parameters for rasterizing LC-MS data ---
RTSTART = 2.0        # retention-time window start (minutes)
RTSTOP = 23.0        # retention-time window stop (minutes)
MZSTART = 70         # m/z window start
MZSTOP = 1050        # m/z window stop
DELTA_RT = 0.00980   # RT bin width (minutes)
PPM_BINNING = 10     # m/z bin width (ppm) for the high-resolution grid

# FIX: np.ceil() returns a float, but linspace/logspace require an integer
# sample count (modern numpy raises TypeError otherwise) -- cast to int.
NUM_MZ_EDGES = int(np.ceil(1e6*np.log(MZSTOP/MZSTART)/PPM_BINNING))
RT_EDGES = np.linspace(RTSTART, RTSTOP, int((RTSTOP - RTSTART) / DELTA_RT))
# Log-spaced edges give approximately constant-ppm-width m/z bins.
MZ_EDGES = np.logspace(np.log10(MZSTART), np.log10(MZSTOP), NUM_MZ_EDGES)

# Coarser 100 ppm grid. NOTE: PPM_BINNING and NUM_MZ_EDGES are re-bound
# here, so after this cell they describe the 100 ppm grid, not MZ_EDGES.
PPM_BINNING = 100
NUM_MZ_EDGES = int(np.ceil(1e6*np.log(MZSTOP/MZSTART)/PPM_BINNING))
MZ_EDGES_100ppm = np.logspace(np.log10(MZSTART), np.log10(MZSTOP), NUM_MZ_EDGES)

# create a low res binning to make figures
MZ_EDGES_1K = np.linspace(MZSTART, MZSTOP, 1000)
print(MZ_EDGES.shape)

#RT_CENTERS = (RT_EDGES[:-1] + RT_EDGES[1:]) / 2
MZ_CENTERS = (MZ_EDGES[:-1] + MZ_EDGES[1:]) / 2

EIC_MEDFILT_SIZE = 5   # median-filter kernel size (bins) for EICs
EIC_STDEV = 0.05       # stdev of a peak in minutes
MZ_GAUSS_WIDTH = 1     # ppm
MZ_MEDFILT_SIZE = 3    # bins
In [15]:
print len(MZ_EDGES)
In [ ]:
In [16]:
# Bin spectra from the open file onto the 10 ppm m/z grid.
# NOTE(review): the positional args (0, 20, 1, 1) presumably are an RT window
# plus ms_level/polarity -- confirm against the h5_query.get_spectrogram API.
mz,intensity = h5q.get_spectrogram(h5file,0,20,1,1,bins=MZ_EDGES)
In [18]:
# Quick sanity plot of the first ten bins only.
plt.plot(mz[:10],intensity[:10])
plt.show()
In [ ]:
In [ ]:
In [ ]:
# mzml_to_hdf(myRun[0].mzml_file, myRun[0].hdf5_file)
# h5file = tables.open_file(myRun[0].hdf5_file)
# import glob
# metob.load_lcms_files(glob.glob('/global/homes/b/bpb/Actino_C18_Pos/*.*'))
In [ ]:
# Flatten every MS2 scan from every run in myRun into parallel lists:
# one entry per unique retention time (= one scan) per file.
# NOTE(review): grouping by unique rt assumes no two distinct MS2 scans in
# one file share an identical retention time -- confirm.
MSMS_data = {}
for field in ['scan', 'file', 'rt', 'precursor_mz',
              'precursor_intensity', 'collision_energy']:
    MSMS_data[field] = []

for i, myfile in enumerate(myRun):
    print(i)
    # ms_level=2, positive polarity
    data = get_msms_data(myfile.hdf5_file, 2, 1)
    for myrt in np.unique(data['rt']):
        idx = np.argwhere(data['rt'] == myrt)
        # [m/z, intensity] pairs for this scan, one row per fragment peak
        spectrum = np.array([data['mz'][idx],
                             data['i'][idx]]).T
        MSMS_data['scan'].append(np.squeeze(spectrum))
        MSMS_data['file'].append(myfile.hdf5_file)
        MSMS_data['rt'].append(myrt)
        MSMS_data['precursor_mz'].append(data['precursor_MZ'][idx[0]][0])
        MSMS_data['precursor_intensity'].append(data['precursor_intensity'][idx[0]][0])
        MSMS_data['collision_energy'].append(data['collision_energy'][idx[0]][0])
In [25]:
# print len(MSMS_data['rt'])
# MSMS_data['file'][100]
# with open('Actino_C18_Pos.pickle', 'wb') as handle:
# pickle.dump(MSMS_data, handle)
# Reload a previously cached MSMS_data dict instead of re-extracting it.
# NOTE(review): pickle.load can execute arbitrary code; fine for this
# self-produced cache, but never load pickles from untrusted sources.
with open('Actino_C18_Pos.pickle', 'rb') as handle:
MSMS_data = pickle.load(handle)
In [3]:
# Kernel sanity check.
1+1
Out[3]:
In [26]:
# Index that sorts scans by ascending precursor intensity.
intensity = MSMS_data['precursor_intensity']
sx = np.argsort(intensity[:])
In [32]:
# Reorder every field of MSMS_data by the precursor-intensity sort index sx,
# keeping the six parallel lists aligned.
MSMS_SORT_FIELDS = ['scan', 'file', 'rt', 'precursor_mz',
                    'precursor_intensity', 'collision_energy']
MSMS_data_sorted = {}
for field in MSMS_SORT_FIELDS:
    MSMS_data_sorted[field] = []
for i in sx:
    for field in MSMS_SORT_FIELDS:
        MSMS_data_sorted[field].append(MSMS_data[field][i])
In [33]:
# Mass offsets (Da) applied to neutralize precursor m/z in positive mode.
# NOTE(review): the values look like minus a proton, minus (proton + H),
# and plus an electron mass -- confirm against the pactolus documentation.
pos_mode_neutralizations = [-1.00727646677, -(1.00727646677+1.00782504), +5.4857990946e-4,]
neg_mode_neutralizations = [-el for el in pos_mode_neutralizations]
# make lookup table
path_to_trees = '/project/projectdirs/openmsi/projects/pactolus_trees/'
all_my_h5_files = glob.glob('/project/projectdirs/openmsi/projects/pactolus_trees/*_hdf5_5_*.h5')
my_tree_filename = 'metacyc_max_depth_5'
# Build the MS1-mass lookup table once; later runs reuse the cached .npy.
# NOTE(review): score_frag_dag is only imported two cells below, so on a
# fresh kernel this cell raises NameError unless that import runs first.
if not os.path.isfile(os.path.join(path_to_trees, my_tree_filename + '.npy')):
score_frag_dag.make_file_lookup_table_by_MS1_mass(all_my_h5_files,
path=path_to_trees,
save_result='metacyc_max_depth_5')
maxdepth_5_table = os.path.join(path_to_trees, my_tree_filename + '.npy')
# Scoring parameters shared by all scoring batches below.
params = {'file_lookup_table': maxdepth_5_table,
'ms1_mass_tol': 0.02,
'ms2_mass_tol': 0.01,
'neutralizations': pos_mode_neutralizations,
'max_depth': 5,
}
In [16]:
import time
import score_frag_dag
score_results = []
# print len(score_results)
In [22]:
# Time a pickle dump of score_results.
# NOTE(review): "i" here is leftover kernel state from another cell (it is
# not defined in this one), and score_results is empty at this point.
t0 = time.time()
with open('/project/projectdirs/metatlas/projects/pactolus/results/test.pickle','wb') as handle:
pickle.dump(score_results, handle)
t1 = time.time()
print i, t1 - t0
In [17]:
# N=
# Score the intensity-sorted MS2 scans against the Pactolus trees in batches
# of 3000, resuming at index 36000 (earlier batches were scored previously).
t0 = time.time()
n_scans = len(MSMS_data_sorted['precursor_mz'])
for i in range(36000, n_scans, 3000):
    if i + 3000 < n_scans:
        foo = score_frag_dag.score_scan_list_against_trees(
            MSMS_data_sorted['scan'][i:i + 3000],
            MSMS_data_sorted['precursor_mz'][i:i + 3000], params)
    else:
        # Final (short) batch: take everything that remains.
        foo = score_frag_dag.score_scan_list_against_trees(
            MSMS_data_sorted['scan'][i:],
            MSMS_data_sorted['precursor_mz'][i:], params)
    # score_results.append(foo)
    # BUG FIX: the original ran "for i in foo: foo.append(i)", which both
    # clobbered the batch index i and appended foo onto itself while
    # iterating it (an unbounded loop); the copy clearly belongs in foo2.
    foo2 = []
    for batch_score in foo:
        foo2.append(batch_score)
    foo = np.array(foo2)
    s = score_frag_dag.make_pactolus_hit_table(foo, maxdepth_5_table,
        '/global/homes/b/bpb/notebooks/meta-iq_old/midas_lbl/MetaCyc.mdb')
    print(foo.shape)
    # NOTE(review): score_results is never appended to (the append above is
    # commented out), so this pickle currently saves an empty list -- confirm
    # whether foo/s were meant to be accumulated here.
    with open('Actino_C18_Pos_Scores_Sorted.pickle', 'wb') as handle:
        pickle.dump(score_results, handle)
t1 = time.time()
print('%s %s' % (i, t1 - t0))
In [ ]:
print maxdepth_5_table
In [ ]:
# Convert raw tree scores into a hit table annotated from the MetaCyc DB.
s = score_frag_dag.make_pactolus_hit_table(foo,maxdepth_5_table,'/global/homes/b/bpb/notebooks/meta-iq_old/midas_lbl/MetaCyc.mdb')
print s
In [ ]:
# FIX: removed the dangling "score_frag_dag." fragment that made this cell a
# SyntaxError (an incomplete attribute access left over from editing).
# Report the index of the best-scoring tree for each saved result.
for f in score_results:
    print(np.argmax(f))
In [ ]:
# Same hit-table call, but with the DB path built from the shared data dir.
my_db = '/project/projectdirs/openmsi/projects/meta-iq/pactolus/data/' + 'MetaCyc.mdb'
score_frag_dag.make_pactolus_hit_table(foo, maxdepth_5_table, original_db=my_db)
In [ ]:
In [ ]:
q = lambda i,j,n: n*j - j*(j+1)/2 + i - 1 - j
In [ ]:
In [ ]:
In [ ]:
# Scratch cell: build a sorted random float array and an index vector.
# NOTE(review): itertools and timeit are imported but unused, no RNG seed is
# set, and np.interp is called with only two arguments (the trailing comma
# drops the required third "fp" argument) -- the last line raises TypeError.
import itertools as it
import numpy as np
import timeit
a = np.random.randint(1e6, size=20)
a = a.astype(float) / 1e3
a = np.sort(a)
idx = range(len(a))
print a
print idx
yi = np.interp(a,idx,)
In [ ]:
def get_msms_data(h5file, ms_level, polarity, **kwargs):
    """
    Get raw data from an h5file meeting given criteria.

    Parameters
    ----------
    h5file : string or open pytables file
        The path to the h5file or open file handle.
    ms_level : int
        MS Level (1 or 2); selects the ms1_* or ms2_* table.
    polarity : int
        Plus proton (1) or Minus proton (0).
    **kwargs
        Optional search modifiers (e.g. min_rt=0, max_mz=100,
        precursor_MZ=1) over the columns rt, mz, precursor_MZ,
        precursor_intensity, and collision_energy.
        Use verbose=True for displaying query messages.

    Returns
    -------
    out : dictionary
        Dictionary with arrays for 'i', 'mz', 'rt', 'precursor_MZ',
        'precursor_intensity', and 'collision_energy' values meeting
        the criteria.

    Raises
    ------
    ValueError
        If the chosen table is empty, or no rows match the query.
    """
    def _as_dict(data):
        # Package the structured-array columns into a plain dict
        # (shared by both the filtered and unfiltered return paths).
        return dict(i=data['i'],
                    rt=data['rt'],
                    mz=data['mz'],
                    precursor_MZ=data['precursor_MZ'],
                    precursor_intensity=data['precursor_intensity'],
                    collision_energy=data['collision_energy'])

    if not isinstance(h5file, tables.File):
        h5file = tables.open_file(h5file)

    # Pick the table for the requested MS level and polarity.
    if ms_level == 1:
        table = h5file.root.ms1_pos if polarity else h5file.root.ms1_neg
    else:
        table = h5file.root.ms2_pos if polarity else h5file.root.ms2_neg
    if not table.nrows:
        raise ValueError('No data in chosen table: %s' % table._v_name)

    # Build a pytables selection expression from min_*/max_*/exact kwargs.
    query = ''
    for name in ['rt', 'mz', 'precursor_MZ', 'precursor_intensity',
                 'collision_energy']:
        if 'min_%s' % name in kwargs:
            query += ' & (%s >= %s)' % (name, kwargs['min_%s' % name])
        if 'max_%s' % name in kwargs:
            query += ' & (%s <= %s)' % (name, kwargs['max_%s' % name])
        if name in kwargs:
            query += ' & (%s == %s)' % (name, kwargs[name])

    if not query:
        # No filters requested: read the whole table.
        return _as_dict(table.read())

    # chop off the initial ' & '
    query = query[3:]
    if kwargs.get('verbose'):
        print('Querying: %s from %s' % (query, table._v_name))
    data = table.read_where(query)
    if not data.size:
        raise ValueError('No data found matching criteria')
    if kwargs.get('verbose'):
        print('Query complete')
    return _as_dict(data)
In [ ]:
In [ ]:
# myRun = metob.LcmsRun(hdf5_file = '/global/homes/b/bpb/ExoMetabolomic_Example_Data/MEDIA-1.h5')
# Interactive browser over a narrow m/z window around 233.154.
# NOTE(review): earlier in the notebook myRun is a list of runs; calling
# .interact on it implies myRun was rebound to a single LcmsRun (e.g. via
# the commented line above) -- confirm before running.
myRun.interact(min_mz = 233.153, max_mz = 233.155)#, polarity = 'positive', ms_level=1)
In [ ]:
import tables
# Pull MS2 scans whose precursor m/z lies within a ppm window of the target
# and whose RT falls in [target_rt_min, target_rt_max].
target_mz = 233.15416
target_mz_tolerance = 105
target_rt_min = 7.1
target_rt_max = 7.15
# target_rt_min = 7.4
# target_rt_min = 7.6
fid = tables.open_file(myRun.hdf5_file)
# tolerance is in ppm (divided by 1e6 below); 105 ppm is unusually wide.
data = h5q.get_data(fid,2,0,
min_precursor_mz = target_mz - target_mz * target_mz_tolerance / 1e6,
max_precursor_mz = target_mz + target_mz * target_mz_tolerance / 1e6,
# max_mz = mz_ref.mz + mz_ref.mz*mz_ref.mz_tolerance/1e6,
min_rt = target_rt_min,
max_rt = target_rt_max,
)
In [ ]:
data
In [ ]:
%matplotlib inline
import numpy as np
# Stick plot of the fragment spectrum (m/z vs intensity).
plt.vlines(data['mz'],np.zeros(data['i'].shape),data['i'])
plt.show()
In [ ]:
# Ensure the pactolus checkout is importable, then pull in its DAG helpers.
sys.path.insert(0,'/project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus' )
%system ls /project/projectdirs/openmsi/projects/meta-iq/pactolus/pactolus/
from generate_frag_dag import *
In [ ]:
# All runs created by users matching 'kblou%' (SQL-style wildcard).
allruns = metob.queryDatabase('lcmsrun',created_by='kblou%')
In [ ]:
print len(allruns)
In [ ]:
# Create a new compound record and edit it interactively, then store it.
myCompound = metob.Compound()
myCompound.edit()
In [ ]:
myCompound.store()
In [ ]:
# NOTE(review): other queries in this notebook use singular type names
# ('lcmsrun', 'Atlas', 'Group'); 'Compounds' may be a typo for 'Compound'.
allCompounds = metob.queryDatabase('Compounds')
print allCompounds
In [ ]:
myMethod = metob.Method()
myMethod.edit()
In [ ]:
# queryDatabase('Atlas',user='curt',description='an atlas')
# queryDatabase('group',modified_by='silvest',name='My cool title')
In [ ]:
# One-time creation of the A-F identification-grade records (kept commented
# so re-running the notebook does not create duplicates).
# myGrade = metob.IdentificationGrade(name='A', description='High intensity and verifiable by MSMS and RT authentic standard')
# myGrade.store()
# myGrade = metob.IdentificationGrade(name='B', description='Verifiable by MSMS from database or publication')
# myGrade.store()
# myGrade = metob.IdentificationGrade(name='C', description='Has fragment ion or neutral loss characteristic of a class of compounds')
# myGrade.store()
# myGrade = metob.IdentificationGrade(name='D', description='Unambiguous chemical formula and adduct')
# myGrade.store()
# myGrade = metob.IdentificationGrade(name='E', description='Significant feature with MSMS suggestion from MIDAS')
# myGrade.store()
# myGrade = metob.IdentificationGrade(name='F', description='Unknown feature that is not significant')
# myGrade.store()
In [ ]:
myIDGrade = metob.queryDatabase('IdentificationGrade',name='A')
myIDGrade
In [ ]:
# List my atlases and pick the fourth one to work with below.
atlases = metob.queryDatabase('Atlas',created_by='bpb')
for a in atlases:
print a
myAtlas = atlases[3]
# print myAtlas
myAtlas
In [ ]:
# class FragmentationReference(Reference):
# polarity = Enum(POLARITY, 'positive')
# precursor_mz = CFloat()
# mz_intensities = List(Instance(MzIntensityPair),
# help='list of [mz, intesity] tuples that describe ' +
# ' a fragmentation spectra')
# class MzIntensityPair(MetatlasObject):
# mz = CFloat()
# intensity = CFloat()
# @set_docstring
# class MzReference(Reference):
# """Source of the assertion that a compound has a given m/z and
# other properties directly tied to m/z.
# """
# mz = CFloat()
# mz_tolerance = CFloat()
# mz_tolerance_units = Enum(('ppm', 'Da'), 'ppm')
# detected_polarity = Enum(POLARITY, 'positive')
# adduct = CUnicode(help='Optional adduct')
# modification = CUnicode(help='Optional modification')
# observed_formula = CUnicode(help='Optional observed formula')
In [ ]:
# Build RT and m/z references for the compound, then attach a compound
# identification (compound + grade + references) to the atlas.
my_ref = []
my_ref.append(metob.RtReference(lcms_run=myRun, RTpeak=2.6, RTmin = 2.0, RTmax = 3.0, RTUnits = 'min'))
my_ref.append(metob.MzReference(lcms_run=myRun, mz=170.081169, mz_tolerance=5, mz_tolerance_units='ppm', detected_polarity='positive',adduct='+Proton',modification='',observed_formula='C8H11NO3') ) #TODO: should be neutral formula
# metob.FragmentationReference() #TODO: should be the reference to a real spectrum in a file
# my_ref.append(metob.FragmentationReference(lcms_run=myRun, RTpeak=2.6, RTmin = 2.0, RTmax = 3.0, RTUnits = 'min'))
my_id = metob.CompoundId(compound=myCompound, identification_grade=myIDGrade[0], references=my_ref)
# myAtlas = metob.Atlas(name = 'R2A Media Positive Mode Hilic', compounds_ids = [my_id])
# NOTE(review): this assigns .compound_ids but the next cell reads
# .compounds_ids -- one of the two attribute names must be wrong.
myAtlas.compound_ids = [my_id] #TODO: should be compound_identifications
myAtlas.store()
In [ ]:
# Read the references back out; rt_ref/mz_ref are reused by later cells.
print myAtlas.compounds_ids[0].compound.name
print myAtlas.compounds_ids[0].compound.InChl #TODO: Change to InChI
# print myAtlas.compounds_ids[0].identification_grade
rt_ref = myAtlas.compounds_ids[0].references[0]
mz_ref = myAtlas.compounds_ids[0].references[1]
print rt_ref.RTpeak, rt_ref.RTmin, rt_ref.RTmax
print mz_ref.mz, mz_ref.mz_tolerance, mz_ref.mz_tolerance_units
In [ ]:
#TODO: I need to specify a method for these files
import glob, os

# All converted .h5 runs from the 2015-05-04 HILIC positive-mode experiment.
myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/20150504_LPSilva_Actino_HILIC_POS_51isolates'
myFiles = glob.glob(os.path.join(myPath,'*.h5'))
# print myFiles

# Spent-medium group: filenames carrying the isolate tag 'HB42'.
myItems = [metob.LcmsRun(hdf5_file=f) for f in myFiles if 'HB42' in f]
spentmediaFiles = metob.Group(name='HB42 Spent Medium',description='Spent medium from isolate HB42 grown in R2A and analyzed with hilic positive mode',items=myItems)
spentmediaFiles.store()

# Control group: uninoculated R2A medium runs.
myItems = [metob.LcmsRun(hdf5_file=f) for f in myFiles if 'R2A' in f]
mediaFiles = metob.Group(name='Control Medium',description='R2A medium analyzed with hilic positive mode',items=myItems)
mediaFiles.store()

# Top-level group pairing the spent-medium and control sub-groups.
myExperiment = metob.Group(name='Control vs Spent Medium',
                           description='Four replicates of both R2A medium and spent medium from isolate HB42 analyzed with hilic positive mode',
                           items=[spentmediaFiles,mediaFiles])
myExperiment.store()
In [ ]:
# Walk the experiment hierarchy: group -> sub-groups -> runs.
print myExperiment.description
print myExperiment.items[0].description
print myExperiment.items[0].items[0].hdf5_file
print myExperiment.items[1].description
print myExperiment.items[1].items[0].hdf5_file
In [ ]:
from metatlas import h5_query as h5q
In [ ]:
# TODO: reconvert files and use LcmsRun.interact
# This was done the old way and won't work anymore
# For each run in the experiment, extract MS1 positive-mode points inside
# the compound's m/z (ppm) and RT windows (rt_ref/mz_ref defined earlier).
data = []
for f in myExperiment.items:
for ff in f.items:
print ff.hdf5_file
data.append(h5q.get_data(ff.hdf5_file,
1,
1,
min_mz = mz_ref.mz - mz_ref.mz*mz_ref.mz_tolerance/1e6,
max_mz = mz_ref.mz + mz_ref.mz*mz_ref.mz_tolerance/1e6,
min_rt = rt_ref.RTmin,
max_rt = rt_ref.RTmax,
))
In [ ]:
In [ ]:
In [ ]:
# import pandas as pd
# atlas_file = '/global/homes/b/bpb/20150804_POS_pHILIC_R2A_Small.txt'
# finfo_file = '/global/homes/b/bpb/POS_C18_Actino_FileInfo.txt'
# df = pd.read_csv(atlas_file,sep='\t')
# df.fillna('',inplace=True)
# comp_list = df.to_dict('records')
# df = pd.read_csv(finfo_file,sep='\t')
# df.fillna('',inplace=True)
# finfo_list = df.to_dict('records')
# print comp_list[0]
# # print finfo_list
In [ ]:
# compounds = []
# for comp in comp_list:
# compounds.append(metatlas_objects.Compound(name = comp['name'],
# formula = comp['formula'],
# adducts = comp['adducts'],
# mz = comp['mz'],
# mz_threshold = comp['mz_threshold'],
# rt_min = comp['rt_min'],
# rt_max = comp['rt_max'],
# rt_peak = comp['rt_peak'],
# neutral_mass = comp['neutral_mass'],
# pubchem_id = comp['pubchem_id']))
In [ ]: