This notebook demonstrates the spatial metabolomics pipeline applied to a single molecule from a database. In deployment the same process is repeated for all ~10,000 database entries.
The pipeline can be broadly split into two stages:
1. predict an isotope pattern for every sum formula/adduct combination in the database
2. generate ion images for each predicted pattern and score them to decide whether the molecule is present
In [8]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import csv
%matplotlib inline
import sys
from IPython.display import display, clear_output
import os
sys.path.append('/Users/palmer/Documents/python_codebase/')
from pyIMS.hdf5.inMemoryIMS_hdf5 import inMemoryIMS_hdf5
from pyMS.pyisocalc import pyisocalc
import time
import pickle
from pySpatialMetabolomics.parse_databases import parse_databases
An entry from the database is parsed and its sum formula is extracted. We keep the formula together with the database key so that results can be linked back later.
We also define a set of 'adducts': species that can become attached to the molecule during the mass spectrometry process and therefore change the mass that is detected. Some or all of the adducts may be present in the data, but there is no way of knowing a priori which ones.
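A minimal sketch of the arithmetic (illustration only: the adduct and neutral masses below are approximate monoisotopic values, and the real calculation is delegated to pyisocalc later in the notebook):
In [ ]:
# Illustration only - approximate monoisotopic masses; the pipeline uses pyisocalc for the real calculation
electron = 0.000549
adduct_mass = {'H': 1.007825, 'Na': 22.989770, 'K': 38.963707}
neutral_mass = 399.335  # approximate monoisotopic mass of C23H45NO4
for a in ('H', 'Na', 'K'):
    # singly charged positive ion: add the adduct atom, remove one electron
    print '[M+{}]+ m/z ~ {:.4f}'.format(a, neutral_mass + adduct_mass[a] - electron)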
In [ ]:
## Input parameters
# Data file/output location
filename_in = '/Users/palmer/Documents/tmp_data/UCSD_01902_centroids_IMS.hdf5' #using a temporary hdf5 based format
output_dir = ''
## Isotope pattern prediction
# possible adducts
adducts = ('H','Na','K')
charge = 1
# Database file locations
db_filename = '/Users/palmer/Copy/hmdb_database.tsv'
db_dump_folder = '/Users/palmer/Documents/Projects/2015/SM_development/databases' # store location for isotope patterns - to avoid regenerating every time
# Instrument prediction parameters
isocalc_sig=0.01
isocalc_resolution = 200000.
isocalc_do_centroid = True
## Image generation parameters
ppm = 2. # parts per million - a measure of how accurate the mass spectrometer is; used as the m/z tolerance when building ion images
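# (illustration: at m/z 500, a 2 ppm window is +/- 500 * 2e-6 = 0.001 m/z units)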
nlevels = 30 # parameter for measure of chaos
q = 99 # percentile threshold used to clip hot spots in the ion images
## Results filter thresholds
measure_tol = 0.99 # heuristic tolerances for the results filter
iso_corr_tol = 0.5
iso_ratio_tol = 0.85
In [10]:
## Load database of metabolites
# TODO: standardise database format
# currently a separate load function is needed each time
# sum_formula = 'C23H45NO4' #Glycocholic acid: http://89.223.39.196:2347/substance/00138
# sum_formulae = read_kegg_compounds(db_filename)
# sum_formulae = read_hmdb_compounds(db_filename)
sum_formulae = parse_databases.read_hmdb_compounds(db_filename)
We simulate a mass spectrum for each sum formula/adduct combination. This generates a set of isotope patterns (see http://www.mi.fu-berlin.de/wiki/pub/ABI/QuantProtP4/isotope-distribution.pdf) which can provide additional information on the molecule detected, and gives us a list of m/z centres for each molecule.
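As a minimal sketch (mirroring the pyisocalc call made in the next cell), each stored pattern is indexable as a pair of arrays: centroid m/z values at index 0 and their relative intensities at index 1.
In [ ]:
# Sketch: isotope pattern for a single sum formula/adduct pair (uses the parameters defined above)
isotope_ms = pyisocalc.isodist('C23H45NO4' + 'H', plot=False, sigma=isocalc_sig,
                               charges=charge, resolution=isocalc_resolution,
                               do_centroid=isocalc_do_centroid)
spec = isotope_ms.get_spectrum(source='centroids')
print spec[0][0:3]  # first few centroid m/z values
print spec[1][0:3]  # their relative intensities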
In [3]:
## Predict isotope patterns
def calculate_isotope_patterns(sum_formulae,adducts='',isocalc_sig=0.01,isocalc_resolution=200000.,isocalc_do_centroid=True):
tstart=time.time()
mz_list={}
for n,sum_formula in enumerate(sum_formulae):
if np.mod(n,10) == 0:
clear_output(wait=True)
print '{} {:2.3f}% complete'.format(sum_formula,100*float(n)/len(sum_formulae))
sys.stdout.flush()
for adduct in adducts:
isotope_ms = pyisocalc.isodist(sum_formula+adduct,plot=False,sigma=isocalc_sig,charges=charge,resolution=isocalc_resolution,do_centroid=isocalc_do_centroid)
if not sum_formula in mz_list:
mz_list[sum_formula] = {}
mz_list[sum_formula][adduct] = isotope_ms.get_spectrum(source='centroids')
print 'Elapsed: {:5.2f} seconds'.format(time.time() - tstart)
return mz_list
# Check if already generated and load if possible
db_name= os.path.splitext(os.path.basename(db_filename))[0]
mz_list={}
for adduct in adducts:
load_file = '{}/{}_{}_{}_{}.dbasedump'.format(db_dump_folder,db_name,adduct,isocalc_sig,isocalc_resolution)
if os.path.isfile(load_file):
mz_list_tmp = pickle.load(open(load_file,'r'))
else:
mz_list_tmp = calculate_isotope_patterns(sum_formulae,adducts=(adduct,),isocalc_sig=isocalc_sig,isocalc_resolution=isocalc_resolution)
pickle.dump(mz_list_tmp,open(load_file,'w'))
for sum_formula in mz_list_tmp:
if not sum_formula in mz_list:
mz_list[sum_formula] = {}
mz_list[sum_formula][adduct] = mz_list_tmp[sum_formula][adduct]
print 'all isotope patterns generated and loaded'
We parse the dataset and then search it one adduct at a time using the following sequence (a compact sketch of the scoring steps follows the list):
1. generate ion images for the predicted isotope peaks
2. score the monoisotopic image with a measure of spatial chaos (checks whether the image is likely noise)
3. score the spatial distributions (the isotope peak images should correlate with the monoisotopic image)
4. score the isotope ratio (the summed image intensities should match the predicted isotope intensities)
5. use the scores to decide whether the molecule is present
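A minimal sketch of scores 2-4, assuming a stack of ion images `xics` (one row per isotope peak) and the predicted isotope intensities; the helper names are illustrative and the actual computation is inlined in the processing loop below.
In [ ]:
# Sketch of the three scores (illustrative helpers; see the processing loop for the real code)
def chaos_score(image, nlevels):
    # 1 - measure of chaos; close to 1 for spatially structured images, close to 0 for noise
    return 1 - level_sets_measure.measure_of_chaos(image, nlevels, interp=False)[0]

def correlation_score(xics, iso_intensities):
    # weighted mean correlation of each isotope image with the monoisotopic image
    r = np.corrcoef(xics)[1:, 0]
    r[np.isnan(r)] = 0  # undefined when an image is constant (e.g. all zeros)
    return np.average(r, weights=iso_intensities[1:])

def ratio_score(iso_intensities, image_intensities):
    # 1 - mean absolute difference between L2-normalised predicted and observed patterns
    p = np.asarray(iso_intensities, dtype=float)
    o = np.asarray(image_intensities, dtype=float)
    return 1 - np.mean(np.abs(p/np.linalg.norm(p) - o/np.linalg.norm(o)))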
In [4]:
# Parse data
from pyIMS.hdf5.inMemoryIMS_hdf5 import inMemoryIMS_hdf5
from pyIMS.image_measures import level_sets_measure
IMS_dataset=inMemoryIMS_hdf5(filename_in)
#IMS_dataset=inMemoryIMS_hdf5(filename_in,max_mz=500.,min_int=150.,index_range=range(0,3842)) #options to limit data size
In [11]:
## The main part of the algorithm - don't change these parameters
tstart=time.time()
measure_value_score={}
iso_correlation_score = {}
iso_ratio_score = {}
for n, sum_formula in enumerate(sum_formulae):
for adduct in adducts:
## 1. Generate ion images
ion_datacube = IMS_dataset.get_ion_image(mz_list[sum_formula][adduct][0],ppm) # for each spectrum, sum the intensity of all peaks within ppm of each centroid m/z
# tidy images: clip hot spots at the q-th percentile
for xic in ion_datacube.xic:
xic_q = np.percentile(xic,q)
xic[xic>xic_q]=xic_q
## 2. Spatial Chaos
if not sum_formula in measure_value_score:
measure_value_score[sum_formula] = {}
if np.sum(ion_datacube.xic_to_image(0)) == 0:
measure_value_score[sum_formula][adduct] = 0 # this is now coded into measure_of_chaos
else:
measure_value_score[sum_formula][adduct] = 1-level_sets_measure.measure_of_chaos(ion_datacube.xic_to_image(0),nlevels,interp=False)[0]
if measure_value_score[sum_formula][adduct] == 1:
measure_value_score[sum_formula][adduct] = 0
clear_output(wait=True)
## 3. Score correlation with monoiso
if not sum_formula in iso_correlation_score:
iso_correlation_score[sum_formula] = {}
if len(mz_list[sum_formula][adduct][1]) > 1:
iso_correlation = np.corrcoef(ion_datacube.xic)[1:,0]
iso_correlation[np.isnan(iso_correlation)] = 0 # when all values are the same (e.g. all zeros) the correlation is undefined
iso_correlation_score[sum_formula][adduct] = np.average(
iso_correlation,weights=mz_list[sum_formula][adduct][1][1:]
) # slightly faster to compute all correlations and pull the elements needed
else: # only one isotope peak, so correlation doesn't make sense
iso_correlation_score[sum_formula][adduct] = 1
## 4. Score isotope ratio
if not sum_formula in iso_ratio_score:
iso_ratio_score[sum_formula] = {}
isotope_intensity = mz_list[sum_formula][adduct][1]
image_intensities = [sum(ion_datacube.xic[ii]) for ii in range(0,len(mz_list[sum_formula][adduct][0]))]
iso_ratio_score[sum_formula][adduct] = 1-np.mean(abs( isotope_intensity/np.linalg.norm(isotope_intensity) - image_intensities/np.linalg.norm(image_intensities)))
if np.mod(n,10)==0:
clear_output(wait=True)
print '{:3.2f}% complete'.format(100*float(n)/len(sum_formulae))
sys.stdout.flush()
print 'Elapsed: {:5.2f} seconds'.format(time.time() - tstart)
In [13]:
# Save the processing results
filename_out = '{}{}{}_full_results.txt'.format(output_dir,os.sep,os.path.splitext(os.path.basename(filename_in))[0])
with open(filename_out,'w') as f_out:
f_out.write('sf,adduct,mz,moc,spec,spat,pass\n')
for sum_formula in sum_formulae:
for adduct in adducts:
moc_pass = measure_value_score[sum_formula][adduct] > measure_tol and iso_correlation_score[sum_formula][adduct] > iso_corr_tol and iso_ratio_score[sum_formula][adduct] > iso_ratio_tol
f_out.write('{},{},{},{},{},{},{}\n'.format(
sum_formula,
adduct,
mz_list[sum_formula][adduct][0][0],
measure_value_score[sum_formula][adduct],
iso_correlation_score[sum_formula][adduct],
iso_ratio_score[sum_formula][adduct],
moc_pass))
filename_out = '{}{}{}_pass_results.txt'.format(output_dir,os.sep,os.path.splitext(os.path.basename(filename_in))[0])
with open(filename_out,'w') as f_out:
f_out.write('ID,sf,adduct,mz,moc,spec,spat\n')
for sum_formula in sum_formulae:
for adduct in adducts:
if measure_value_score[sum_formula][adduct] > measure_tol and iso_correlation_score[sum_formula][adduct] > iso_corr_tol and iso_ratio_score[sum_formula][adduct] > iso_ratio_tol:
f_out.write('{},{},{},{},{},{},{}\n'.format(
sum_formulae[sum_formula]['db_id'],
sum_formula,adduct,
mz_list[sum_formula][adduct][0][0],
measure_value_score[sum_formula][adduct],
iso_correlation_score[sum_formula][adduct],
iso_ratio_score[sum_formula][adduct]))
In [14]:
## Re-Load results
#from pySpatialMetabolomics.tools import results_tools
#filename_out = '{}{}{}_full_results.txt'.format(output_dir,os.sep,os.path.splitext(os.path.basename(filename_in))[0])
#(measure_value_score,iso_correlation_score,iso_ratio_score,moc_pass) = results_tools.load_results(filename_out)
In [15]:
## Make Parula colormap
# todo - generate unique colourmap to avoid IP issues
from pySpatialMetabolomics.tools import colourmaps
c_map = colourmaps.make_cmap([(53,42,135)
,(2,104,225)
,(16,142,210)
,(15,174,185)
,(101,190,134)
,(192,188,96)
,(255,195,55)
,(249,251,14)],bit=True,name='parula')
In [16]:
## Output pass molecules
def check_pass(pass_thresh,pass_val):
tf = []
for v,t in zip(pass_val,pass_thresh):
tf.append(v>t)
if all(tf):
return True
else:
return False
def plot_images(ion_datacube,iso_spect,iso_max):
fig = plt.figure(figsize=(20,15),dpi=300)
ax = [ plt.subplot2grid((2, 4), (0, 0)),
plt.subplot2grid((2, 4), (0, 1)),
plt.subplot2grid((2, 4), (0, 2)),
plt.subplot2grid((2, 4), (0, 3)),
plt.subplot2grid((2, 4), (1, 0), colspan=4, rowspan=1)
]
for a in ax:
a.cla()
# plot images
for ii in range(0,iso_max):
im = ion_datacube.xic_to_image(ii)
# hot-spot removal
notnull=im>0
if np.sum(notnull==False)==np.shape(im)[0]*np.shape(im)[1]:
im=im
else:
im_q = np.percentile(im[notnull],q_val)
im_rep = im>im_q
im[im_rep] = im_q
ax[ii].imshow(im,cmap=c_map)
ax[ii].set_title('m/z: {:3.4f}'.format(mz_list[sum_formula][adduct][0][ii]))
# plot spectrum
notnull=ion_datacube.xic_to_image(0)>0
data_spect = [np.sum(ion_datacube.xic_to_image(ii)) for ii in range(0,iso_max)]
data_spect = data_spect / np.linalg.norm(data_spect)
iso_spect = iso_spect/np.linalg.norm(iso_spect)
markerline, stemlines, baseline = ax[4].stem( mz_list[sum_formula][adduct][0][0:iso_max],iso_spect,'g')
plt.setp(stemlines, linewidth=2, color='g') # set stems colors
plt.setp(markerline, 'markerfacecolor', 'g','markeredgecolor','g') # make points
markerline, stemlines, baseline = ax[4].stem( mz_list[sum_formula][adduct][0][0:iso_max],data_spect,'r')
plt.setp(stemlines, linewidth=2, color='r') # set stems colors
plt.setp(markerline, 'markerfacecolor', 'r','markeredgecolor','r') # make points
#plot proxy artist
proxies=[]
h, = plt.plot(mz_list[sum_formula][adduct][0][0],[0],'-g')
proxies.append(h)
h, = plt.plot(mz_list[sum_formula][adduct][0][0],[0],'-r')
proxies.append(h)
ax[4].legend(proxies,('predicted pattern','data pattern'), numpoints=1)
return fig,ax
pass_formula = []
n_iso=4
q_val=99
for sum_formula in mz_list:
for adduct in adducts:
ch_pass = check_pass((measure_tol,iso_corr_tol,iso_ratio_tol),(measure_value_score[sum_formula][adduct],iso_correlation_score[sum_formula][adduct],iso_ratio_score[sum_formula][adduct]))
if ch_pass:
pass_formula.append('{} {}'.format(sum_formula,adduct))
ion_datacube = IMS_dataset.get_ion_image(mz_list[sum_formula][adduct][0],ppm)
iso_max = min((n_iso,len(mz_list[sum_formula][adduct][0])))
iso_spect = mz_list[sum_formula][adduct][1][0:iso_max]
fig, ax = plot_images(ion_datacube,iso_spect,iso_max)
fig.suptitle('{} [M+{}]. Measure of Chaos: {:3.4f}, Image Correlation: {:3.4f} Isotope Score: {:3.4f}'.format(sum_formula,adduct,measure_value_score[sum_formula][adduct],iso_correlation_score[sum_formula][adduct],iso_ratio_score[sum_formula][adduct]))
#SAVE figure
plt.savefig('{}/{}_{}_{}.png'.format(output_dir,os.path.basename(IMS_dataset.file_dir),sum_formula,adduct))
plt.clf()
In [18]:
# View the output from a specific metabolite
sum_formula='C21H21N'
adduct = 'H'
q=99
ion_datacube = IMS_dataset.get_ion_image(mz_list[sum_formula][adduct][0],ppm)
iso_max = min((n_iso,len(mz_list[sum_formula][adduct][0])))
iso_spect = mz_list[sum_formula][adduct][1][0:iso_max]
fig, ax = plot_images(ion_datacube,iso_spect,iso_max)
fig.suptitle('{} [M+{}]. Measure of Chaos: {:3.4f}, Image Correlation: {:3.4f} Isotope Score: {:3.4f}'.format(sum_formula,adduct,measure_value_score[sum_formula][adduct],iso_correlation_score[sum_formula][adduct],iso_ratio_score[sum_formula][adduct]))
In [ ]: