In [4]:
%matplotlib inline
from matplotlib import pylab as plt
from metatlas import h5_query
from metatlas import metatlas_objects
import glob, os
import numpy as np
import tables
In [9]:
%system ls -t /project/projectdirs/metatlas/data_for_metatlas_2/
Out[9]:
In [8]:
myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6A_FinalSet/'
# myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6AMSMS/'
# myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6Astandards_FRESH/'
# # myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6A_Set1/'
# # myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6AStandards/'
myFiles = glob.glob('%s*.mzML'%myPath)
myFiles.sort()
for f in myFiles:
print f
metatlas_objects.mzml_to_hdf('%s'%(f))
In [7]:
%system ls $myPath
Out[7]:
In [65]:
c1 = metatlas_objects.Compound(name = '2''-deoxyadenosine',
formula = 'C10H13N5O3',
adducts = 'H+',
mz = 252.109115,
mz_threshold = 15,
rt_min = 2.5,
rt_max = 2.8,
rt_peak = 2.65,
neutral_mass = 251.101839,
pubchem_id = 13730)
c2 = metatlas_objects.Compound(name = 'N6-Methyl-2''-deoxyadenosine',
formula = 'C11H15N5O3',
adducts = 'H+',
mz = 266.124765,
mz_threshold = 15,
rt_min = 2.7,
rt_max = 3.0,
rt_peak = 2.85,
neutral_mass = 265.117489,
pubchem_id = 168948 )
In [66]:
myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6AMSMS/'
myfile = '20150828_C18_POS_MSMS_ACN_Kin150mm_C4_Chlamy_DNA_4_R4_8ul_MSMS_thegoodone.h5'
msms_chlamy_file = '%s%s' % (myPath,myfile)
myfile = '20150828_C18_POS_MSMS_ACN_Kin150mm_A1_Arabidopsis_DNA_1_R25_8ul_MSMS.h5'
msms_arab_file = '%s%s' % (myPath,myfile)
myfile = '20150828_C18_POS_MSMS_ACN_Kin150mm_N1_Nanochloropsis_DNA_1_R31_8ul_MSMS.h5'
msms_nanno_file = '%s%s' % (myPath,myfile)
myfile = '20150828_C18_POS_MSMS_ACN_Kin150mm__pt_1_ugmL__8ul.h5'
msms_reference_file = '%s%s' % (myPath,myfile)
myPath = '/project/projectdirs/metatlas/data_for_metatlas_2/m6A_FinalSet/'
flist = ['20150824_C18_POS_MSMS_ACN_Kin150mm_A1_Arabidopsis_DNA_1_R25_8ul.h5',
'20150824_C18_POS_MSMS_ACN_Kin150mm_A2_Arabidopsis_DNA_2_R26_8ul.h5',
'20150824_C18_POS_MSMS_ACN_Kin150mm_A3_Arabidopsis_DNA_3_R27_8ul.h5']
arabidopsis_files = []
for myfile in flist:
arabidopsis_files.append('%s%s' % (myPath,myfile) )
flist = ['20150824_C18_POS_MSMS_ACN_Kin150mm_C1_Chlamy_DNA_1_R1_8ul.h5',
'20150824_C18_POS_MSMS_ACN_Kin150mm_C2_Chlamy_DNA_2_R2_8ul.h5',
'20150824_C18_POS_MSMS_ACN_Kin150mm_C3_Chlamy_DNA_3_R3_8ul.h5',
'20150824_C18_POS_MSMS_ACN_Kin150mm_C4_Chlamy_DNA_4_R4_8ul.h5']
chlamy_files = []
for myfile in flist:
chlamy_files.append('%s%s' % (myPath,myfile) )
flist = '20150824_C18_POS_MSMS_ACN_Kin150mm_N1_Nanochloropsis_DNA_1_R31_8ul.h5'
nanochloropsis_files = '%s%s' % (myPath,flist)
In [13]:
# Define a function to help us get
# MS1 EIC data
# get spectrum near RT-peak
# Get MSMS data for each compound
# get MS1 data and summarize the datapoints
# get heatmap
def get_XIC(h5file, min_mz, max_mz, ms_level, polarity, bins=None, **kwargs):
"""
Get Extracted-ion chromatogram (XIC) data - RT vs. cum sum of intensities
Parameters
----------
h5file : table file handle
Handle to an open tables file.
min_mz : float
Minimum m/z value.
max_mz : float
Maximum m/z value.
ms_level : int
MS Level.
polarity: int
Plus proton (1) or Minus proton (0).
bins : int or array-like, optional.
Desired bins to use for the histogram, defaults to unique retention
times.
**kwargs
Optional search modifiers. (e.g. precursor_MZ=1,
min_collision_energy=4)
Returns
-------
out : tuple of arrays
(rt_vals, i_vals) arrays in the desired range. The intensities are
scaled to [0, 100].
"""
import metatlas
data = metatlas.get_data(h5file, ms_level, polarity, min_mz=min_mz,
max_mz=max_mz, **kwargs)
if bins is None:
bins = np.unique(data['rt'])
i, rt = np.histogram(data['rt'], bins=bins, weights=data['i'])
# center the bins
rt = (rt[:-1] + rt[1:]) / 2
return rt, i
def get_data_for_a_compound(compound,what_to_get,h5file,polarity):
"""
A helper function to query the various metatlas data selection
commands for a compound defined in an experimental atlas.
Parameters
----------
compound : a MetAtlas Object for a Compound Class
this contains the m/z and retention time constraints to select data
what_to_get : a list of strings
this contains one or more of [ 'ms1_summary', 'eic', '2dhist', 'msms' ]
h5_file : str
Path to input_file
polarity : int
[0 or 1] for negative or positive ionzation
Returns
-------
"""
#TODO : polarity should be handled in the experiment and not a loose parameter
import numpy as np
import metatlas
import tables
#get a pointer to the hdf5 file
fid = tables.open_file(h5file)
#parse varaiables from the metatlas compound object
name = compound.name
formula = compound.formula
adducts = compound.adducts
mz_theor = compound.mz
ppm_uncertainty = compound.mz_threshold
ms_level = 1
rt_min = compound.rt_min
rt_max = compound.rt_max
mz_min = mz_theor - mz_theor * ppm_uncertainty / 1e6
mz_max = mz_theor + mz_theor * ppm_uncertainty / 1e6
return_data = {}
return_data['compound'] = {}
return_data['compound']['name'] = name
return_data['compound']['formula'] = formula
return_data['compound']['mz'] = mz_theor
return_data['compound']['adducts'] = adducts
if 'ms1_summary' in what_to_get:
#Get Summary Data
#First get MS1 Raw Data
ms_level=1
ms1_data = metatlas.get_data(fid,
ms_level,
polarity,
min_mz=mz_min,
max_mz=mz_max,
min_rt=rt_min,
max_rt=rt_max)
return_data['ms1_summary'] = {}
return_data['ms1_summary']['mz_centroid'] = np.sum(np.multiply(ms1_data['i'],ms1_data['mz'])) / np.sum(ms1_data['i'])
return_data['ms1_summary']['rt_centroid'] = np.sum(np.multiply(ms1_data['i'],ms1_data['rt'])) / np.sum(ms1_data['i'])
idx = np.argmax(ms1_data['i'])
return_data['ms1_summary']['mz_max'] = ms1_data['mz'][idx]
return_data['ms1_summary']['rt_max'] = ms1_data['rt'][idx]
return_data['ms1_summary']['peak_height'] = ms1_data['i'][idx]
return_data['ms1_summary']['peak_area'] = np.sum(ms1_data['i'])
if 'eic' in what_to_get:
#Get Extracted Ion Chromatogram
# TODO : If a person calls for summary, then they will already have the MS1 raw data
rt,intensity = get_XIC(fid,
mz_min,
mz_max,
ms_level,
polarity)
return_data['eic'] = {}
return_data['eic']['rt'] = rt
return_data['eic']['intensity'] = intensity
if '2dhist' in what_to_get:
#Get 2D histogram of intensity values in m/z and retention time
mzEdges = np.logspace(np.log10(100),np.log10(1000),10000)
# mzEdges = np.linspace(mz_theor - 3, mz_theor + 30,100) #TODO : number of mz bins should be an optional parameter
rtEdges = np.linspace(rt_min,rt_max,100) #TODO : number of rt bins should be an optional parameter. When not provided, it shoulddefauly to unique bins
ms_level = 1 #TODO : ms_level should be a parameter
return_data['2dhist'] = {}
return_data['2dhist'] = metatlas.get_heatmap(fid,mzEdges,rtEdges,ms_level,polarity)
if 'msms' in what_to_get:
#Get Fragmentation Data
ms_level=2
fragmentation_data = metatlas.get_data(fid,
ms_level,
polarity,
min_mz=0,
max_mz=mz_theor+10,#TODO : this needs to be a parameter
min_rt=rt_min,
max_rt=rt_max,
min_precursor_MZ=mz_min-0.01,
max_precursor_MZ=mz_max+0.01)
# min_precursor_intensity=0, #TODO : this needs to be a parameter
# max_precursor_intensity=0,#TODO : this needs to be a parameter
# min_collision_energy=0,#TODO : this needs to be a parameter
# max_collision_energy=0)#TODO : this needs to be a parameter
return_data['msms'] = fragmentation_data
return return_data
In [14]:
reference_data = get_data_for_a_compound(c2,['ms1_summary','msms','eic'],msms_reference_file,1)
chlamy_data = get_data_for_a_compound(c2,['ms1_summary','msms','eic'],msms_chlamy_file,1)
arab_data = get_data_for_a_compound(c2,['ms1_summary','msms','eic'],msms_arab_file,1)
nanno_data = get_data_for_a_compound(c2,['ms1_summary','msms','eic'],msms_nanno_file,1)
In [15]:
# create a new plot with a title and axis labels
f = plt.figure(num=None, figsize=(24, 7), dpi=300, facecolor='w', edgecolor='k')
# x = data['msms']['mz']
# y0 = np.zeros(data['msms']['i'].shape)
# y = data['msms']['i']
key = 'msms'
data = reference_data
x = data[key]['mz']
y0 = np.zeros(data[key]['i'].shape)
y = data[key]['i']
y = y / max(y)
markerline, stemlines, baseline = plt.stem(x,y,'-',label='Frag. Spectra from Standard')
plt.setp(markerline, 'markerfacecolor', 'k')
plt.setp(baseline, 'color','k', 'linewidth', 2)
plt.setp(stemlines, 'color','k', 'linewidth', 2)
data = arab_data
x = data[key]['mz']
y0 = np.zeros(data[key]['i'].shape)
y = data[key]['i']
y = -1* y / max(y)
markerline, stemlines, baseline = plt.stem(x, y, '-',label='Frag. Spectra from Arabidopsis')
plt.setp(markerline, 'markerfacecolor', 'r')
plt.setp(baseline, 'color','k', 'linewidth', 2)
plt.setp(stemlines, 'color','r', 'linewidth', 2)
# plt.setp(baseline, 'color','r', 'linewidth', 3)
plt.ylim(-1.05,1.05)
plt.xlim(50,300)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.ylabel('Intensity (au)',size=24)
plt.xlabel('m/z',size=24)
lg = plt.legend(loc = 'upper right',fontsize = 24)
lg.draw_frame(False)
plt.show()
f.savefig('6MA_Reference To Arabidopsis MSMS.pdf')
In [16]:
reference_data = get_data_for_a_compound(c1,['ms1_summary','msms','eic'],msms_reference_file,1)
chlamy_data = get_data_for_a_compound(c1,['ms1_summary','msms','eic'],msms_chlamy_file,1)
In [17]:
# create a new plot with a title and axis labels
f = plt.figure(num=None, figsize=(24, 7), dpi=300, facecolor='w', edgecolor='k')
# x = data['msms']['mz']
# y0 = np.zeros(data['msms']['i'].shape)
# y = data['msms']['i']
key = 'msms'
data = reference_data
x = data[key]['mz']
y0 = np.zeros(data[key]['i'].shape)
y = data[key]['i']
y = y / max(y)
markerline, stemlines, baseline = plt.stem(x,y,'-',label='Frag. Spectra from Standard')
plt.setp(markerline, 'markerfacecolor', 'k')
plt.setp(baseline, 'color','k', 'linewidth', 2)
plt.setp(stemlines, 'color','k', 'linewidth', 2)
data = chlamy_data
x = data[key]['mz']
y0 = np.zeros(data[key]['i'].shape)
y = data[key]['i']
y = -1* y / max(y)
markerline, stemlines, baseline = plt.stem(x, y, '-',label='Frag. Spectra from Chlamydomonas')
plt.setp(markerline, 'markerfacecolor', 'r')
plt.setp(baseline, 'color','k', 'linewidth', 2)
plt.setp(stemlines, 'color','r', 'linewidth', 2)
# plt.setp(baseline, 'color','r', 'linewidth', 3)
plt.ylim(-1.05,1.05)
plt.xlim(50,300)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.ylabel('Intensity (au)',size=24)
plt.xlabel('m/z',size=24)
lg = plt.legend(loc = 'upper right',fontsize = 24)
lg.draw_frame(False)
plt.show()
f.savefig('deoxyA_Reference To Chlamy MSMS.pdf')
In [67]:
chlamy_data = []
for f in chlamy_files:
chlamy_data.append(get_data_for_a_compound(c2,['ms1_summary','eic'],f,1))
arabidopsis_data = []
for f in arabidopsis_files:
arabidopsis_data.append(get_data_for_a_compound(c2,['ms1_summary','eic'],f,1))
nanochloropsis_data = []
nanochloropsis_data.append(get_data_for_a_compound(c2,['ms1_summary','eic'],nanochloropsis_files,1))
In [68]:
from scipy.signal import resample
GAUSS_WIDTH = 1 #ppm
x = np.linspace(-3*GAUSS_WIDTH,3*GAUSS_WIDTH,1 + 6*MZ_GAUSS_WIDTH / 0.5)
fi=np.exp(-0.5*((x)/MZ_GAUSS_WIDTH)**2)
fi = fi / sum(fi)
# filt_spectrum = np.convolve(TOTALSPECTRUM,f,'same')
f = plt.figure(num=None, figsize=(24, 7), dpi=300, facecolor='w', edgecolor='k')
for d in chlamy_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,sy,'-',color='green',markersize=20,label='Chlamydomonas',linewidth=2)
for d in arabidopsis_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,sy,'-',color='blue',markersize=20,label='Arabidopsis',linewidth=2)
for d in nanochloropsis_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,y,'-',color='red',markersize=20,label='Nannochloropsis',linewidth=2)
plt.xlim(c2.rt_min,c2.rt_max)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.ylabel('Intensity (au)',size=24)
plt.xlabel('Time (min)',size=24)
from collections import OrderedDict
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
lg = plt.legend(by_label.values(), by_label.keys(),fontsize=24)
lg.draw_frame(False)
plt.show()
f.savefig('6-methyl-deoxyadenosine_chromatograms_zoom_out.pdf')
In [69]:
chlamy_data = []
for f in chlamy_files:
chlamy_data.append(get_data_for_a_compound(c1,['ms1_summary','eic'],f,1))
arabidopsis_data = []
for f in arabidopsis_files:
arabidopsis_data.append(get_data_for_a_compound(c1,['ms1_summary','eic'],f,1))
nanochloropsis_data = []
nanochloropsis_data.append(get_data_for_a_compound(c1,['ms1_summary','eic'],nanochloropsis_files,1))
In [70]:
from scipy.signal import resample
GAUSS_WIDTH = 1 #ppm
x = np.linspace(-3*GAUSS_WIDTH,3*GAUSS_WIDTH,1 + 6*MZ_GAUSS_WIDTH / 0.5)
fi=np.exp(-0.5*((x)/MZ_GAUSS_WIDTH)**2)
fi = fi / sum(fi)
# filt_spectrum = np.convolve(TOTALSPECTRUM,f,'same')
f = plt.figure(num=None, figsize=(24, 7), dpi=300, facecolor='w', edgecolor='k')
for d in chlamy_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,sy,'-',color='green',markersize=20,label='Chlamydomonas',linewidth=2)
for d in arabidopsis_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,sy,'-',color='blue',markersize=20,label='Arabidopsis',linewidth=2)
for d in nanochloropsis_data:
x = np.linspace(0,5,5/0.001)
y = np.interp(x,d['eic']['rt'],d['eic']['intensity'])
sy = np.convolve(y[:],fi[:],'same')
plt.plot(x,y,'-',color='red',markersize=20,label='Nannochloropsis',linewidth=2)
plt.xlim(c1.rt_min,c1.rt_max)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.ylabel('Intensity (au)',size=24)
plt.xlabel('Time (min)',size=24)
from collections import OrderedDict
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
lg = plt.legend(by_label.values(), by_label.keys(),fontsize=24)
lg.draw_frame(False)
plt.show()
f.savefig('deoxyadenosine_chromatograms_zoom_out.pdf')
In [1]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);
In [3]:
import os
filename = os.path.basename(NOTEBOOK_FULL_PATH)
%system cp $filename /project/projectdirs/openmsi/www/
temp = '%s/%s'%('/project/projectdirs/openmsi/www',filename)
%system chmod 775 $temp
print 'http://nbviewer.ipython.org/url/portal.nersc.gov/project/openmsi/%s?flush_cache=true'%filename
In [ ]: