In [1]:
import sys
# os.environ['R_LIBS_USER'] = '/project/projectdirs/metatlas/r_pkgs/'
curr_ld_lib_path = ''
# os.environ['LD_LIBRARY_PATH'] = curr_ld_lib_path + ':/project/projectdirs/openmsi/jupyterhub_libs/boost_1_55_0/lib' + ':/project/projectdirs/openmsi/jupyterhub_libs/lib'
# sys.path.insert(0, '/project/projectdirs/metatlas/python_pkgs/')
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')
from metatlas import metatlas_objects as metob
from metatlas import h5_query as h5q
import qgrid
from matplotlib import pyplot as plt
import pandas as pd
import os
import tables
import pickle
%matplotlib inline
import dill
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
# from rdkit.Chem.rdMolDescriptors import ExactMolWt
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG,display
In [57]:
# my_file = 'save_data_pos_exudates_only.pkl'
# my_file = 'save_data_neg_exudates_only.pkl'
# my_file = 'save_data_pos_cassettes.pkl'
my_file = '20160203_KBL-BC_Root-Exudate_Hilic_QExactive_Trial-Run_POS.pkl'
# my_file = 'save_data_neg_archetypes_hilic.pkl'
# my_file = 'save_data_pos_archetypes_hilic_istds.pkl'
# project_label = 'kate_exudates_metatlas_exudates_neg_only'
project_label = '20160203_KBL-BC_Root-Exudate_Hilic_QExactive_Trial-Run_POS'
with open(my_file,'r') as f:
data = dill.load(f)
In [58]:
print data[1][1].keys()
print data[1][1]['data'].keys()
In [59]:
data[0][0]['data']
Out[59]:
In [60]:
import re
export_group_names = []
for i,d in enumerate(data):
newstr = d[0]['group'].name
export_group_names.append(newstr)
# print export_group_names
In [61]:
import re
export_file_names = []
for i,d in enumerate(data):
newstr = os.path.basename(d[0]['lcmsrun'].hdf5_file)
export_file_names.append(newstr)
# print export_file_names
In [62]:
import re
from collections import defaultdict
export_compound_names = []
export_compound_objects = []
for i,d in enumerate(data[0]):
export_compound_objects.append(d['identification'])
if len(d['identification'].compound) > 0:
str = d['identification'].compound[0].name
else:
str = d.name
newstr = '%s_%s_%s_%5.2f'%(str,d['identification'].mz_references[0].detected_polarity,d['identification'].mz_references[0].adduct,d['identification'].rt_references[0].rt_peak)
newstr = re.sub('\.', 'p', newstr) #2 or more in regexp
newstr = re.sub('[\[\]]','',newstr)
newstr = re.sub('[^A-Za-z0-9+-]+', '_', newstr)
newstr = re.sub('i_[A-Za-z]+_i_', '', newstr)
if newstr[0] == '_':
newstr = newstr[1:]
if newstr[0] == '-':
newstr = newstr[1:]
if newstr[-1] == '_':
newstr = newstr[:-1]
newstr = re.sub('[^A-Za-z0-9]{2,}', '', newstr) #2 or more in regexp
export_compound_names.append(newstr)
#If duplicate compound names exist, then append them with a number
D = defaultdict(list)
for i,item in enumerate(export_compound_names):
D[item].append(i)
D = {k:v for k,v in D.items() if len(v)>1}
for k in D.keys():
for i,f in enumerate(D[k]):
export_compound_names[f] = '%s%d'%(export_compound_names[f],i)
for i, e in enumerate (export_compound_names):
print i,e
In [63]:
def plot_chromatogram(d,file_name, ax=None):
import numpy as np
from textwrap import wrap
if ax is None:
ax = plt.gca()
plt.rcParams['pdf.fonttype']=42
plt.rcParams['pdf.use14corefonts'] = True
plt.rcParams['text.usetex'] = False
plt.rcParams.update({'font.size': 12})
plt.rcParams.update({'font.weight': 'bold'})
plt.rcParams['axes.linewidth'] = 2 # set the value globally
rt_min = d['identification'].rt_references[0].rt_min
rt_max = d['identification'].rt_references[0].rt_max
rt_peak = d['identification'].rt_references[0].rt_peak
if len(d['data']['eic']['rt']) > 0:
x = d['data']['eic']['rt']
y = d['data']['eic']['intensity']
ax.plot(x,y,'k-',linewidth=2.0,alpha=1.0)
myWhere = np.logical_and(x>=rt_min, x<=rt_max )
ax.fill_between(x,0,y,myWhere, facecolor='c', alpha=0.3)
ax.axvline(rt_min, color='k',linewidth=2.0)
ax.axvline(rt_max, color='k',linewidth=2.0)
ax.axvline(rt_peak, color='r',linewidth=2.0)
# ax.set_xlabel('Time (min)',weight='bold')
# ax.set_ylabel('Intensity (au)',weight='bold')
ax.set_title("\n".join(wrap(file_name,54)),fontsize=12,weight='bold')
def plot_all_chromatograms_all_files(data,nCols,export_file_names,export_compound_names,share_y,project_label):
import time
d = 'data/%s/chromatograms/'%project_label
if not os.path.exists(d):
os.makedirs(d)
nRows = int(np.ceil(len(export_file_names)/float(nCols)))
for compound_idx,compound in enumerate(export_compound_names):
starttime = time.time()
f, ax = plt.subplots(nRows, nCols, sharey=share_y,figsize=(8*nCols,nRows * 6)) #original 8 x 6
# plt.tight_layout()
print "figure created in ", time.time() - starttime, " seconds"
ax = ax.flatten()
parttime = time.time()
for i,fname in enumerate(export_file_names):
p = plot_chromatogram(data[i][compound_idx], fname, ax=ax[i])
print i, "subplots created in ", time.time() - parttime, " seconds"
# parttime = time.time()
# print "tight layout is ", time.time() - parttime, " seconds"
parttime = time.time()
f.savefig('%s%s.pdf'%(d,export_compound_names[compound_idx]))
print "time to save figure is ", time.time() - parttime, " seconds"
parttime = time.time()
# f.clear()
plt.close('all')#f.clear()
print "time to clear and close figure is ", time.time() - parttime, " seconds"
print " "
def plot_all_compounds_for_each_file(data,nCols,export_file_names,export_compound_names,share_y,project_label):
import time
d = 'data/%s/one_file_chromatograms/'%project_label
if not os.path.exists(d):
os.makedirs(d)
nRows = int(np.ceil(len(export_compound_names)/float(nCols)))
for file_idx,my_file in enumerate(export_file_names):
f, ax = plt.subplots(nRows, nCols, sharey=share_y,figsize=(8*nCols,nRows * 6)) #original 8 x 6
# plt.tight_layout()
ax = ax.flatten()
for i,compound in enumerate(export_compound_names):
p = plot_chromatogram(data[file_idx][i], compound, ax=ax[i])
f.savefig('%s%s.pdf'%(d,export_file_names[file_idx]))
plt.close('all')#f.clear()
In [64]:
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
patts= (
# Imidazoles
('[n+;H]','n'),
# Amines
('[N+;!H0]','N'),
# Carboxylic acids and alcohols
('[$([O-]);!$([O-][#7])]','O'),
# Thiols
('[S-;X1]','S'),
# Sulfonamides
('[$([N-;X2]S(=O)=O)]','N'),
# Enamines
('[$([N-;X2][C,N]=C)]','N'),
# Tetrazoles
('[n-]','[nH]'),
# Sulfoxides
('[$([S-]=O)]','S'),
# Amides
('[$([N-]C=O)]','N'),
)
return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y,False)) for x,y in patts]
_reactions=None
def NeutraliseCharges(mol, reactions=None):
global _reactions
if reactions is None:
if _reactions is None:
_reactions=_InitialiseNeutralisationReactions()
reactions=_reactions
# mol = Chem.MolFromSmiles(smiles)
replaced = False
for i,(reactant, product) in enumerate(reactions):
while mol.HasSubstructMatch(reactant):
replaced = True
rms = AllChem.ReplaceSubstructs(mol, reactant, product)
rms_smiles = Chem.MolToSmiles(rms[0])
mol = Chem.MolFromSmiles(rms_smiles)
if replaced:
return (mol, True) #Chem.MolToSmiles(mol,True)
else:
return (mol, False)
def drawStructure_ShowingFragment(pactolus_tree,fragment_idx,myMol,myMol_w_Hs):
drawer = rdMolDraw2D.MolDraw2DSVG(600,300)
fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr'])[0]
mark_atoms_no_H = []
for a_index in fragment_atoms:
if myMol_w_Hs.GetAtomWithIdx(a_index).GetSymbol() != 'H':
mark_atoms_no_H.append(a_index)
rdDepictor.Compute2DCoords(myMol)
drawer.DrawMolecule(myMol,highlightAtoms=mark_atoms_no_H)
drawer.FinishDrawing()
svg = drawer.GetDrawingText().replace('svg:','')
return svg
def drawStructure_Fragment(pactolus_tree,fragment_idx,myMol,myMol_w_Hs):
fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr'])[0]
depth_of_hit = np.sum(pactolus_tree[fragment_idx]['bond_bool_arr'])
mol2 = deepcopy(myMol_w_Hs)
# Now set the atoms you'd like to remove to dummy atoms with atomic number 0
fragment_atoms = np.where(pactolus_tree[fragment_idx]['atom_bool_arr']==False)[0]
for f in fragment_atoms:
mol2.GetAtomWithIdx(f).SetAtomicNum(0)
# Now remove dummy atoms using a query
mol3 = Chem.DeleteSubstructs(mol2, Chem.MolFromSmarts('[#0]'))
mol3 = Chem.RemoveHs(mol3)
# You get what you are looking for
return moltosvg(mol3),depth_of_hit
def moltosvg(mol,molSize=(450,150),kekulize=True):
mc = Chem.Mol(mol.ToBinary())
if kekulize:
try:
Chem.Kekulize(mc)
except:
mc = Chem.Mol(mol.ToBinary())
if not mc.GetNumConformers():
rdDepictor.Compute2DCoords(mc)
drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
drawer.DrawMolecule(mc)
drawer.FinishDrawing()
svg = drawer.GetDrawingText()
# It seems that the svg renderer used doesn't quite hit the spec.
# Here are some fixes to make it work in the notebook, although I think
# the underlying issue needs to be resolved at the generation step
return svg.replace('svg:','')
def get_ion_from_fragment(frag_info,spectrum):
hit_indices = np.where(np.sum(frag_info,axis=1))
hit = spectrum[hit_indices,:][0]
return hit,hit_indices
In [65]:
#plot msms and annotate
#compound name
#formula
#adduct
#theoretical m/z
#histogram of retention times
#scatter plot of retention time with peak area
#retention time
#print all chromatograms
#structure
def file_with_max_precursor_intensity(data,compound_idx):
idx = []
my_max = 0
for i,d in enumerate(data):
temp = d[compound_idx]['data']['msms']['data']
if d[compound_idx]['data']['msms']['polarity'] or d[compound_idx]['data']['msms']['polarity'] ==0 :
m = np.max(temp['precursor_intensity'])
if m > my_max:
my_max = m
idx = i
return idx,my_max
def make_identification_figure(data,file_idx,compound_idx,export_name,project_label):
d = 'data/%s/identification/'%project_label
if not os.path.exists(d):
os.makedirs(d)
fig = plt.figure(figsize=(20,20))
# fig = plt.figure()
ax = fig.add_subplot(211)
ax.set_title(data[file_idx][compound_idx]['identification'].compound[0].name,fontsize=12,weight='bold')
ax.set_xlabel('m/z',fontsize=12,weight='bold')
ax.set_ylabel('intensity',fontsize=12,weight='bold')
mz = data[file_idx][compound_idx]['data']['msms']['data']['mz']
zeros = np.zeros(data[file_idx][compound_idx]['data']['msms']['data']['mz'].shape)
intensity = data[idx][compound_idx]['data']['msms']['data']['i']
ax.vlines(mz,zeros,intensity,colors='r',linewidth = 2)
sx = np.argsort(intensity)[::-1]
labels = [1.001e9]
for i in sx:
if np.min(np.abs(mz[i] - labels)) > 0.1 and intensity[i] > 0.02 * np.max(intensity):
ax.annotate('%5.4f'%mz[i], xy=(mz[i], 1.01*intensity[i]),rotation = 90, horizontalalignment = 'center', verticalalignment = 'left')
labels.append(mz[i])
plt.tight_layout()
L = plt.ylim()
plt.ylim(L[0],L[1]*1.12)
inchi = data[file_idx][compound_idx]['identification'].compound[0].inchi
myMol = Chem.MolFromInchi(inchi.encode('utf-8'))
myMol,neutralised = NeutraliseCharges(myMol)
image = Draw.MolToImage(myMol, size = (300,300) )
ax2 = fig.add_subplot(223)
ax2.imshow(image)
ax2.axis('off')
# SVG(moltosvg(myMol))
ax3 = fig.add_subplot(224)
ax3.set_xlim(0,1)
mz_theoretical = data[file_idx][compound_idx]['identification'].mz_references[0].mz
mz_measured = data[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid']
if not mz_measured:
mz_measured = 0
delta_mz = abs(mz_theoretical - mz_measured)
delta_ppm = delta_mz / mz_theoretical * 1e6
rt_theoretical = data[file_idx][compound_idx]['identification'].rt_references[0].rt_peak
rt_measured = data[file_idx][compound_idx]['data']['ms1_summary']['rt_centroid']
if not rt_measured:
rt_measured = 0
ax3.text(0,1,'%s'%os.path.basename(data[file_idx][compound_idx]['lcmsrun'].hdf5_file),fontsize=12)
ax3.text(0,0.95,'%s %s'%(data[file_idx][compound_idx]['identification'].compound[0].name, data[file_idx][compound_idx]['identification'].mz_references[0].adduct),fontsize=12)
ax3.text(0,0.9,'m/z theoretical = %5.4f, measured = %5.4f, %5.4f ppm difference'%(mz_theoretical, mz_measured, delta_ppm),fontsize=12)
ax3.text(0,0.85,'Expected Elution of %5.2f minutes, %5.2f min actual'%(rt_theoretical,rt_measured),fontsize=12)
ax3.set_ylim(0.2,1.01)
ax3.axis('off')
# plt.show()
fig.savefig('%sIdentifications_%s.pdf'%(d,export_name))
fig.clear()
plt.close('all')#f.clear()
In [66]:
inchi = '%s'%data[idx][9]['identification'].compound[0].InChI
myMol = Chem.MolFromInchi(inchi.encode('utf-8'))
SVG(moltosvg(myMol))
In [67]:
# column name list
# row name list
import pandas as pd
import os
peak_height = pd.DataFrame( index=export_compound_names, columns=export_file_names, dtype=float)
peak_area = pd.DataFrame( index=export_compound_names, columns=export_file_names, dtype=float)
# peak_height['compound'] = compound_list
# peak_height.set_index('compound',drop=True)
for i,dd in enumerate(data):
for j,d in enumerate(dd):
if not d['data']['ms1_summary']['peak_height']:
peak_height.ix[export_compound_names[j],export_file_names[i]] = 0
peak_area.ix[export_compound_names[j],export_file_names[i]] = 0
else:
peak_height.ix[export_compound_names[j],export_file_names[i]] = d['data']['ms1_summary']['peak_height']
peak_area.ix[export_compound_names[j],export_file_names[i]] = d['data']['ms1_summary']['peak_area']
columns = []
for i,f in enumerate(export_file_names):
columns.append((export_group_names[i],f))
peak_height.columns = pd.MultiIndex.from_tuples(columns,names=['group', 'file'])
peak_area.columns = pd.MultiIndex.from_tuples(columns,names=['group', 'file'])
In [68]:
d = 'data/%s/sheets/'%project_label
if not os.path.exists(d):
os.makedirs(d)
peak_height.to_csv('data/%s/sheets/compounds peak height.tab'%project_label,sep='\t')
peak_area.to_csv('data/%s/sheets/compounds peak area.tab'%project_label,sep='\t')
In [69]:
from matplotlib import pyplot as plt
def plot_errorbar_plots(df,compound_list,project_label):
d = 'data/%s/error_bar/'%project_label
if not os.path.exists(d):
os.makedirs(d)
for compound in compound_list:
m = df.ix[compound].groupby(level='group').mean()
e = df.ix[compound].groupby(level='group').std()
c = df.ix[compound].groupby(level='group').count()
for i in range(len(e)):
if c[i]>0:
e[i] = e[i] / c[i]**0.5
f, ax = plt.subplots(1, 1,figsize=(20,12))
m.plot(yerr=e, kind='bar',ax=ax)
ax.set_title(compound,fontsize=12,weight='bold')
plt.tight_layout()
f.savefig('%sErrorBar_%s.pdf'%(d,compound))
f.clear()
plt.close('all')#f.clear()
In [70]:
idx, m = file_with_max_precursor_intensity(data,compound_idx)
print idx
print m
In [51]:
for compound_idx in range(len(export_compound_names)):
idx, m = file_with_max_precursor_intensity(data,compound_idx)
if m:
make_identification_figure(data,idx,compound_idx,export_compound_names[compound_idx],project_label)
plot_errorbar_plots(peak_height,export_compound_names,project_label)
nCols = 2
compound_idx = 9
share_y=True
t = plot_all_chromatograms_all_files(data,nCols,export_file_names,export_compound_names,share_y,project_label)
In [71]:
nCols = 8
share_y=False
t = plot_all_compounds_for_each_file(data,nCols,export_file_names,export_compound_names,share_y,project_label)
In [ ]: