Let's experiment with creating a bokeh heatmap of sample by phylum-level taxa using Costello et al. 2009 "Whole Body" dataset. The bokeh plotting code is adapted from their US unemployment heatmap tutorial.
In [1]:
from collections import OrderedDict
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import average
from bokeh.plotting import *
from bokeh.objects import HoverTool
import skbio
from skbio.math.diversity.beta import pw_distances
output_notebook() # to display bokeh plots in the notebook
Let's define a couple of helper functions to assist with hierarchical clustering and collapsing taxonomy.
In [2]:
def ordered_ids_from_lm(lm, ids):
return [t.name for t in skbio.TreeNode.from_linkage_matrix(lm, ids).tips()]
def get_taxonomy_collapser(level):
def f(taxonomy):
taxonomy = taxonomy.split(';')
try:
result = taxonomy[level]
except IndexError:
result = None
return result
return f
Load "classic" (TSV) BIOM table into a DataFrame.
In [3]:
otu_table = pd.read_csv('data/otu_table_mc1_w_tax_even700.txt', sep='\t', skiprows=1, index_col="#OTU ID")
tax_series = otu_table['taxonomy']
otu_table = otu_table.drop('taxonomy', 1)
Next:
DistanceMatrix;DataFrame.
In [4]:
sample_dm = pw_distances(otu_table.T)
sample_lm = average(sample_dm.condensed_form())
clustered_sample_ids = ordered_ids_from_lm(sample_lm, otu_table.columns)
otu_table = otu_table.reindex(columns=clustered_sample_ids)
Collapse the OTU table at the phylum level.
In [5]:
level = 1
otu_table['taxonomy'] = tax_series
otu_table['tax_%d' % level] = otu_table['taxonomy'].apply(get_taxonomy_collapser(level))
otu_table = otu_table.drop('taxonomy', 1)
taxa_table = otu_table.groupby('tax_%d' % level).sum()
Next:
DistanceMatrix;DataFrame.
In [6]:
observation_dm = pw_distances(taxa_table)
observation_lm = average(observation_dm.condensed_form())
clustered_observation_ids = ordered_ids_from_lm(observation_lm, taxa_table.index)
taxa_table = taxa_table.reindex(index=clustered_observation_ids)
Pull our data from the DataFrame into a single-dimensional list for plotting. Map abundances to colors using a colormap.
In [7]:
taxa_table = taxa_table.T
samples = taxa_table.index.tolist()
taxa = map(str, taxa_table.columns.tolist())
plot_data = {'samples': [], 'taxa': [], 'abundances': []}
for sample in samples:
for tax in taxa:
plot_data['samples'].append(sample)
plot_data['taxa'].append(tax)
plot_data['abundances'].append(taxa_table[tax][sample])
rgba_colors = plt.get_cmap('YlGn')(mpl.colors.Normalize()(plot_data['abundances']))
hex_colors = [mpl.colors.rgb2hex(rgba) for rgba in rgba_colors]
plot_data['colors'] = hex_colors
Create a bokeh heatmap of samples by phylum-level taxa. This plot illustrates the clustering of samples by body site and groups phyla with similar abundance profiles.
In [8]:
source = ColumnDataSource(data=plot_data)
output_file('heatmap.html')
figure()
rect('taxa', 'samples', 1, 1, source=source,
x_range=list(reversed(taxa)), y_range=samples,
color='colors', line_color=None,
tools="resize,hover,previewsave", title="QIIME goes bokeh!",
plot_width=1200, plot_height=4000)
grid().grid_line_color = None
axis().axis_line_color = None
axis().major_tick_line_color = None
axis().major_label_text_font_size = "10pt"
axis().major_label_standoff = 0
xaxis().location = "top"
xaxis().major_label_orientation = np.pi/3
hover = [t for t in curplot().tools if isinstance(t, HoverTool)][0]
hover.tooltips = OrderedDict([
('Sample ID', '@samples'),
('Taxa ID', '@taxa'),
('Abundance', '@abundances'),
])
show()
We originally used matplotlib's jet colormap (see screenshot below) but decided to use a more accurately perceivable colormap based on this SciPy 2014 presentation.
In [9]:
from IPython.display import Image
Image(filename='jet-example.png')
Out[9]: