Heatmap visualisations for quantified (virus) metagenomics classifications

Initially developed for Genome Detective output

Date: 2018-05-16
Author: Sam Nooij

How to use:

Modify the variables in the 'Variables' code block to the desired values and run all cells
N.B. This requires Jupyter Notebook with a Python 2 kernel, and the numpy, pandas and bokeh libraries.


In [1]:
#Import all required libraries
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.io import output_notebook
import glob

Prepare Bokeh


In [2]:
#Load BokehJS into this notebook so that figures passed to show() are displayed inline
output_notebook()


Loading BokehJS ...

Variables


In [3]:
#User-adjustable settings: edit these before running the remaining cells.

#Name of the classification method; it is inserted into both titles below.
METHOD = "Genome Detective"

#Input: every file in FOLDER whose name matches FILE_WILDCARD is read.
FOLDER = "../results/"
FILE_WILDCARD = "*_results.csv"

#Names of the input columns that hold the classification and its load.
COLUMN_CLASSIFICATION = "Assignment"
COLUMN_LOAD = "Mapped # Reads"

#Heatmap appearance.
#Make sure to adjust the width and height to a size that fits your data.
HEATMAP_TITLE = "Viruses classified by {}".format(METHOD)
HEATMAP_WIDTH = 700
HEATMAP_HEIGHT = 500
HEATMAP_TITLE_FONT_SIZE = "16pt"
HEATMAP_AXIS_FONT_SIZE = "12pt"

#Base colour of the heatmap cells and the title.
#Selected from coffee beans: http://s.eatthis-cdn.com/media/images/ext/851818315/coffee-beans.jpg
COLOUR = ["#6b2d18"]

#Output: the html file the heatmap is saved to (directory path + file name),
# and the title of that file.
#This title will appear in the top bar of your browser window.
OUTPUT_FILE = "../results/test_heatmap.html"
OUTPUT_TITLE = "Heatmap of viruses classified by {}".format(METHOD)

Functions


In [4]:
#Define the required functions
def create_concatenated_dataframe(file_wildcard, folder):
    """
    Read all files that match a wildcard and combine them into one dataframe.

    Input: a filename with wildcard (e.g. "*_results.csv"),
            and a folder name (with or without a trailing path separator)
    Output: One concatenated dataframe of all the input files, in sorted
            file order, without the "Contigs" column and with an extra
            "sample" column holding the sample ID of each row
            (the file name minus the part of the wildcard after the "*")
    Raises: ValueError when no file in the folder matches the wildcard
    """
    import os #local import: only needed by this function

    #Step 1: create a sorted list of the files using glob
    #os.path.join makes the trailing separator of 'folder' optional
    file_list = sorted(glob.glob(os.path.join(folder, file_wildcard)))
    if not file_list:
        #fail early with a clear message instead of pandas' generic
        # "No objects to concatenate" error
        raise ValueError("No files matching '%s' found in folder '%s'"
                         % (file_wildcard, folder))

    file_suffix = file_wildcard.lstrip("*")

    #Step 2: open the files as dataframe, remove "Contigs" column and add sample IDs
    df_list = []
    for results_file in file_list:
        results_df = pd.read_csv(results_file)
        #remove unnecessary (and long!) column; tolerate files that lack it
        results_df = results_df.drop("Contigs", axis=1, errors="ignore")

        #derive the sample ID from the file name only, so that directory
        # names and the path separator cannot influence it
        file_name = os.path.basename(results_file)
        if file_suffix and file_name.endswith(file_suffix):
            sample_id = file_name[:-len(file_suffix)]
        else:
            #no usable suffix in the wildcard: fall back to the name without extension
            sample_id = os.path.splitext(file_name)[0]

        results_df["sample"] = sample_id
        df_list.append(results_df)

    #Step 3: concatenate the dataframes
    super_df = pd.concat(df_list, ignore_index=True)

    return super_df

def create_and_show_heatmap(dataframe, column_classification, column_load):
    """
    Draw a heatmap of samples (X-axis) against classifications (Y-axis).

    Input: Dataframe with a "sample" column, and the names of its columns:
        'classification'
        'load' (numbers, or strings with a tilde such as '~80')
    Output: heatmap (shown in the notebook and
            exported to the html file defined as variable above)

    The cell colour is the same everywhere; the load is expressed as
    opacity, scaled between 0.1 and 1.0 relative to the highest load.
    The layout is taken from the module-level variables HEATMAP_TITLE,
    HEATMAP_WIDTH, HEATMAP_HEIGHT, HEATMAP_TITLE_FONT_SIZE,
    HEATMAP_AXIS_FONT_SIZE and COLOUR; the output file from OUTPUT_FILE
    and OUTPUT_TITLE.
    """
    samples = dataframe["sample"]
    assignments = dataframe[column_classification]
    loads = dataframe[column_load]

    #a fix for numbers with tildes, e.g. '~80'
    loads_new = [int(value.strip('~')) if isinstance(value, str) else int(value)
                 for value in loads]

    #one colour entry per cell; always use the first colour (like the title
    # does) so that the list is as long as the data, whatever COLOUR holds
    colors = [COLOUR[0]] * len(loads_new)

    max_load = max(loads_new)
    if max_load:
        #the highest load becomes fully opaque, a load of zero gets alpha 0.1
        alphas = [min(x / float(max_load), 0.9) + 0.1 for x in loads_new]
    else:
        #all loads are zero: do not divide by zero, use the minimum opacity
        alphas = [0.1] * len(loads_new)

    #'loads' (not 'loads_new') is passed on, so that the hover tool shows
    # the values as they are written in the input, e.g. '~80'
    source = ColumnDataSource(
            data = dict(samples=samples, assignments=assignments, colors=colors, loads=loads, alphas=alphas)
        )

    TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

    p = figure(title = HEATMAP_TITLE,
              x_range = list(sorted(set(samples))),
              y_range = list(reversed(sorted(set(assignments)))), #reverse to order 'from top to bottom'
              x_axis_location = "above",
              toolbar_location="right",
              tools = TOOLS)

    #NOTE(review): newer Bokeh releases name these properties 'width' and
    # 'height' - confirm against the installed Bokeh version
    p.plot_width = HEATMAP_WIDTH
    p.plot_height = HEATMAP_HEIGHT
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = HEATMAP_AXIS_FONT_SIZE
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = np.pi/4
    p.title.text_color = COLOUR[0]
    p.title.text_font_size = HEATMAP_TITLE_FONT_SIZE

    #one rectangle of 1 by 1 per (sample, assignment) combination
    p.rect("samples", "assignments", 1, 1, source=source,
           color="colors", alpha="alphas", line_color=None)

    p.select_one(HoverTool).tooltips = [
        ('Sample', "@samples"),
        ('Taxon' , "@assignments"),
        ('Number of reads', "@loads"),
    ]

    output_file(OUTPUT_FILE, title=OUTPUT_TITLE)

    show(p)

Execution


In [5]:
#Step 1: combine all result files that match the wildcard into one dataframe
df = create_concatenated_dataframe(FILE_WILDCARD, FOLDER)

#Step 2: draw the heatmap from the columns selected in the 'Variables' block
create_and_show_heatmap(
    df,
    column_classification=COLUMN_CLASSIFICATION,
    column_load=COLUMN_LOAD,
)


On the X-axis are the sample names

On the Y-axis are the viral taxa that have been identified

Mouse-over to see details, including numbers of reads identified for each taxon (i.e. reads mapped to the contigs)

Use the 'save' button on the right to save the image as a PNG file