Date: 2018-05-16
Author: Sam Nooij
Modify the variables in the 'Variables' code block to the desired values, then run all cells.
N.B. This notebook requires Jupyter Notebook with a Python 2 kernel, and the Python packages numpy, pandas and bokeh.
In [1]:
#Import all required libraries
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.io import output_notebook
import glob
In [2]:
#Configure bokeh to display plots inline in the notebook when show() is called.
output_notebook()
In [3]:
#Variables: edit the values in this cell before running the notebook.

#--- Input ---
#Folder that holds the result tables and the wildcard pattern that selects them.
FOLDER = "../results/"
FILE_WILDCARD = "*_results.csv"
#Names of the table columns that hold the classification and the read count.
COLUMN_CLASSIFICATION = "Assignment"
COLUMN_LOAD = "Mapped # Reads"
#Name of the classification method; it is inserted into the titles below.
METHOD = "Genome Detective"

#--- Heatmap appearance ---
HEATMAP_TITLE = "Viruses classified by %s" % (METHOD,)
#Make sure to adjust the width and height to a size that fits your data.
HEATMAP_WIDTH = 700
HEATMAP_HEIGHT = 500
HEATMAP_TITLE_FONT_SIZE = "16pt"
HEATMAP_AXIS_FONT_SIZE = "12pt"
#Colour of the heatmap cells and the title (a one-item list).
#Selected from coffee beans: http://s.eatthis-cdn.com/media/images/ext/851818315/coffee-beans.jpg
COLOUR = ["#6b2d18"]

#--- Output ---
#OUTPUT_FILE: directory path + file name of the html file to save.
#OUTPUT_TITLE: title of that file; it appears in the top bar
# of your browser window.
OUTPUT_FILE = "../results/test_heatmap.html"
OUTPUT_TITLE = "Heatmap of viruses classified by %s" % (METHOD,)
In [4]:
#Define the required functions
def create_concatenated_dataframe(file_wildcard, folder):
    """
    Read every matching results file and combine them into one dataframe.

    Input: a filename with wildcard (e.g. "*_results.csv"),
           and a folder name (with or without a trailing slash)
    Output: One concatenated dataframe of all the input files, without
            the "Contigs" column and with an extra "sample" column that
            holds the part of the file name before the wildcard's suffix
    Raises: ValueError when no file in the folder matches the wildcard
    """
    import os.path #local import, so the function does not depend on other cells for it

    #Step 1: create a sorted list of the files using glob
    #(os.path.join also works when the folder has no trailing slash)
    file_list = sorted(glob.glob(os.path.join(folder, file_wildcard)))
    if not file_list:
        #fail early with a clear message instead of pandas' "No objects to concatenate"
        raise ValueError("No files matching '%s' found in folder '%s'"
                         % (file_wildcard, folder))
    file_suffix = file_wildcard.lstrip("*")

    #Step 2: open the files as dataframe, remove "Contigs" column and add sample IDs
    df_list = []
    for results_file in file_list:
        results_df = pd.read_csv(results_file)
        #remove unnecessary (and long!) column; tolerate files that lack it
        results_df = results_df.drop("Contigs", axis=1, errors="ignore")
        #use basename rather than splitting on '/', so Windows paths work too
        file_name = os.path.basename(results_file)
        sample_id = file_name[:file_name.index(file_suffix)]
        results_df["sample"] = sample_id
        df_list.append(results_df)

    #Step 3: concatenate the dataframes
    return pd.concat(df_list, ignore_index=True)
def create_and_show_heatmap(dataframe, column_classification, column_load):
    """
    Draw a sample-by-classification heatmap, shaded by read count.

    Input: Dataframe (must contain a "sample" column), names of columns:
        'classification'
        'load'
    Output: heatmap (shown in the notebook and
        exported to the html file defined as variable above)
    Raises: ValueError when the dataframe has no rows, or when a load
        value cannot be converted to an integer

    The appearance and output location are read from the module-level
    variables COLOUR, HEATMAP_TITLE, HEATMAP_WIDTH, HEATMAP_HEIGHT,
    HEATMAP_TITLE_FONT_SIZE, HEATMAP_AXIS_FONT_SIZE, OUTPUT_FILE and
    OUTPUT_TITLE.
    """
    samples = dataframe["sample"]
    assignments = dataframe[column_classification]
    loads = dataframe[column_load]

    #a fix for numbers with tildes, e.g. '~80'
    numeric_loads = []
    for value in loads:
        if isinstance(value, str):
            value = value.strip('~')
        numeric_loads.append(int(value))

    if not numeric_loads:
        #fail with a clear message instead of "max() arg is an empty sequence"
        raise ValueError("Cannot draw a heatmap: the dataframe contains no rows")

    colors = len(loads) * COLOUR #multiply to make an equally long list

    #Scale the opacity with the load: 0.1 for no reads, up to 1.0 for the maximum.
    max_load = max(numeric_loads)
    if max_load == 0:
        #all loads are zero: avoid dividing by zero, use the minimum opacity
        alphas = [0.1] * len(numeric_loads)
    else:
        alphas = [min(x / float(max_load), 0.9) + 0.1 for x in numeric_loads]

    #the original (unstripped) loads are kept for display in the hover tooltip
    source = ColumnDataSource(
        data = dict(samples=samples, assignments=assignments, colors=colors, loads=loads, alphas=alphas)
    )

    TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

    p = figure(title = HEATMAP_TITLE,
               x_range = sorted(set(samples)),
               y_range = sorted(set(assignments), reverse=True), #reverse to order 'from top to bottom'
               x_axis_location = "above",
               toolbar_location="right",
               tools = TOOLS)

    p.plot_width = HEATMAP_WIDTH
    p.plot_height = HEATMAP_HEIGHT
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = HEATMAP_AXIS_FONT_SIZE
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = np.pi/4 #rotate the sample names by 45 degrees
    p.title.text_color = COLOUR[0]
    p.title.text_font_size = HEATMAP_TITLE_FONT_SIZE

    #one 1x1 rectangle per dataframe row, at (sample, assignment)
    p.rect("samples", "assignments", 1, 1, source=source,
           color="colors", alpha="alphas", line_color=None)

    p.select_one(HoverTool).tooltips = [
        ('Sample', "@samples"),
        ('Taxon' , "@assignments"),
        ('Number of reads', "@loads"),
    ]

    output_file(OUTPUT_FILE, title=OUTPUT_TITLE)
    show(p)
In [5]:
#Combine all result files that match the wildcard into one dataframe.
df = create_concatenated_dataframe(
    folder=FOLDER,
    file_wildcard=FILE_WILDCARD
)

#Draw the heatmap from the combined dataframe.
create_and_show_heatmap(
    dataframe=df,
    column_load=COLUMN_LOAD,
    column_classification=COLUMN_CLASSIFICATION
)