In [17]:
# Import required modules
# Import the pandas dataframe library
import pandas as pd
# Import the seaborn library for plotting
import seaborn as sns
# Import numerical python for nd-array & calculations
import numpy as np
# Import the plotting framework
import matplotlib.pyplot as plt
# Convert RGB colors to hex colors for portability
from matplotlib.colors import rgb2hex
# K-nearest neighbors cell clustering from Dana Pe'er's lab
import phenograph
# Import interactive modules
from ipywidgets import interact,interactive,fixed
# Import operating system module
import os
# Bokeh - interactive plotting in the browser
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.widgets import Panel, Tabs
from bokeh.layouts import widgetbox
from bokeh.io import output_notebook
# Local file: networkplots.py
import networkplots
import networkx
# Put all the plots directly into the notebook
%matplotlib inline
# Allow bokeh to show plots inside notebook
output_notebook()
In [4]:
# Human-readable label -> matrix file for every input variant used in this
# notebook. The mapping is handed to interact() below as the choices for the
# `inFile` argument, so the pair order here is the order of the choices.
input_dict = dict([
    ("raw filtered", "Test_filtered.mtx"),
    ("lowrank filtered", "Test_filtered_lr_pos.mtx"),
    ("top expressed", "Test_filtered_top.mtx"),
    ("lowrank top expressed", "Test_filtered_lr_pos_top.mtx"),
    ("top highly variable", "Test_filtered_HVtop_100.mtx"),
    ("lowrank top highly variable", "Test_filtered_lr_pos_HVtop_100.mtx"),
])
In [8]:
def hcluster(inFile):
    """Plot a clustered heatmap of row-vs-row rank correlations for one matrix.

    Parameters
    ----------
    inFile : str
        Path to a comma-separated matrix whose first column is the row index.
        According to this notebook's own comments the rows are cells and the
        columns are genes.

    Returns
    -------
    None
        The function only prints the matrix shapes and draws the figure.
        Nothing is returned so that interact() shows just the plot.
    """
    subset = pd.read_csv(inFile, index_col=0)
    # Text before the first "." of the file name; used as the figure title.
    base = os.path.basename(inFile).split(".")[0]

    # After transposing, every column is one original row (one cell).
    subsetT = subset.T
    print("Shape of transposed subset matrix is: ", subsetT.shape)

    # Rank within each column, then take Pearson on the ranks, which is
    # Spearman correlation between the columns of the transposed matrix.
    subsetTrk = subsetT.rank(axis=0)
    subsetCorr = subsetTrk.corr(method='pearson')
    print("Shape of correlation matrix is: ", subsetCorr.shape)

    # NOTE(review): SciPy documents 'ward' linkage as correctly defined only
    # for Euclidean distances -- confirm that metric='cityblock' is intended.
    grid = sns.clustermap(subsetCorr, method='ward', metric='cityblock',
                          figsize=(4, 4))
    # clustermap builds its own multi-axes figure, so plt.title() would label
    # whichever axes happens to be current. Title the grid explicitly, above
    # the column dendrogram at the top of the figure.
    grid.ax_col_dendrogram.set_title(base)
In [9]:
# Interactive front end for hcluster(): the entries of input_dict are offered
# as the choices for `inFile`, and hcluster is re-run for the selected file.
interact(hcluster, inFile=input_dict)
Out[9]:
In [2]:
def correlation_to_distance(correlations):
    """Convert a similarity in [-1, 1] to the metric distance sqrt(2 * (1 - r)).

    Parameters
    ----------
    correlations : scalar, numpy.ndarray or pandas.DataFrame
        Correlation value(s). Array-like inputs are converted element-wise
        and a DataFrame keeps its index and columns.

    Returns
    -------
    Same kind of object as the input
        0 for r = 1, sqrt(2) for r = 0 and 2 for r = -1.
    """
    # Floating-point round-off can leave a correlation marginally above 1
    # (typically a self-correlation on the diagonal). That makes the radicand
    # slightly negative and the square root NaN, so clamp it at zero first.
    radicand = np.maximum(2 * (1 - correlations), 0)
    return np.sqrt(radicand)
# Settings used by plot_graph() below (plot_graph comes from @olgabot's
# networkplots.py).
# Comma-separated names of the Bokeh tools shown in the plot's toolbar.
TOOLS = (
    "crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,"
    "tap,save,box_select,poly_select,lasso_select"
)
# Column names holding the x / y coordinates in the node and edge sources.
X_COL, Y_COL = 'xs', 'ys'
def plot_graph(nodes_source, edges_source, legend_col, color_col, title,
               tab=False):
    """Render a network (cells as circles, edges as lines) with Bokeh.

    Parameters
    ----------
    nodes_source
        Data source for the circles. The X_COL / Y_COL columns, `legend_col`,
        `color_col` and the `barcode` column (shown in the tooltip) are read
        from it.
    edges_source
        Data source for the edge lines. The X_COL / Y_COL columns and an
        `alphas` column are read from it.
    legend_col : str
        Column of `nodes_source` used for the legend and the "Group" tooltip.
    color_col : str
        Column of `nodes_source` holding each node's colour.
    title : str
        Title of the figure (and of the Panel when `tab` is true).
    tab : bool, optional
        If true, wrap the figure in a Panel so it can be placed inside Tabs.

    Returns
    -------
    The Bokeh figure, or a Panel containing it when `tab` is true.
    """
    # Restricting the hover tool to the renderer named 'cell' (the circles
    # drawn below) keeps tooltips off the edge lines.
    cell_hover = HoverTool(
        names=['cell'],
        tooltips=[
            ("Cell Barcode", "@barcode"),
            ("Group", "@{}".format(legend_col)),
        ],
    )
    plot = figure(tools=[TOOLS, cell_hover], plot_height=500, plot_width=750,
                  title=title)

    # Edges first so the node glyphs are drawn on top of them.
    plot.multi_line(X_COL, Y_COL, source=edges_source,
                    line_width=1.5, alpha='alphas', color='black')
    plot.circle(X_COL, Y_COL, source=nodes_source, name='cell',
                size=10, color=color_col, legend=legend_col,
                fill_alpha=0.5, muted_alpha=0.2,
                hover_alpha=1, hover_fill_color=color_col)

    # Clicking a legend entry mutes that group instead of hiding it.
    plot.legend.click_policy = "mute"

    return Panel(child=plot, title=title) if tab else plot
def KNN(inFile, k = 20):
    """Cluster the rows of a matrix with PhenoGraph and plot the network.

    Steps: read the matrix, compute rank correlations between its rows,
    convert them to distances, run phenograph.cluster, lay the resulting
    graph out with networkx, show it with Bokeh, and save the node table.

    Parameters
    ----------
    inFile : str
        Path to a comma-separated matrix whose first column is the row index
        (reported by the function as a cell-by-gene matrix).
    k : int, optional
        Passed to phenograph.cluster as its second argument. The original
        author suggests trying a smaller value after downsampling.

    Returns
    -------
    tuple
        (nodes_specs, edges_specs) as produced by networkplots.

    Side effects
    ------------
    Draws a histogram of the distances, shows the Bokeh figure, and writes
    the node table to "<base>_k<k>-KNN_nodes.txt" in the working directory,
    where <base> is the file name up to its first ".".
    """
    expression = pd.read_csv(inFile, index_col=0)
    n_cells, n_genes = expression.shape
    print("This is a %d cell by %d gene matrix" % (n_cells, n_genes))
    stem = os.path.basename(inFile).split(".")[0]

    # Rank within each column of the transposed matrix, then correlate those
    # columns, giving one correlation per pair of original rows.
    ranked = expression.T.rank()
    correlations = ranked.corr()
    print("The shape of correlation matrix is: ", correlations.shape)

    distance = correlation_to_distance(correlations)
    sns.distplot(distance.values.flat, axlabel="Distance")

    # Build communities; the third return value is not used here.
    communities, sparse_matrix, _ = phenograph.cluster(distance, k)

    # Turn the sparse matrix into a graph and compute 2-D node positions.
    # No seed is passed to spring_layout, so the layout is presumably not
    # identical from run to run.
    graph = networkx.from_scipy_sparse_matrix(sparse_matrix)
    positions = networkx.spring_layout(graph)

    # Placeholder metadata (index only) for when no earlier clustering
    # results exist; node/edge tables come from @olgabot's networkplots.
    empty_cell_metadata = pd.DataFrame(index=expression.index)
    nodes_specs = networkplots.get_nodes_specs(
        positions, empty_cell_metadata, distance.index,
        communities, palette='Set2')
    edges_specs = networkplots.get_edges_specs(graph, positions)

    fig = plot_graph(ColumnDataSource(nodes_specs),
                     ColumnDataSource(edges_specs),
                     legend_col='community', color_col='community_color',
                     tab=False, title='KNN Clustering')
    show(fig)

    # Save the cluster assignment for later merging.
    nodes_specs.to_csv("{}_k{}-KNN_nodes.txt".format(stem, k))
    return nodes_specs, edges_specs
In [5]:
# Interactive front end for KNN(): choose the input file from input_dict and
# a value of k taken from the (1, 100) range.
interact(KNN, inFile = input_dict, k = (1,100))
Out[5]:
In [15]:
# Combine the per-input clustering results into one metadata table.
# The table was prepared outside this notebook: the 'barcode' and 'community'
# columns of each saved nodes_specs file were merged on 'barcode' with
# pd.merge(), and the result was saved as "Test_cell_communities.txt".
# Read in the combined metadata; the first column of the file becomes the index.
dffinal = pd.read_csv("Test_cell_communities.txt",index_col=0)
print(dffinal.shape)
dffinal.head()
Out[15]:
In [18]:
# Helper that assigns a colour to each community label.
# Adapted from @olgabot's networkplots.py.
def labels_to_colors(labels, palette=None):
    """Map every label in `labels` to a hex colour, one colour per category.

    The distinct labels are sorted, paired in that order with the colours of
    a seaborn palette holding as many colours as there are categories, and
    each RGB colour is converted to a hex string.

    Parameters
    ----------
    labels : sequence
        Labels to colour; may contain repeats.
    palette : optional
        Forwarded unchanged to sns.color_palette.

    Returns
    -------
    list
        One hex colour per element of `labels`, in the same order.
    """
    unique_labels = sorted(np.unique(labels))
    swatches = sns.color_palette(palette, n_colors=len(unique_labels))
    hex_by_label = {
        label: rgb2hex(swatch)
        for label, swatch in zip(unique_labels, swatches)
    }
    return [hex_by_label[label] for label in labels]
# For each community label column: count the distinct labels, take that many
# colours from the palette, convert them to hex codes, and store the
# per-row colour in a new "<column>_colors" column (see labels_to_colors).
for x in dffinal.columns:
    # Skip the merge key. Also skip colour columns left by an earlier run of
    # this cell, otherwise re-running it would add "<column>_colors_colors".
    if x == "barcode" or x.endswith("_colors"):
        continue
    color_col = x + "_colors"
    dffinal[color_col] = labels_to_colors(dffinal[x])
In [21]:
# Preview the metadata table, which now also carries the "<column>_colors" columns.
dffinal.head()
Out[21]:
In [22]:
# Run the clustering once, non-interactively, on the raw filtered matrix with
# k = 20 and keep the node and edge tables for the combined plot below.
nodes_spec, edges_spec = KNN("Test_filtered.mtx", k = 20)
print(nodes_spec.shape, edges_spec.shape)
In [7]:
# Preview the node table returned by KNN().
nodes_spec.head()
Out[7]:
In [23]:
# Attach the community labels and colours from dffinal to the node table.
# pd.merge defaults to an inner join, so only barcodes present in both
# frames are kept -- compare the printed shape with nodes_spec.shape.
data = pd.merge(nodes_spec, dffinal, on='barcode')
print(data.shape)
data.head()
Out[23]:
In [24]:
# Wrap the merged node table and the edge table as Bokeh data sources; the
# same two sources are shared by every tab plotted below.
nodes_source = ColumnDataSource(data)
edges_source = ColumnDataSource(edges_spec)
In [25]:
# Show each clustering as its own tab of one Bokeh widget. Every tab draws
# the same network and differs only in the columns used for legend and colour.
# Each entry: (legend column, colour column, tab title).
# NOTE(review): the pairing of F-columns with titles is copied as-is from the
# original cell -- confirm it against how Test_cell_communities.txt was built.
tab_specs = [
    ('community', 'community_color', 'KNN Clustering'),
    ('F3', 'F3_colors', "lowrank"),
    ('F5', 'F5_colors', "TopExpressed"),
    ('F4', 'F4_colors', "TopExpressed_lowrank"),
    ('F1', 'F1_colors', "HighlyVariable"),
]
tab1, tab2, tab3, tab4, tab5 = [
    plot_graph(nodes_source, edges_source,
               legend_col=legend, color_col=color,
               tab=True, title=tab_title)
    for legend, color, tab_title in tab_specs
]
tabs = Tabs(tabs=[tab1, tab2, tab3, tab4, tab5])
show(tabs)
In [ ]: