Group / separate cells based on their pairwise distances

  • For cell number < 500: Hierarchical clustering -> Rank genes in each cell based their expression level, then calculate Pearson correlation in cell-cell pairs. Plot heatmap based on the correlations
  • For cell number > 500: Convert correlation to distance, then cluster cells by k-nearest neighbors

Hierarchical clustering based on Spearman correlation between cells


In [17]:
# Import required modules
# Import the pandas dataframe library
import pandas as pd

# Import the seaborn library for plotting
import seaborn as sns

# Import numerical python for nd-array & calculations
import numpy as np
# Import the plotting framework
import matplotlib.pyplot as plt
# Convert RGB colors to hex colors for portability
from matplotlib.colors import rgb2hex
# K-nearest neighbors cell clustering from Dana Pe'er's lab
import phenograph
# Import interactive modules
from ipywidgets import interact,interactive,fixed
# Import operating system module
import os

# Bokeh - interactive plotting in the browser
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.widgets import Panel, Tabs
from bokeh.layouts import widgetbox
from bokeh.io import output_notebook

# Local file: networkplots.py
import networkplots
import networkx
# Put all the plots directly into the notebook
%matplotlib inline

# Allow bokeh to show plots inside notebook
output_notebook()


Loading BokehJS ...

In [4]:
# Setup a dictionary with all input files to be used
input_dict = {
    "raw filtered":"Test_filtered.mtx",
    "lowrank filtered":"Test_filtered_lr_pos.mtx",
    "top expressed":"Test_filtered_top.mtx",
    "lowrank top expressed":"Test_filtered_lr_pos_top.mtx",
    "top highly variable":"Test_filtered_HVtop_100.mtx",
    "lowrank top highly variable":"Test_filtered_lr_pos_HVtop_100.mtx"
}

In [8]:
def hcluster(inFile):
    subset = pd.read_table(inFile, sep=',',index_col=0)
    # Get basename of the file
    base = os.path.basename(inFile).split(sep=".")[0]
    # Transpose data matrix to prepare it for correlation calculation
    subsetT = subset.T
    print("Shape of transposed subset matrix is: ", subsetT.shape)
    # Rank genes in each cell
    subsetTrk = subsetT.rank(axis=0)
    # Calculate Pearson correlation using the ranked data matrix (Pearson on ranked matrix == Spearman)
    subsetCorr = subsetTrk.corr(method='pearson')
    print("Shape of correlation matrix is: ", subsetCorr.shape)
    # Plot correlation as heatmap
    sns.clustermap(subsetCorr,method='ward',metric='cityblock',figsize=(4,4))
    plt.title(base)
    return

In [9]:
interact(hcluster, inFile=input_dict)


Out[9]:
<function __main__.hcluster>

k-nearest neighbors

  • Spearman correlation -> distance matrix -> knn

In [2]:
def correlation_to_distance(correlations):
    """Convert -1 to 1 similarity measure to a metric distance"""
    return np.sqrt(2*(1-correlations))

# plot_graph() from @olgabot networkplots.py
# Bokeh tools in sidebar
TOOLS = "crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset," \
        "tap,save,box_select,poly_select,lasso_select"
X_COL = 'xs'
Y_COL = 'ys'
def plot_graph(nodes_source, edges_source, legend_col, color_col, title,
               tab=False):
    """Draw a network in 2d space using Bokeh"""

    # names=['cell'] matches p1.circle(... name='cell' ...) so that the
    # hover tooltip only shows up on the cells, not on the edges
    hover1 = HoverTool(names=['cell'], tooltips=[
        ("Cell Barcode", "@barcode"),
        ("Group", f"@{legend_col}"),
    ])

    p1 = figure(tools=[TOOLS, hover1], plot_height=500, plot_width=750,
                title=title)
    edges = p1.multi_line(X_COL, Y_COL, line_width=1.5,
                          alpha='alphas', color='black',
                          source=edges_source)
    nodes = p1.circle(X_COL, Y_COL, size=10,
                      fill_alpha=0.5, hover_alpha=1,
                      hover_fill_color=color_col,
                      muted_alpha=0.2,
                      source=nodes_source, legend=legend_col,
                      color=color_col, name='cell')
    p1.legend.click_policy = "mute"
    if tab:
        tab1 = Panel(child=p1, title=title)
        return tab1
    else:
        return p1

def KNN(inFile, k = 20):
    # Read in file
    mtx = pd.read_csv(inFile,index_col=0)
    nrow, ncol = mtx.shape
    print("This is a %d cell by %d gene matrix" % (nrow, ncol))
    # Get basename of the file
    base = os.path.basename(inFile).split(sep=".")[0]
    # Calculate correlation
    correlations = mtx.T.rank().corr()
    print("The shape of correlation matrix is: ", correlations.shape)
    # Convert correlation to distance
    distance = correlation_to_distance(correlations)
    sns.distplot(distance.values.flat,axlabel="Distance")
    # Build communities. Notice the k setting. Try smaller if it is after downsampling
    communities, sparse_matrix, Q = phenograph.cluster(distance, k)
    # Create networkx plots from sparse matrix
    graph = networkx.from_scipy_sparse_matrix(sparse_matrix)
    # Plot the graph made by networkx
    # networkx.draw(graph)
    # Obtain data point coordinates from graph
    positions = networkx.spring_layout(graph)
    # Make a temporary cell metadata (when there is not existing clustering results)
    empty_cell_metadata = pd.DataFrame(index=mtx.index)
    # Get nodes and edges. Using networkplots from @olgabot
    nodes_specs = networkplots.get_nodes_specs(
        positions, empty_cell_metadata, distance.index, 
        communities, palette='Set2')
    edges_specs = networkplots.get_edges_specs(graph, positions)
    # 
    nodes_source = ColumnDataSource(nodes_specs)
    edges_source = ColumnDataSource(edges_specs)
    fig = plot_graph(nodes_source, edges_source, 
                                  legend_col='community', color_col='community_color',
                                  tab=False, title='KNN Clustering')
    show(fig)
    # Save the cluster info
    outname=base+"_k"+str(k)+"-KNN_nodes.txt"
    nodes_specs.to_csv(outname)
    return nodes_specs, edges_specs

In [5]:
interact(KNN, inFile = input_dict, k = (1,100))


Out[5]:
<function __main__.KNN>
  • Are the clustering results the same?
    • Make a multi-color labeling matrix.
    • Combine metadata from all other clustering results with the coordinates from the original filtered dataset.
    • Plot the points, and show color difference with different clustering.

In [15]:
# Combine metadata info, and assign colors.
## Take 'barcode' and 'community' column from each saved nodes_specs file, and merge by 'barcode' column using pd.merge()
## Saved combined metadata as "Test_cell_communities.txt"
## Read in te metadata
dffinal = pd.read_csv("Test_cell_communities.txt",index_col=0)
print(dffinal.shape)
dffinal.head()


(135, 6)
Out[15]:
barcode F1 F2 F3 F4 F5
0 ATCTTGGCGGCG Community #0 Community #-1 Community #0 Community #0 Community #0
1 ATAGAGTCGCAT Community #0 Community #-1 Community #0 Community #0 Community #0
2 GCGCACGTTGAG Community #0 Community #-1 Community #0 Community #0 Community #0
3 TTCACGGCTGAG Community #0 Community #-1 Community #0 Community #0 Community #0
4 GCGAGACACAGC Community #0 Community #-1 Community #0 Community #0 Community #0

In [18]:
# Assign color to each of the community columns.
# From @olgabot networkplots.py
def labels_to_colors(labels, palette=None):
    """Convert a column of labels into categorical colors"""
    categories = sorted(np.unique(labels))
    rgb = sns.color_palette(palette, n_colors=len(categories))
    colors = map(rgb2hex, rgb)
    colormap = dict(zip(categories, colors))
    return [colormap[x] for x in labels]
# For each community label column, get the total number of colors required,
# take that number of colors out from color palette, 
# convert the RGB color to hex code,
# make dictionary using the unique community label as key and the hex code color as value.
for x in dffinal.columns:
    if x != "barcode":
        color_col = x + "_colors"
        dffinal[color_col] = labels_to_colors(dffinal[x])

In [21]:
dffinal.head()


Out[21]:
barcode F1 F2 F3 F4 F5 F1_colors F2_colors F3_colors F4_colors F5_colors
0 ATCTTGGCGGCG Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
1 ATAGAGTCGCAT Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
2 GCGCACGTTGAG Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
3 TTCACGGCTGAG Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
4 GCGAGACACAGC Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0

In [22]:
nodes_spec, edges_spec = KNN("Test_filtered.mtx", k = 20)
print(nodes_spec.shape, edges_spec.shape)


This is a 135 cell by 3185 gene matrix
The shape of correlation matrix is:  (135, 135)
Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10786604881286621 seconds
Jaccard graph constructed in 0.02614116668701172 seconds
Wrote graph to binary file in 0.004717111587524414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.652091
Louvain completed 21 runs in 0.25792908668518066 seconds
PhenoGraph complete in 0.40439796447753906 seconds
(135, 5) (1813, 3)

In [7]:
nodes_spec.head()


Out[7]:
xs ys community barcode community_color
97 0.194489 0.261227 Community #-1 ATCTTGGCGGCG #66c2a5
48 0.020398 0.626423 Community #-1 ATAGAGTCGCAT #66c2a5
56 0.028661 0.668916 Community #-1 GCGCACGTTGAG #66c2a5
5 0.047751 0.680641 Community #-1 TTCACGGCTGAG #66c2a5
121 0.020171 0.648142 Community #-1 GCGAGACACAGC #66c2a5

In [23]:
data = pd.merge(nodes_spec, dffinal, on='barcode')
print(data.shape)
data.head()


(135, 15)
Out[23]:
xs ys community barcode community_color F1 F2 F3 F4 F5 F1_colors F2_colors F3_colors F4_colors F5_colors
0 0.280661 0.951813 Community #-1 ATCTTGGCGGCG #66c2a5 Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
1 0.586076 0.953690 Community #-1 ATAGAGTCGCAT #66c2a5 Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
2 0.547560 0.936233 Community #-1 GCGCACGTTGAG #66c2a5 Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
3 0.461449 1.000000 Community #-1 TTCACGGCTGAG #66c2a5 Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0
4 0.544097 0.895329 Community #-1 GCGAGACACAGC #66c2a5 Community #0 Community #-1 Community #0 Community #0 Community #0 #4c72b0 #4c72b0 #4c72b0 #4c72b0 #4c72b0

In [24]:
nodes_source = ColumnDataSource(data)
edges_source = ColumnDataSource(edges_spec)

In [25]:
# Plot different clustering as tabs in bokeh
tab1 = plot_graph(nodes_source, edges_source, 
                               legend_col='community',
                  color_col='community_color', tab=True,
                  title='KNN Clustering')

tab2 = plot_graph(nodes_source, edges_source,
                  legend_col='F3', tab=True,
                  color_col='F3_colors',
                  title="lowrank")

tab3 = plot_graph(nodes_source, edges_source,
                  legend_col='F5', tab=True,
                  color_col='F5_colors',
                  title="TopExpressed")

tab4 = plot_graph(nodes_source, edges_source,
                  legend_col='F4', tab=True,
                  color_col='F4_colors',
                  title="TopExpressed_lowrank")

tab5 = plot_graph(nodes_source, edges_source,
                  legend_col='F1', tab=True,
                  color_col='F1_colors',
                  title="HighlyVariable")

tabs = Tabs(tabs=[tab1, tab2, tab3, tab4, tab5])
show(tabs)


  • Are the cell clustering largely determined by detection rate?

In [ ]: