Group / separate cells based on their pairwise distances

For cell number < 500: Hierarchical clustering -> Rank genes in each cell based their expression level, then calculate Pearson correlation in cell-cell pairs. Plot heatmap based on the correlations
For cell number > 500: Convert correlation to distance, then cluster cells by k-nearest neighbors

Hierarchical clustering based on Spearman correlation between cells



In [17]:

    
# Import required modules
# Import the pandas dataframe library
import pandas as pd

# Import the seaborn library for plotting
import seaborn as sns

# Import numerical python for nd-array & calculations
import numpy as np
# Import the plotting framework
import matplotlib.pyplot as plt
# Convert RGB colors to hex colors for portability
from matplotlib.colors import rgb2hex
# K-nearest neighbors cell clustering from Dana Pe'er's lab
import phenograph
# Import interactive modules
from ipywidgets import interact,interactive,fixed
# Import operating system module
import os

# Bokeh - interactive plotting in the browser
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.widgets import Panel, Tabs
from bokeh.layouts import widgetbox
from bokeh.io import output_notebook

# Local file: networkplots.py
import networkplots
import networkx
# Put all the plots directly into the notebook
%matplotlib inline

# Allow bokeh to show plots inside notebook
output_notebook()









    





    
        
        Loading BokehJS ...



In [4]:

    
# Setup a dictionary with all input files to be used
input_dict = {
    "raw filtered":"Test_filtered.mtx",
    "lowrank filtered":"Test_filtered_lr_pos.mtx",
    "top expressed":"Test_filtered_top.mtx",
    "lowrank top expressed":"Test_filtered_lr_pos_top.mtx",
    "top highly variable":"Test_filtered_HVtop_100.mtx",
    "lowrank top highly variable":"Test_filtered_lr_pos_HVtop_100.mtx"
}



In [8]:

    
def hcluster(inFile):
    subset = pd.read_table(inFile, sep=',',index_col=0)
    # Get basename of the file
    base = os.path.basename(inFile).split(sep=".")[0]
    # Transpose data matrix to prepare it for correlation calculation
    subsetT = subset.T
    print("Shape of transposed subset matrix is: ", subsetT.shape)
    # Rank genes in each cell
    subsetTrk = subsetT.rank(axis=0)
    # Calculate Pearson correlation using the ranked data matrix (Pearson on ranked matrix == Spearman)
    subsetCorr = subsetTrk.corr(method='pearson')
    print("Shape of correlation matrix is: ", subsetCorr.shape)
    # Plot correlation as heatmap
    sns.clustermap(subsetCorr,method='ward',metric='cityblock',figsize=(4,4))
    plt.title(base)
    return



In [9]:

    
interact(hcluster, inFile=input_dict)









    





 
 










    Out[9]:





<function __main__.hcluster>

k-nearest neighbors

Spearman correlation -> distance matrix -> knn



In [2]:

    
def correlation_to_distance(correlations):
    """Convert -1 to 1 similarity measure to a metric distance"""
    return np.sqrt(2*(1-correlations))

# plot_graph() from @olgabot networkplots.py
# Bokeh tools in sidebar
TOOLS = "crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset," \
        "tap,save,box_select,poly_select,lasso_select"
X_COL = 'xs'
Y_COL = 'ys'
def plot_graph(nodes_source, edges_source, legend_col, color_col, title,
               tab=False):
    """Draw a network in 2d space using Bokeh"""

    # names=['cell'] matches p1.circle(... name='cell' ...) so that the
    # hover tooltip only shows up on the cells, not on the edges
    hover1 = HoverTool(names=['cell'], tooltips=[
        ("Cell Barcode", "@barcode"),
        ("Group", f"@{legend_col}"),
    ])

    p1 = figure(tools=[TOOLS, hover1], plot_height=500, plot_width=750,
                title=title)
    edges = p1.multi_line(X_COL, Y_COL, line_width=1.5,
                          alpha='alphas', color='black',
                          source=edges_source)
    nodes = p1.circle(X_COL, Y_COL, size=10,
                      fill_alpha=0.5, hover_alpha=1,
                      hover_fill_color=color_col,
                      muted_alpha=0.2,
                      source=nodes_source, legend=legend_col,
                      color=color_col, name='cell')
    p1.legend.click_policy = "mute"
    if tab:
        tab1 = Panel(child=p1, title=title)
        return tab1
    else:
        return p1

def KNN(inFile, k = 20):
    # Read in file
    mtx = pd.read_csv(inFile,index_col=0)
    nrow, ncol = mtx.shape
    print("This is a %d cell by %d gene matrix" % (nrow, ncol))
    # Get basename of the file
    base = os.path.basename(inFile).split(sep=".")[0]
    # Calculate correlation
    correlations = mtx.T.rank().corr()
    print("The shape of correlation matrix is: ", correlations.shape)
    # Convert correlation to distance
    distance = correlation_to_distance(correlations)
    sns.distplot(distance.values.flat,axlabel="Distance")
    # Build communities. Notice the k setting. Try smaller if it is after downsampling
    communities, sparse_matrix, Q = phenograph.cluster(distance, k)
    # Create networkx plots from sparse matrix
    graph = networkx.from_scipy_sparse_matrix(sparse_matrix)
    # Plot the graph made by networkx
    # networkx.draw(graph)
    # Obtain data point coordinates from graph
    positions = networkx.spring_layout(graph)
    # Make a temporary cell metadata (when there is not existing clustering results)
    empty_cell_metadata = pd.DataFrame(index=mtx.index)
    # Get nodes and edges. Using networkplots from @olgabot
    nodes_specs = networkplots.get_nodes_specs(
        positions, empty_cell_metadata, distance.index, 
        communities, palette='Set2')
    edges_specs = networkplots.get_edges_specs(graph, positions)
    # 
    nodes_source = ColumnDataSource(nodes_specs)
    edges_source = ColumnDataSource(edges_specs)
    fig = plot_graph(nodes_source, edges_source, 
                                  legend_col='community', color_col='community_color',
                                  tab=False, title='KNN Clustering')
    show(fig)
    # Save the cluster info
    outname=base+"_k"+str(k)+"-KNN_nodes.txt"
    nodes_specs.to_csv(outname)
    return nodes_specs, edges_specs



In [5]:

    
interact(KNN, inFile = input_dict, k = (1,100))









    





 
 










    Out[5]:





<function __main__.KNN>

Are the clustering results the same?
- Make a multi-color labeling matrix.
- Combine metadata from all other clustering results with the coordinates from the original filtered dataset.
- Plot the points, and show color difference with different clustering.



In [15]:

    
# Combine metadata info, and assign colors.
## Take 'barcode' and 'community' column from each saved nodes_specs file, and merge by 'barcode' column using pd.merge()
## Saved combined metadata as "Test_cell_communities.txt"
## Read in te metadata
dffinal = pd.read_csv("Test_cell_communities.txt",index_col=0)
print(dffinal.shape)
dffinal.head()









    



(135, 6)






    Out[15]:







  
    
      
      barcode
      F1
      F2
      F3
      F4
      F5
    
  
  
    
      0
      ATCTTGGCGGCG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
    
    
      1
      ATAGAGTCGCAT
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
    
    
      2
      GCGCACGTTGAG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
    
    
      3
      TTCACGGCTGAG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
    
    
      4
      GCGAGACACAGC
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0



In [18]:

    
# Assign color to each of the community columns.
# From @olgabot networkplots.py
def labels_to_colors(labels, palette=None):
    """Convert a column of labels into categorical colors"""
    categories = sorted(np.unique(labels))
    rgb = sns.color_palette(palette, n_colors=len(categories))
    colors = map(rgb2hex, rgb)
    colormap = dict(zip(categories, colors))
    return [colormap[x] for x in labels]
# For each community label column, get the total number of colors required,
# take that number of colors out from color palette, 
# convert the RGB color to hex code,
# make dictionary using the unique community label as key and the hex code color as value.
for x in dffinal.columns:
    if x != "barcode":
        color_col = x + "_colors"
        dffinal[color_col] = labels_to_colors(dffinal[x])



In [21]:

    
dffinal.head()









    Out[21]:







  
    
      
      barcode
      F1
      F2
      F3
      F4
      F5
      F1_colors
      F2_colors
      F3_colors
      F4_colors
      F5_colors
    
  
  
    
      0
      ATCTTGGCGGCG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      1
      ATAGAGTCGCAT
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      2
      GCGCACGTTGAG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      3
      TTCACGGCTGAG
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      4
      GCGAGACACAGC
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0



In [22]:

    
nodes_spec, edges_spec = KNN("Test_filtered.mtx", k = 20)
print(nodes_spec.shape, edges_spec.shape)









    



This is a 135 cell by 3185 gene matrix
The shape of correlation matrix is:  (135, 135)
Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10786604881286621 seconds
Jaccard graph constructed in 0.02614116668701172 seconds
Wrote graph to binary file in 0.004717111587524414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.652091
Louvain completed 21 runs in 0.25792908668518066 seconds
PhenoGraph complete in 0.40439796447753906 seconds






    






    
        
    







    



(135, 5) (1813, 3)



In [7]:

    
nodes_spec.head()









    Out[7]:







  
    
      
      xs
      ys
      community
      barcode
      community_color
    
  
  
    
      97
      0.194489
      0.261227
      Community #-1
      ATCTTGGCGGCG
      #66c2a5
    
    
      48
      0.020398
      0.626423
      Community #-1
      ATAGAGTCGCAT
      #66c2a5
    
    
      56
      0.028661
      0.668916
      Community #-1
      GCGCACGTTGAG
      #66c2a5
    
    
      5
      0.047751
      0.680641
      Community #-1
      TTCACGGCTGAG
      #66c2a5
    
    
      121
      0.020171
      0.648142
      Community #-1
      GCGAGACACAGC
      #66c2a5



In [23]:

    
data = pd.merge(nodes_spec, dffinal, on='barcode')
print(data.shape)
data.head()









    



(135, 15)






    Out[23]:







  
    
      
      xs
      ys
      community
      barcode
      community_color
      F1
      F2
      F3
      F4
      F5
      F1_colors
      F2_colors
      F3_colors
      F4_colors
      F5_colors
    
  
  
    
      0
      0.280661
      0.951813
      Community #-1
      ATCTTGGCGGCG
      #66c2a5
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      1
      0.586076
      0.953690
      Community #-1
      ATAGAGTCGCAT
      #66c2a5
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      2
      0.547560
      0.936233
      Community #-1
      GCGCACGTTGAG
      #66c2a5
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      3
      0.461449
      1.000000
      Community #-1
      TTCACGGCTGAG
      #66c2a5
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
    
    
      4
      0.544097
      0.895329
      Community #-1
      GCGAGACACAGC
      #66c2a5
      Community #0
      Community #-1
      Community #0
      Community #0
      Community #0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0
      #4c72b0



In [24]:

    
nodes_source = ColumnDataSource(data)
edges_source = ColumnDataSource(edges_spec)



In [25]:

    
# Plot different clustering as tabs in bokeh
tab1 = plot_graph(nodes_source, edges_source, 
                               legend_col='community',
                  color_col='community_color', tab=True,
                  title='KNN Clustering')

tab2 = plot_graph(nodes_source, edges_source,
                  legend_col='F3', tab=True,
                  color_col='F3_colors',
                  title="lowrank")

tab3 = plot_graph(nodes_source, edges_source,
                  legend_col='F5', tab=True,
                  color_col='F5_colors',
                  title="TopExpressed")

tab4 = plot_graph(nodes_source, edges_source,
                  legend_col='F4', tab=True,
                  color_col='F4_colors',
                  title="TopExpressed_lowrank")

tab5 = plot_graph(nodes_source, edges_source,
                  legend_col='F1', tab=True,
                  color_col='F1_colors',
                  title="HighlyVariable")

tabs = Tabs(tabs=[tab1, tab2, tab3, tab4, tab5])
show(tabs)

Are the cell clustering largely determined by detection rate?



In [ ]:

	barcode	F1	F2	F3	F4	F5
0	ATCTTGGCGGCG	Community #0	Community #-1	Community #0	Community #0	Community #0
1	ATAGAGTCGCAT	Community #0	Community #-1	Community #0	Community #0	Community #0
2	GCGCACGTTGAG	Community #0	Community #-1	Community #0	Community #0	Community #0
3	TTCACGGCTGAG	Community #0	Community #-1	Community #0	Community #0	Community #0
4	GCGAGACACAGC	Community #0	Community #-1	Community #0	Community #0	Community #0

	xs	ys	community	barcode	community_color
97	0.194489	0.261227	Community #-1	ATCTTGGCGGCG	#66c2a5
48	0.020398	0.626423	Community #-1	ATAGAGTCGCAT	#66c2a5
56	0.028661	0.668916	Community #-1	GCGCACGTTGAG	#66c2a5
5	0.047751	0.680641	Community #-1	TTCACGGCTGAG	#66c2a5
121	0.020171	0.648142	Community #-1	GCGAGACACAGC	#66c2a5

	xs	ys	community	barcode	community_color	F1	F2	F3	F4	F5	F1_colors	F2_colors	F3_colors	F4_colors	F5_colors
0	0.280661	0.951813	Community #-1	ATCTTGGCGGCG	#66c2a5	Community #0	Community #-1	Community #0	Community #0	Community #0	#4c72b0	#4c72b0	#4c72b0	#4c72b0	#4c72b0
1	0.586076	0.953690	Community #-1	ATAGAGTCGCAT	#66c2a5	Community #0	Community #-1	Community #0	Community #0	Community #0	#4c72b0	#4c72b0	#4c72b0	#4c72b0	#4c72b0
2	0.547560	0.936233	Community #-1	GCGCACGTTGAG	#66c2a5	Community #0	Community #-1	Community #0	Community #0	Community #0	#4c72b0	#4c72b0	#4c72b0	#4c72b0	#4c72b0
3	0.461449	1.000000	Community #-1	TTCACGGCTGAG	#66c2a5	Community #0	Community #-1	Community #0	Community #0	Community #0	#4c72b0	#4c72b0	#4c72b0	#4c72b0	#4c72b0
4	0.544097	0.895329	Community #-1	GCGAGACACAGC	#66c2a5	Community #0	Community #-1	Community #0	Community #0	Community #0	#4c72b0	#4c72b0	#4c72b0	#4c72b0	#4c72b0