In [1]:
%matplotlib inline

Clustering of high-content screen images to discover off-target phenotypes

This sample notebook collects an entire analysis of a subset of images from a screen.

What is high-content screening and why do you care?

HCS: reverse genetics on a massive scale

HCS: massive success in drug-finding

I want to cluster because I want more from my data

Set up IPython parallel

Launch client


In [2]:
# Connect to a running IPython.parallel cluster (start one beforehand with
# `ipcluster start`).
from IPython.parallel import Client
rc = Client()  # rc[i] addresses engine i; rc.load_balanced_view() is used later


/Users/nuneziglesiasj/anaconda/lib/python2.7/site-packages/IPython/parallel/client/client.py:446: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@172.20.10.2')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

Set up dill as pickler


In [3]:
# Make interactively-defined functions (like fn2feat below) serializable so
# they can be shipped to the worker engines.
# NOTE(review): importing dill presumably extends pickle's capabilities here;
# the exact patching behaviour depends on the dill/IPython versions in use —
# confirm against their documentation.
import pickle
import dill
from types import FunctionType
from IPython.utils.pickleutil import can_map

# Remove IPython's custom "canning" for functions so they fall through to the
# (dill-extended) pickler instead.
can_map.pop(FunctionType, None)

from IPython.kernel.zmq import serialize
serialize.pickle = pickle  # route ZMQ message serialization through pickle

Map images to features

Write map function

2D image --> 1D set of "features" or "descriptors"

Example image

“Every time I fire a linguist, the performance of our speech recognition system goes up.” — Fred Jelinek


In [4]:
from husc.screens import myofusion as myo
from skimage import io
def fn2feat(filename):
    """Read the image at `filename` and return its 1D feature vector.

    The feature names returned by ``myo.feature_map`` are discarded; only
    the numeric features are kept.
    """
    image = io.imread(filename)
    feature_map_result = myo.feature_map(image, sample_size=100)
    return feature_map_result[0]

"Don't care" feature extraction

def feature_vector_from_rgb(image, sample_size=None):
    """Compute a feature vector from the composite images.

    The channels are assumed to be in the following order:
     - Red: MCF-7
     - Green: cytoplasm GFP
     - Blue: DAPI/Hoechst

    Parameters
    ----------
    image : array, shape (M, N, 3)
        The input image.
    sample_size : int, optional
        For features based on quantiles, sample this many objects
        rather than computing full distribution. This can considerably
        speed up computation with little cost to feature accuracy.

    Returns
    -------
    fs : 1D array of float
        The features of the image.
    names : list of string
        The feature names.
    """
    all_fs, all_names = [], []
    # move the channel axis to the front, giving a (3, M, N) stack
    ims = np.rollaxis(image[..., :3], -1, 0) # toss out alpha chan if present
    mcf, cells, nuclei = ims
    prefixes = ['mcf', 'cells', 'nuclei']
    # per-channel intensity/object features, with channel-prefixed names
    for im, prefix in zip(ims, prefixes):
        fs, names = features.intensity_object_features(im,
                                                       sample_size=sample_size)
        names = [prefix + '-' + name for name in names]
        all_fs.append(fs)
        all_names.extend(names)
    # segment nuclei by a global mean threshold, then measure spacing
    nuclei_mean = nd.label(nuclei > np.mean(nuclei))[0]
    fs, names = features.nearest_neighbors(nuclei_mean)
    all_fs.append(fs)
    all_names.extend(['nuclei-' + name for name in names])
    # NOTE(review): `mcf` is labelled without an explicit threshold here,
    # unlike `nuclei` above — presumably the channel is already sparse or
    # binary; confirm against the acquisition pipeline.
    mcf_mean = nd.label(mcf)[0]
    fs, names = features.fraction_positive(nuclei_mean, mcf_mean,
                                           positive_name='mcf')
    all_fs.append(fs)
    all_names.extend(names)
    # nuclei-per-cell histograms under two different cell segmentations
    cells_t_otsu = cells > threshold_otsu(cells)
    cells_t_adapt = threshold_adaptive(cells, 51)
    fs, names = features.nuclei_per_cell_histogram(nuclei_mean, cells_t_otsu)
    all_fs.append(fs)
    all_names.extend(['otsu-' + name for name in names])
    fs, names = features.nuclei_per_cell_histogram(nuclei_mean, cells_t_adapt)
    all_fs.append(fs)
    all_names.extend(['adapt-' + name for name in names])
    return np.concatenate(all_fs), all_names
def object_features(bin_im, im, erode=2, sample_size=None):
    """Compute features about objects in a binary image.

    Parameters
    ----------
    bin_im : 2D np.ndarray of bool
        The image of objects.
    im : 2D np.ndarray of float or uint8
        The actual image.
    erode : int, optional
        Radius of erosion of objects.
    sample_size : int, optional
        Sample this many objects randomly, rather than measuring all
        objects.

    Returns
    -------
    fs : 1D np.ndarray of float
        The feature vector.
    names : list of string
        The names of each feature.
    """
    # smooth object boundaries / remove specks before labelling
    selem = skmorph.disk(erode)
    if erode > 0:
        bin_im = nd.binary_opening(bin_im, selem)
    lab_im, n_objs = nd.label(bin_im)
    if sample_size is None:
        # measure every object
        sample_size = n_objs
        sample_indices = np.arange(n_objs)
    elif n_objs == 0:
        # no objects: nothing to sample.  (The original
        # np.random.randint(0, 0) call raised ValueError here.)
        sample_size = 0
        sample_indices = np.arange(0)
    else:
        # bug fix: sample *distinct* objects when possible.  The original
        # randint call sampled with replacement, so duplicate objects could
        # skew the quantiles.  Fall back to replacement only when more
        # samples than objects are requested (preserving that use case).
        sample_indices = np.random.choice(n_objs, size=sample_size,
                                          replace=(sample_size > n_objs))
    prop_names = ['area', 'eccentricity', 'euler_number', 'extent',
                  'min_intensity', 'mean_intensity', 'max_intensity',
                  'solidity']
    objects = measure.regionprops(lab_im, intensity_image=im)
    # `np.float` was removed in modern NumPy; builtin float is equivalent
    properties = np.empty((sample_size, len(prop_names)), dtype=float)
    for i, j in enumerate(sample_indices):
        properties[i] = [getattr(objects[j], prop) for prop in prop_names]
    quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
    # quantiles of each property over the sampled objects; the transpose
    # makes ravel() property-major, matching the order of `names` below
    feature_quantiles = mquantiles(properties, quantiles, axis=0).T
    fs = np.concatenate([np.array([n_objs], float),
                         feature_quantiles.ravel()])
    names = (['num-objs'] +
             ['%s-percentile%i' % (prop, int(q * 100))
              for prop, q in it.product(prop_names, quantiles)])
    return fs, names

Execute imports in worker nodes


In [5]:
%%px
import numpy as np
np.random.seed(0)
from husc.screens import myofusion as myo
from skimage import io

Find images


In [6]:
# NOTE(review): hardcoded absolute local path — point this at your own copy
# of the screening data.
datadir = '/Users/nuneziglesiasj/Data/myo/'

In [7]:
import os
import scandir

def all_images(path):
    """Yield the path of every '.tif' file under `path`, recursively.

    Parameters
    ----------
    path : string
        Root directory to search.

    Yields
    ------
    string
        Full path of each TIFF image found.
    """
    # stdlib os.walk replaces the third-party scandir.walk: identical
    # (dirpath, dirnames, filenames) semantics, and on Python 3.5+ os.walk
    # already uses the fast scandir implementation internally.
    for dirpath, _, files in os.walk(path):
        for f in files:
            if f.endswith('.tif'):
                yield os.path.join(dirpath, f)

# Materialise the generator so the file list can be indexed and mapped over.
image_fns = list(all_images(datadir))

Execute conversion


In [8]:
import numpy as np

# A load-balanced view distributes the per-image feature extraction across
# all engines; blocking mode makes map() return results directly.
lbview = rc.load_balanced_view()
lbview.block = True
e0 = rc[0]  # a single engine, used once below to recover the feature names
e0.block = True

# Run the feature map once on one engine to obtain the feature names
# (feats0 itself is not used again).
feats0, feature_names = e0.apply_sync(myo.feature_map,
                                      io.imread(image_fns[0]),
                                      sample_size=5)
# Extract features for every image, in parallel, ten filenames per task.
feature_vectors = lbview.map(fn2feat, image_fns,
                             chunksize=10)

import pandas as pd
# One row per image, indexed by (plate, well) coordinate, one column per
# statistical feature.
data = pd.DataFrame(np.vstack(feature_vectors),
                    index=map(myo.filename2coord, image_fns),
                    columns=feature_names)

data now contains a pandas dataframe where each row corresponds to an image and each column to a statistical feature summarising an aspect of that image.


In [9]:
data.head()


Out[9]:
mcf-otsu-threshold-num-objs mcf-otsu-threshold-area-percentile5 mcf-otsu-threshold-area-percentile25 mcf-otsu-threshold-area-percentile50 mcf-otsu-threshold-area-percentile75 mcf-otsu-threshold-area-percentile95 mcf-otsu-threshold-eccentricity-percentile5 mcf-otsu-threshold-eccentricity-percentile25 mcf-otsu-threshold-eccentricity-percentile50 mcf-otsu-threshold-eccentricity-percentile75 ... adapt-cells-with-2-nuclei adapt-cells-with-3-nuclei adapt-cells-with-4-nuclei adapt-cells-with-5-nuclei adapt-cells-with-6-nuclei adapt-cells-with-7-nuclei adapt-cells-with-8-nuclei adapt-cells-with-9-nuclei adapt-cells-with-10-nuclei adapt-cells-with-11-nuclei
(2490688, C03) 481 13 19.35 63.5 157.65 616.20 0 0.588348 0.869246 0.957243 ... 0.179245 0.048445 0.024732 0.019888 0.008924 0.005864 0.004589 0.004080 0.001275 0.012749
(2490688, C04) 703 13 18.00 33.0 91.50 275.34 0 0.397360 0.707107 0.937461 ... 0.075543 0.011264 0.004856 0.002563 0.001551 0.001282 0.001349 0.001012 0.000607 0.008094
(2490688, C05) 489 13 23.00 68.0 168.05 600.97 0 0.649465 0.866184 0.949769 ... 0.128662 0.037240 0.014825 0.011295 0.006530 0.005118 0.003530 0.001941 0.002294 0.010942
(2490688, C06) 515 13 21.00 61.0 121.15 408.81 0 0.674433 0.836927 0.932772 ... 0.105574 0.026983 0.010911 0.008405 0.005603 0.004129 0.003539 0.003391 0.003244 0.012091
(2490688, C07) 546 13 34.80 78.0 173.50 691.75 0 0.673485 0.897959 0.961817 ... 0.066646 0.011197 0.005103 0.003123 0.001828 0.002285 0.001600 0.000762 0.000686 0.006703

5 rows × 301 columns

MongoDB

  • To keep track of gene <> image file <> (plate, well) mappings.

  • (Originally: Python dicts going every which way, built from CSV file!)

  • "Documents" in MongoDB are [{key: value}] collections

  • Fast, open source, wide support, Python bindings

Start MongoDB daemon


In [10]:
import subprocess as sp
# Launch the MongoDB daemon in the background.
# NOTE(review): hardcoded absolute dbpath — adjust for your machine.
sp.Popen(["mongod", "--dbpath", "/Users/nuneziglesiasj/mongodb"])


Out[10]:
<subprocess.Popen at 0x10d029450>

Start MongoDB client

... with a usage example.


In [11]:
from pymongo import MongoClient
# The "wells" collection of the "myofusion" database: one document per well,
# holding the gene <-> image file <-> (plate, well) mapping.
collection = MongoClient()["myofusion"]["wells"]

In [12]:
collection.find_one({"gene_name": "Ppp2cb"})
# use collection.find() to iterate over all matching docs


Out[12]:
{u'_id': u'2490696-I03',
 u'cell_plate_barcode': 2490696,
 u'cell_plate_label': u'MY-pass1-j01-MP5-1',
 u'column': u'3',
 u'experimental_content_type_name': u'SAMPLE',
 u'filename': u'marcelle/raw-data/pass1/job01/MYORES-p1-j01-110210_02490696_92ea70f6-d093-4154-88a1-0a853afacab3_EXPORT/MYORES-p1-j01-110210_02490696_92ea70f6-d093-4154-88a1-0a853afacab3_I03_w1_stitched.chs.tif',
 u'gene_acc': u'19053',
 u'gene_name': u'Ppp2cb',
 u'label': u'sample',
 u'molecule_design_id': u'10386683',
 u'row': u'I',
 u'source_plate_barcode': u'2490684',
 u'source_plate_label': u'MYORES_pass1_p4',
 u'well': u'I03'}

Scale the data, fit the model


In [13]:
from __future__ import division
from sklearn.preprocessing import StandardScaler
from sklearn import cluster
import numpy as np
np.random.seed(0)  # make the stochastic mini-batch k-means reproducible
# Standardise every feature to zero mean / unit variance so no single
# feature dominates the Euclidean distances used by k-means.
X = StandardScaler().fit_transform(data)
avg_cluster_size = 10
# Choose k so clusters contain ~10 samples on average; mini-batch k-means
# keeps the fit tractable for larger screens.
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size),
                            batch_size=1680)
K = K.fit(X)

Examine clusters


In [14]:
def get_cluster_gene_data(model, dataframe, mongo_collection, cluster_id):
    """Return the well coordinates and MongoDB documents for one cluster.

    Parameters
    ----------
    model : fitted clustering model
        Must expose a ``labels_`` array aligned with ``dataframe``'s rows.
    dataframe : pandas.DataFrame
        Feature matrix whose index holds (plate, well) coordinates.
    mongo_collection : pymongo collection
        The wells collection, queried by "_id".
    cluster_id : int
        Label of the cluster to extract.

    Returns
    -------
    coords : list of tuple
        (plate, well) coordinates of the cluster members.
    docs : list of dict
        The matching MongoDB documents (None for missing keys).
    """
    # bug fix: the original indexed the notebook-global `data` instead of
    # the `dataframe` argument, so the parameter was silently ignored.
    coords = [dataframe.index[i]
              for i in np.flatnonzero(model.labels_ == cluster_id)]
    docs = [mongo_collection.find_one({"_id": myo.key2mongo(c)})
            for c in coords]
    return coords, docs

In [15]:
from matplotlib import pyplot as plt
# Distribution of cluster sizes, one point per cluster id.
cluster_sizes = np.bincount(K.labels_)
plt.plot(cluster_sizes)


Out[15]:
[<matplotlib.lines.Line2D at 0x10f48d190>]

In [43]:
# Pick two example clusters to inspect by their size — presumably chosen
# after eyeballing the size plot above; the exact picks are arbitrary.
c0 = np.flatnonzero(cluster_sizes == 6)[0]
c1 = np.flatnonzero(cluster_sizes == 9)[2]

In [17]:
coords0, docs0 = get_cluster_gene_data(K, data, collection, c0)

In [18]:
[d['gene_name'] for d in docs0]


Out[18]:
[u'Ap3b2', u'Nat14', u'Vwa5a', u'Pmm1', u'Spesp1', u'']

Getting the images


In [22]:
from skimage import io

def get_images(docs):
    """Load the image for each MongoDB well document, with a title.

    Parameters
    ----------
    docs : list of dict
        Well documents containing at least a 'filename' and '_id' field.

    Returns
    -------
    ims : list of array
        The loaded images.
    titles : list of string
        "<key> <gene_name>" for each document.
    """
    ims, titles = [], []
    for doc in docs:
        # keep only the last two path components and re-root under datadir
        rel_parts = doc['filename'].split('/')[-2:]
        full_path = os.path.join(datadir, *rel_parts)
        ims.append(io.imread(full_path))
        titles.append(' '.join([doc["_id"],
                                doc.get("gene_name", "None")]))
    return ims, titles

In [21]:
docs0[0]


Out[21]:
{u'_id': u'2490689-E10',
 u'cell_plate_barcode': 2490689,
 u'cell_plate_label': u'MY-pass1-j01-MP6-2',
 u'column': u'10',
 u'experimental_content_type_name': u'SAMPLE',
 u'filename': u'marcelle/raw-data/pass1/job01/MYORES-p1-j01-110210_02490689_8fda946f-d01e-48b4-b9a1-7ad2633aad60_EXPORT/MYORES-p1-j01-110210_02490689_8fda946f-d01e-48b4-b9a1-7ad2633aad60_E10_w1_stitched.chs.tif',
 u'gene_acc': u'11775',
 u'gene_name': u'Ap3b2',
 u'label': u'sample',
 u'molecule_design_id': u'10386882',
 u'row': u'E',
 u'source_plate_barcode': u'2490686',
 u'source_plate_label': u'MYORES_pass1_p6',
 u'well': u'E10'}

In [23]:
images0, titles0 = get_images(docs0)

Okay, let's look at some clusters!


In [24]:
import skdemo
reload(skdemo)
skdemo.imshow_all(*images0, titles=titles0, shape=(2, 3))


Another cluster...


In [44]:
coords1, docs1 = get_cluster_gene_data(K, data, collection, c1)
images1, titles1 = get_images(docs1)
skdemo.imshow_all(*images1, titles=titles1, shape=(2, 3))


UI Mockup

Here we demonstrate a mockup of what an interactive UI with e.g. Bokeh would look like. The figure has six elements:

  • a PCA plot, containing a scatterplot of all the samples
  • an image, showing the currently selected image
  • two smaller "duplicates" images, showing duplicates of the current image. These are also highlighted in the PCA plot.
  • six smaller "neighbors" images, the images closest to the current image in feature space. These should be labelled with the corresponding gene names.
  • a feature map, a simple line plot of the normalised feature map of the current image.
  • a feature histogram, the histogram of the distribution of the current feature across all samples.

First, we need to build a neighbor graph, and get the PCA decomposition, both in a scalable manner.


In [26]:
from sklearn import neighbors, decomposition
# 6-nearest-neighbour graph over all samples, edge weights = feature-space
# distance (sparse CSR matrix).
ng = neighbors.kneighbors_graph(X, 6, mode='distance')
# NOTE(review): RandomizedPCA was later deprecated in scikit-learn in favour
# of PCA(svd_solver='randomized') — update when upgrading sklearn.
reduced_data = decomposition.RandomizedPCA(2).fit_transform(X)

In [27]:
print(type(ng))
print(np.shape(reduced_data))


<class 'scipy.sparse.csr.csr_matrix'>
(1680, 2)

Next, we build up the multi-panel interface. It's a doozy!


In [28]:
import matplotlib as mpl
import itertools as it

def set_border_color(axes, color):
    """Set every spine (border line) of a matplotlib Axes to `color`."""
    spines = [child for child in axes.get_children()
              if isinstance(child, mpl.spines.Spine)]
    for spine in spines:
        spine.set_color(color)

def make_ui(data,
            norm_data, reduced_data, neighbors_graph,
            coords, selected_feature):
    """Draw the multi-panel mock-up UI for one selected image.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per image, indexed by (plate, well).
    norm_data : array
        Standardised feature matrix.  (Currently unused; kept in the
        signature for the eventual interactive version.)
    reduced_data : array, shape (n_samples, 2)
        2D PCA projection of all samples.
    neighbors_graph : scipy.sparse matrix
        k-nearest-neighbours graph over the samples (distance mode).
    coords : tuple
        (plate, well) coordinate of the selected image.
    selected_feature : string
        Column of `data` whose distribution is plotted.

    Notes
    -----
    Relies on the notebook-level `collection` (MongoDB), `get_images`,
    and `set_border_color`.
    """
    # define layout: panels on a 4x6 grid
    fig = plt.figure(figsize=(15, 10))
    grid_spacing = (4, 6)
    pca_loc, pca_span = (0, 0), (2, 3)
    im_loc, im_span = (0, 3), (2, 2)
    dups_loc, dups_span = (0, 5), (2, 1)
    feat_loc, feat_span = (2, 0), (1, 3)
    hist_loc, hist_span = (3, 0), (1, 3)
    
    # get keys, indices, titles ready
    k = myo.key2mongo(coords)
    image_index = data.index.get_loc(coords)
    doc = collection.find_one(k)
    gene = doc['gene_name']
    image, title = zip(*get_images([doc]))[0]
    # bug fix: the original hard-coded sample index 1302 here (and in the
    # scatterplot below) and read the global `ng`, silently ignoring both
    # `coords` and the `neighbors_graph` argument.  Use the selected
    # image's own index and the passed-in graph instead.
    neighbor_indices = data.index[neighbors_graph[image_index].indices]
    neighbor_mongo_keys = map(myo.key2mongo, neighbor_indices)
    nimages, ntitles = get_images([collection.find_one({'_id': kn})
                                   for kn in neighbor_mongo_keys])
    # all other wells targeting the same gene (duplicates of this image)
    dup_docs = [d for d in collection.find({'gene_name': gene})
                if d['_id'] != k]
    dimages, dtitles = get_images(dup_docs)
    
    # plot stuff!
    # first, scatter: selected sample in orange, its neighbors in blue
    gray, orange, blue = '#999999', '#e69f00', '#56b4e9'
    colors = [gray] * len(reduced_data)
    colors[image_index] = orange
    for i in neighbors_graph[image_index].indices:
        colors[i] = blue
    ax_pca = plt.subplot2grid(grid_spacing, pca_loc, *pca_span)
    ax_pca.scatter(*(reduced_data.T), marker='x', c=colors)
    ax_pca.set_title("samples")
    
    # next, the currently selected image (orange border matches scatter)
    ax_im = plt.subplot2grid(grid_spacing, im_loc, *im_span)
    ax_im.imshow(image)
    ax_im.set_title(title)
    ax_im.set_xticks([])
    ax_im.set_yticks([])
    set_border_color(ax_im, orange)
    
    # next, plot the feature map, with the selected feature marked
    ax_feat = plt.subplot2grid(grid_spacing, feat_loc, *feat_span)
    ax_feat.plot(data.iloc[image_index], c=orange, lw=2)
    ax_feat.set_title('features')
    ax_feat.vlines(data.columns.get_loc(selected_feature),
                   *ax_feat.get_ylim(), colors='g', linestyles='dashed',
                   lw=2)
    # to-do: set feature titles as tooltips
    
    # next, plot a histogram of the chosen feature across all samples,
    # marking this image's value
    feat = data[selected_feature]
    ax_hist = plt.subplot2grid(grid_spacing, hist_loc, *hist_span)
    ax_hist.hist(feat, bins=80, normed=True, color=(0, 0, 1, 0.5))
    ax_hist.set_title('distribution of ' + selected_feature)
    value_in_curr_image = feat[image_index]
    ax_hist.vlines(value_in_curr_image, 0, 1,
                   colors='g', linestyles='dashed', lw=2)
    # to-do: this needs LOD or "stacked scatter" to allow interactivity

    # plot neighbor images in a 2x3 sub-grid, blue borders matching scatter
    for n_loc, nimage, ntitle in zip(it.product([2, 3], [3, 4, 5]),
                                     nimages, ntitles):
        ax_n = plt.subplot2grid(grid_spacing, n_loc)
        ax_n.imshow(nimage)
        ax_n.set_title(ntitle)
        ax_n.set_xticks([])
        ax_n.set_yticks([])
        set_border_color(ax_n, blue)
        
    # plot duplicate (same-gene) images in the right-hand column
    for d_loc, d_image, d_title in zip([(0, 5), (1, 5)],
                                       dimages, dtitles):
        ax_d = plt.subplot2grid(grid_spacing, d_loc)
        ax_d.imshow(d_image)
        ax_d.set_title(d_title)
        ax_d.set_xticks([])
        ax_d.set_yticks([])
        
    plt.tight_layout()
    plt.show()

Finally, we test it out:


In [45]:
coords = (2490700, 'K12')
#feature = 'mcf-otsu-threshold-Area-percentile50'
feature = 'nuclei-adaptive-threshold-min_intensity-percentile50'
make_ui(data, X, reduced_data, ng, coords, feature)


Conclusions and future directions

  • Image clustering in Python works pretty close to "out of the box"
  • Goal is not an easy GUI for "standard" screen analysis (covered by excellent CellProfiler, also using SciPy stack)
  • Goal is to allow bioinformaticians to quickly define a feature set and ship an interactive, multivariate, exploratory analysis to biologists
  • Use clustering/neighbours network for gene functional prediction
  • Various minor improvements:
    • Name change: "HUSC" short, pronounceable, not a word, but not Googleable
    • Improve image preprocessing
    • Improve clustering methods