In [1]:
%matplotlib inline
This sample notebook presents an end-to-end analysis of a subset of images from a high-content screen: parallel feature extraction, clustering, and a mockup of an interactive exploration UI.
In [2]:
# Connect to the IPython parallel cluster (engines must already be running,
# e.g. via `ipcluster start`).
# NOTE(review): IPython.parallel was later split out into the `ipyparallel`
# package — confirm the environment still provides this import path.
from IPython.parallel import Client
rc = Client()
In [3]:
# Serialization workaround so notebook-defined functions can be shipped to
# the engines: remove FunctionType from IPython's "canning" map and point
# the kernel serializer at plain `pickle`.
# NOTE(review): presumably `dill` (imported here) is what then handles
# function pickling via pickle's dispatch — verify against the IPython
# version in use.
import pickle
import dill
from types import FunctionType
from IPython.utils.pickleutil import can_map
can_map.pop(FunctionType, None)
from IPython.kernel.zmq import serialize
serialize.pickle = pickle
In [4]:
from husc.screens import myofusion as myo
from skimage import io
def fn2feat(filename):
    """Read the image at `filename` and return its feature vector.

    Thin wrapper around ``myo.feature_map`` taking a single picklable
    argument, so it can be mapped over filenames on parallel engines.
    Only the feature values (first element of the returned pair) are kept.
    """
    return myo.feature_map(io.imread(filename), sample_size=100)[0]
def feature_vector_from_rgb(image, sample_size=None):
    """Compute a feature vector from the composite images.

    The channels are assumed to be in the following order:

    - Red: MCF-7
    - Green: cytoplasm GFP
    - Blue: DAPI/Hoechst

    Parameters
    ----------
    image : array, shape (M, N, 3)
        The input image.
    sample_size : int, optional
        For features based on quantiles, sample this many objects
        rather than computing full distribution. This can considerably
        speed up computation with little cost to feature accuracy.

    Returns
    -------
    fs : 1D array of float
        The features of the image.
    names : list of string
        The feature names.
    """
    all_fs, all_names = [], []
    ims = np.rollaxis(image[..., :3], -1, 0)  # toss out alpha chan if present
    mcf, cells, nuclei = ims
    prefixes = ['mcf', 'cells', 'nuclei']
    # Per-channel intensity/object features, name-spaced by channel prefix.
    for im, prefix in zip(ims, prefixes):
        fs, names = features.intensity_object_features(im,
                                                       sample_size=sample_size)
        names = [prefix + '-' + name for name in names]
        all_fs.append(fs)
        all_names.extend(names)
    # Nuclei segmented by thresholding at the channel's mean intensity.
    nuclei_mean = nd.label(nuclei > np.mean(nuclei))[0]
    fs, names = features.nearest_neighbors(nuclei_mean)
    all_fs.append(fs)
    all_names.extend(['nuclei-' + name for name in names])
    # NOTE(review): the MCF channel is labeled without thresholding, so any
    # nonzero pixel counts as foreground — confirm this is intentional.
    mcf_mean = nd.label(mcf)[0]
    fs, names = features.fraction_positive(nuclei_mean, mcf_mean,
                                           positive_name='mcf')
    all_fs.append(fs)
    all_names.extend(names)
    # Two alternative cell segmentations: global Otsu vs adaptive (window 51).
    cells_t_otsu = cells > threshold_otsu(cells)
    cells_t_adapt = threshold_adaptive(cells, 51)
    fs, names = features.nuclei_per_cell_histogram(nuclei_mean, cells_t_otsu)
    all_fs.append(fs)
    all_names.extend(['otsu-' + name for name in names])
    fs, names = features.nuclei_per_cell_histogram(nuclei_mean, cells_t_adapt)
    all_fs.append(fs)
    all_names.extend(['adapt-' + name for name in names])
    return np.concatenate(all_fs), all_names
def object_features(bin_im, im, erode=2, sample_size=None):
    """Compute features about objects in a binary image.

    Parameters
    ----------
    bin_im : 2D np.ndarray of bool
        The image of objects.
    im : 2D np.ndarray of float or uint8
        The actual image.
    erode : int, optional
        Radius of erosion of objects.
    sample_size : int, optional
        Sample this many objects randomly (with replacement), rather
        than measuring all objects.

    Returns
    -------
    fs : 1D np.ndarray of float
        The feature vector.
    names : list of string
        The names of each feature.
    """
    # Morphological opening removes speckle smaller than the erosion radius.
    selem = skmorph.disk(erode)
    if erode > 0:
        bin_im = nd.binary_opening(bin_im, selem)
    lab_im, n_objs = nd.label(bin_im)
    if sample_size is None:
        # Measure every labeled object.
        sample_size = n_objs
        sample_indices = np.arange(n_objs)
    else:
        # NOTE: sampling is with replacement, so an object may be counted
        # more than once; the quantile summaries below are robust to this.
        sample_indices = np.random.randint(0, n_objs, size=sample_size)
    prop_names = ['area', 'eccentricity', 'euler_number', 'extent',
                  'min_intensity', 'mean_intensity', 'max_intensity',
                  'solidity']
    objects = measure.regionprops(lab_im, intensity_image=im)
    # Bug fix: `np.float` (an alias for the builtin `float`) was deprecated
    # in NumPy 1.20 and removed in 1.24; use `float` directly.
    properties = np.empty((sample_size, len(prop_names)), dtype=float)
    for i, j in enumerate(sample_indices):
        properties[i] = [getattr(objects[j], prop) for prop in prop_names]
    quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
    # After .T the array is (n_props, n_quantiles), so ravel() is
    # property-major — matching the it.product(prop_names, quantiles)
    # ordering used for the names below.
    feature_quantiles = mquantiles(properties, quantiles, axis=0).T
    fs = np.concatenate([np.array([n_objs], float),
                         feature_quantiles.ravel()])
    names = (['num-objs'] +
             ['%s-percentile%i' % (prop, int(q * 100))
              for prop, q in it.product(prop_names, quantiles)])
    return fs, names
In [5]:
%%px
# Run on every parallel engine: seed numpy for reproducibility and make the
# analysis imports available in each engine's namespace.
import numpy as np
np.random.seed(0)
from husc.screens import myofusion as myo
from skimage import io
In [6]:
# NOTE(review): hardcoded absolute local path — parameterize (e.g. via an
# environment variable) before sharing this notebook.
datadir = '/Users/nuneziglesiasj/Data/myo/'
In [7]:
import os
import scandir
def all_images(path):
    """Yield the full path of every .tif file under `path`, recursively.

    Parameters
    ----------
    path : string
        Root directory to search.

    Yields
    ------
    fn : string
        Full path of a .tif image file.
    """
    # Use the stdlib os.walk instead of the third-party `scandir` backport:
    # results are identical, and the backport existed only for speed on old
    # Pythons (os.walk has used scandir internally since Python 3.5).
    for p, _, files in os.walk(path):
        for f in files:
            if f.endswith('.tif'):
                yield os.path.join(p, f)
# Enumerate every image in the screen (order is filesystem-dependent).
image_fns = list(all_images(datadir))
In [8]:
import numpy as np
# Load-balanced view distributes the per-image tasks across all engines.
lbview = rc.load_balanced_view()
lbview.block = True
e0 = rc[0]
e0.block = True
# Run one image on a single engine first, mainly to recover the feature
# names (a tiny sample_size keeps this quick; the feature values for this
# image are recomputed in the full map below).
feats0, feature_names = e0.apply_sync(myo.feature_map,
                                      io.imread(image_fns[0]),
                                      sample_size=5)
# Map the feature computation over all images, 10 filenames per task.
feature_vectors = lbview.map(fn2feat, image_fns,
                             chunksize=10)
import pandas as pd
# One row per image (indexed by well coordinate), one column per feature.
data = pd.DataFrame(np.vstack(feature_vectors),
                    index=map(myo.filename2coord, image_fns),
                    columns=feature_names)
`data` now contains a pandas DataFrame in which each row corresponds to an image and each column to a statistical feature summarising an aspect of that image.
In [9]:
data.head()
Out[9]:
In [10]:
import subprocess as sp
# Start a local MongoDB server in the background.
# NOTE(review): hardcoded absolute db path, and the spawned process is never
# explicitly terminated by this notebook.
sp.Popen(["mongod", "--dbpath", "/Users/nuneziglesiasj/mongodb"])
Out[10]:
In [11]:
from pymongo import MongoClient
collection = MongoClient()["myofusion"]["wells"]
In [12]:
collection.find_one({"gene_name": "Ppp2cb"})
# use collection.find() to iterate over all matching docs
Out[12]:
In [13]:
from __future__ import division
from sklearn.preprocessing import StandardScaler
from sklearn import cluster
import numpy as np
np.random.seed(0)
# Standardize every feature to zero mean / unit variance before clustering,
# so no single feature dominates the Euclidean distances.
X = StandardScaler().fit_transform(data)
avg_cluster_size = 10
# Choose k so clusters average ~10 samples each; MiniBatchKMeans scales to
# the large number of samples.
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size),
                            batch_size=1680)
K = K.fit(X)
In [14]:
def get_cluster_gene_data(model, dataframe, mongo_collection, cluster_id):
    """Return the coordinates and metadata documents of a cluster's members.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose `labels_`, aligned with the rows of `dataframe`.
    dataframe : pandas.DataFrame
        Feature matrix whose index holds the well coordinates.
    mongo_collection : pymongo collection
        Maps well keys (via `myo.key2mongo`) to screen metadata documents.
    cluster_id : int
        The cluster label to look up.

    Returns
    -------
    coords : list
        Index entries (well coordinates) of the cluster members.
    docs : list
        Corresponding MongoDB documents (None where no match is found).
    """
    # Bug fix: previously indexed the global `data` instead of the
    # `dataframe` argument, silently ignoring the frame passed by callers.
    coords = [dataframe.index[i]
              for i in np.flatnonzero(model.labels_ == cluster_id)]
    docs = [mongo_collection.find_one({"_id": myo.key2mongo(c)})
            for c in coords]
    return coords, docs
In [15]:
from matplotlib import pyplot as plt
# Distribution of cluster sizes: most should sit near the target average of
# 10 members, with a few outliers.
cluster_sizes = np.bincount(K.labels_)
plt.plot(cluster_sizes)
Out[15]:
In [43]:
# Pick two example clusters by size: the first cluster with exactly 6
# members and the third with exactly 9 (choices depend on the seeded fit).
c0 = np.flatnonzero(cluster_sizes == 6)[0]
c1 = np.flatnonzero(cluster_sizes == 9)[2]
In [17]:
coords0, docs0 = get_cluster_gene_data(K, data, collection, c0)
In [18]:
[d['gene_name'] for d in docs0]
Out[18]:
In [22]:
from skimage import io
def get_images(docs):
    """Load the image and build a display title for each metadata document.

    The stored filename's last two path components (plate directory and file
    name) are re-rooted under the local `datadir`.  Each title is the
    document's id followed by its gene name ("None" when absent).
    """
    ims, titles = [], []
    for doc in docs:
        rel_parts = doc['filename'].split('/')[-2:]
        local_fn = os.path.join(datadir, *rel_parts)
        ims.append(io.imread(local_fn))
        titles.append(' '.join([doc["_id"],
                                doc.get("gene_name", "None")]))
    return ims, titles
In [21]:
docs0[0]
Out[21]:
In [23]:
images0, titles0 = get_images(docs0)
In [24]:
import skdemo
# reload picks up any live edits to the helper module (Python 2 builtin).
reload(skdemo)
# Show the six images of cluster c0 on a 2x3 grid.
skdemo.imshow_all(*images0, titles=titles0, shape=(2, 3))
In [44]:
coords1, docs1 = get_cluster_gene_data(K, data, collection, c1)
images1, titles1 = get_images(docs1)
skdemo.imshow_all(*images1, titles=titles1, shape=(2, 3))
Here we demonstrate a mockup of what an interactive UI with e.g. Bokeh would look like. The figure has five elements: a PCA scatter of all samples, the selected image, duplicate wells of the same gene, the selected image's feature profile, and the distribution of one chosen feature.
First, we need to build a neighbor graph, and get the PCA decomposition, both in a scalable manner.
In [26]:
from sklearn import neighbors, decomposition
# Sparse 6-nearest-neighbor graph over samples, edges weighted by distance.
ng = neighbors.kneighbors_graph(X, 6, mode='distance')
# Project the feature matrix to 2D for the scatter panel.
# NOTE(review): RandomizedPCA was removed in later scikit-learn versions;
# PCA(n_components=2, svd_solver='randomized') is the replacement.
reduced_data = decomposition.RandomizedPCA(2).fit_transform(X)
In [27]:
print(type(ng))
print(np.shape(reduced_data))
Next, we build up the multi-panel interface. It's a doozy!
In [28]:
import matplotlib as mpl
import itertools as it
def set_border_color(axes, color):
    """Set the border (spine) color of a matplotlib Axes.

    Parameters
    ----------
    axes : matplotlib Axes
        The axes whose border to recolor.
    color : matplotlib color spec
        The color to apply to every spine.
    """
    spines = [child for child in axes.get_children()
              if isinstance(child, mpl.spines.Spine)]
    for spine in spines:
        spine.set_color(color)
def make_ui(data,
            norm_data, reduced_data, neighbors_graph,
            coords, selected_feature):
    """Draw the multi-panel mockup UI for one selected well.

    Panels: a PCA scatter of all samples (selected well in orange, its
    nearest neighbors in blue), the selected image, duplicate wells for the
    same gene, the feature profile of the selected image, and the
    distribution of one chosen feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature matrix, indexed by well coordinate.
    norm_data : array
        Standardized feature matrix. (Currently unused; kept so existing
        callers keep working.)
    reduced_data : array, shape (n_samples, 2)
        2D PCA projection of the samples.
    neighbors_graph : scipy sparse matrix
        k-nearest-neighbor graph over the samples.
    coords : tuple
        Well coordinate of the selected sample (an index entry of `data`).
    selected_feature : string
        Column of `data` to histogram.

    Notes
    -----
    Also relies on module-level state: `collection` (MongoDB collection)
    and, via `get_images`, `datadir`.
    """
    # define layout
    fig = plt.figure(figsize=(15, 10))
    grid_spacing = (4, 6)
    pca_loc, pca_span = (0, 0), (2, 3)
    im_loc, im_span = (0, 3), (2, 2)
    dups_loc, dups_span = (0, 5), (2, 1)
    feat_loc, feat_span = (2, 0), (1, 3)
    hist_loc, hist_span = (3, 0), (1, 3)
    # get keys, indices, titles ready
    k = myo.key2mongo(coords)
    image_index = data.index.get_loc(coords)
    doc = collection.find_one(k)
    gene = doc['gene_name']
    image, title = zip(*get_images([doc]))[0]
    # Bug fix: the selected sample's row was hardcoded as 1302 (and the
    # global `ng` was used instead of the `neighbors_graph` argument), so
    # the highlighted point and neighbor panels ignored `coords`.
    neighbor_indices = data.index[neighbors_graph[image_index].indices]
    neighbor_mongo_keys = map(myo.key2mongo, neighbor_indices)
    nimages, ntitles = get_images([collection.find_one({'_id': kn})
                                   for kn in neighbor_mongo_keys])
    # Duplicate wells: the other documents recorded for the same gene.
    # (Renamed from `dups_mongo_keys` — these are full documents, not keys.)
    dup_docs = [d for d in collection.find({'gene_name': gene})
                if d['_id'] != k]
    dimages, dtitles = get_images(dup_docs)
    # plot stuff!
    # first, scatter: gray background, orange selected sample, blue neighbors
    gray, orange, blue = '#999999', '#e69f00', '#56b4e9'
    colors = [gray] * len(reduced_data)
    colors[image_index] = orange
    for i in neighbors_graph[image_index].indices:
        colors[i] = blue
    ax_pca = plt.subplot2grid(grid_spacing, pca_loc, *pca_span)
    ax_pca.scatter(*(reduced_data.T), marker='x', c=colors)
    ax_pca.set_title("samples")
    # next, the selected image
    ax_im = plt.subplot2grid(grid_spacing, im_loc, *im_span)
    ax_im.imshow(image)
    ax_im.set_title(title)
    ax_im.set_xticks([])
    ax_im.set_yticks([])
    set_border_color(ax_im, orange)
    # next, plot the feature map, with the selected feature marked
    ax_feat = plt.subplot2grid(grid_spacing, feat_loc, *feat_span)
    ax_feat.plot(data.iloc[image_index], c=orange, lw=2)
    ax_feat.set_title('features')
    ax_feat.vlines(data.columns.get_loc(selected_feature),
                   *ax_feat.get_ylim(), colors='g', linestyles='dashed',
                   lw=2)
    # to-do: set feature titles as tooltips
    # next, plot a histogram of the chosen feature, with this image's value
    feat = data[selected_feature]
    ax_hist = plt.subplot2grid(grid_spacing, hist_loc, *hist_span)
    ax_hist.hist(feat, bins=80, normed=True, color=(0, 0, 1, 0.5))
    ax_hist.set_title('distribution of ' + selected_feature)
    value_in_curr_image = feat[image_index]
    ax_hist.vlines(value_in_curr_image, 0, 1,
                   colors='g', linestyles='dashed', lw=2)
    # to-do: this needs LOD or "stacked scatter" to allow interactivity
    # plot neighbor images in the bottom-right 2x3 grid
    for n_loc, nimage, ntitle in zip(it.product([2, 3], [3, 4, 5]),
                                     nimages, ntitles):
        ax_n = plt.subplot2grid(grid_spacing, n_loc)
        ax_n.imshow(nimage)
        ax_n.set_title(ntitle)
        ax_n.set_xticks([])
        ax_n.set_yticks([])
        set_border_color(ax_n, blue)
    # plot duplicate-well images in the top-right column
    for d_loc, d_image, d_title in zip([(0, 5), (1, 5)],
                                       dimages, dtitles):
        ax_d = plt.subplot2grid(grid_spacing, d_loc)
        ax_d.imshow(d_image)
        ax_d.set_title(d_title)
        ax_d.set_xticks([])
        ax_d.set_yticks([])
    plt.tight_layout()
    plt.show()
Finally, we test it out:
In [45]:
# Select a well coordinate and a feature to highlight, then draw the UI.
coords = (2490700, 'K12')
#feature = 'mcf-otsu-threshold-Area-percentile50'
feature = 'nuclei-adaptive-threshold-min_intensity-percentile50'
make_ui(data, X, reduced_data, ng, coords, feature)