Sample Notebook 3 for Picasso

This notebook shows how to perform HDBSCAN clustering of localizations with Picasso.
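
The wrapper below relies on the third-party hdbscan package in addition to picasso, numpy, and scipy. As an optional check (not part of the original workflow), the import can be verified first; if it fails, the package can typically be installed with pip install hdbscan.

In [ ]:
# Optional check: confirm that the hdbscan package is importable
import hdbscan
print('hdbscan imported successfully')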

Load the localizations


In [1]:
from picasso import io
path = 'testdata_locs.hdf5'
locs, info = io.load_locs(path)

print('Loaded {} locs.'.format(len(locs)))


Loaded 11975 locs.
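
locs is a NumPy record array. As an optional sanity check (not part of the original notebook), its fields can be listed, for example to confirm whether a z column is present before clustering:

In [ ]:
# Optional: list the localization fields (e.g. to see whether z is present)
print(locs.dtype.names)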

HDBSCAN wrapper for locs


In [3]:
from hdbscan import HDBSCAN as _HDBSCAN
import numpy as _np
from scipy.spatial import ConvexHull

from picasso import lib as _lib

# Clustering with HDBSCAN

def hdbscan(locs, min_samples, min_cluster_size):
    print("Identifying clusters...")
    if hasattr(locs, "z"):
        print("z-coordinates detected")
        pixelsize = int(input("Enter the pixelsize in nm/px: "))
        locs = locs[
            _np.isfinite(locs.x) & _np.isfinite(locs.y) & _np.isfinite(locs.z)
        ]
        X = _np.vstack((locs.x, locs.y, locs.z / pixelsize)).T
        db = _HDBSCAN(
            min_samples=min_samples, min_cluster_size=min_cluster_size
        ).fit(X)
        group = _np.int32(db.labels_)  # int32 for Origin compatibility
        locs = _lib.append_to_rec(locs, group, "group")
        locs = locs[locs.group != -1]
        print("Generating cluster information...")
        groups = _np.unique(locs.group)
        n_groups = len(groups)
        mean_frame = _np.zeros(n_groups)
        std_frame = _np.zeros(n_groups)
        com_x = _np.zeros(n_groups)
        com_y = _np.zeros(n_groups)
        com_z = _np.zeros(n_groups)
        std_x = _np.zeros(n_groups)
        std_y = _np.zeros(n_groups)
        std_z = _np.zeros(n_groups)
        convex_hull = _np.zeros(n_groups)
        volume = _np.zeros(n_groups)
        n = _np.zeros(n_groups, dtype=_np.int32)
        for i, group in enumerate(groups):
            group_locs = locs[locs.group == group]  # localizations in this cluster
            mean_frame[i] = _np.mean(group_locs.frame)
            com_x[i] = _np.mean(group_locs.x)
            com_y[i] = _np.mean(group_locs.y)
            com_z[i] = _np.mean(group_locs.z)
            std_frame[i] = _np.std(group_locs.frame)
            std_x[i] = _np.std(group_locs.x)
            std_y[i] = _np.std(group_locs.y)
            std_z[i] = _np.std(group_locs.z)
            n[i] = len(group_locs)
            X_group = _np.stack(
                [group_locs.x, group_locs.y, group_locs.z / pixelsize], axis=0
            ).T
            # effective sphere volume with radius 2 x mean s.d. (px^3)
            volume[i] = (
                _np.power(
                    (std_x[i] + std_y[i] + (std_z[i] / pixelsize)) / 3 * 2, 3
                )
                * _np.pi
                * 4
                / 3
            )
            try:
                hull = ConvexHull(X_group)
                convex_hull[i] = hull.volume
            except Exception as e:
                print(e)
                convex_hull[i] = 0
        clusters = _np.rec.array(
            (
                groups,
                convex_hull,
                volume,
                mean_frame,
                com_x,
                com_y,
                com_z,
                std_frame,
                std_x,
                std_y,
                std_z,
                n,
            ),
            dtype=[
                ("groups", groups.dtype),
                ("convex_hull", "f4"),
                ("volume", "f4"),
                ("mean_frame", "f4"),
                ("com_x", "f4"),
                ("com_y", "f4"),
                ("com_z", "f4"),
                ("std_frame", "f4"),
                ("std_x", "f4"),
                ("std_y", "f4"),
                ("std_z", "f4"),
                ("n", "i4"),
            ],
        )
    else:
        locs = locs[_np.isfinite(locs.x) & _np.isfinite(locs.y)]
        X = _np.vstack((locs.x, locs.y)).T
        db = _HDBSCAN(
            min_samples=min_samples, min_cluster_size=min_cluster_size
        ).fit(X)
        group = _np.int32(db.labels_)  # int32 for Origin compatibility
        locs = _lib.append_to_rec(locs, group, "group")
        locs = locs[locs.group != -1]
        print("Generating cluster information...")
        groups = _np.unique(locs.group)
        n_groups = len(groups)
        mean_frame = _np.zeros(n_groups)
        std_frame = _np.zeros(n_groups)
        com_x = _np.zeros(n_groups)
        com_y = _np.zeros(n_groups)
        std_x = _np.zeros(n_groups)
        std_y = _np.zeros(n_groups)
        convex_hull = _np.zeros(n_groups)
        area = _np.zeros(n_groups)
        n = _np.zeros(n_groups, dtype=_np.int32)
        for i, group in enumerate(groups):
            group_locs = locs[locs.group == group]  # localizations in this cluster
            mean_frame[i] = _np.mean(group_locs.frame)
            com_x[i] = _np.mean(group_locs.x)
            com_y[i] = _np.mean(group_locs.y)
            std_frame[i] = _np.std(group_locs.frame)
            std_x[i] = _np.std(group_locs.x)
            std_y[i] = _np.std(group_locs.y)
            n[i] = len(group_locs)
            X_group = _np.stack([group_locs.x, group_locs.y], axis=0).T
            # effective circle area with radius std_x + std_y (px^2)
            area[i] = _np.power((std_x[i] + std_y[i]), 2) * _np.pi
            try:
                hull = ConvexHull(X_group)
                convex_hull[i] = hull.volume  # for a 2D hull, .volume is the area
            except Exception as e:
                print(e)
                convex_hull[i] = 0
        clusters = _np.rec.array(
            (
                groups,
                convex_hull,
                area,
                mean_frame,
                com_x,
                com_y,
                std_frame,
                std_x,
                std_y,
                n,
            ),
            dtype=[
                ("groups", groups.dtype),
                ("convex_hull", "f4"),
                ("area", "f4"),
                ("mean_frame", "f4"),
                ("com_x", "f4"),
                ("com_y", "f4"),
                ("std_frame", "f4"),
                ("std_x", "f4"),
                ("std_y", "f4"),
                ("n", "i4"),
            ],
        )
    return clusters, locs

Calculate the clusters


In [4]:
min_samples = 10
min_cluster_size = 10
clusters, locs = hdbscan(locs, min_samples, min_cluster_size)


Identifying clusters...
Generating cluster information...
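
As a quick, optional look at the result (assuming the recarray fields defined in the wrapper above, in particular the per-cluster count n), the number of clusters and the typical cluster size can be printed:

In [ ]:
# Optional summary of the clustering result
import numpy as np
print('Found {} clusters.'.format(len(clusters)))
print('Localizations per cluster (median): {}'.format(np.median(clusters.n)))
print('Localizations kept after removing noise: {}'.format(len(locs)))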

Save


In [5]:
from picasso import io
import os
from h5py import File

base, ext = os.path.splitext(path)
hdbscan_info = {
    "Generated by": "Picasso HDBSCAN",
    "Min samples": min_samples,
    "Min cluster size": min_cluster_size,
}
info.append(hdbscan_info)
io.save_locs(base + "_hdbscan.hdf5", locs, info)
with File(base + "_hdbclusters.hdf5", "w") as clusters_file:
    clusters_file.create_dataset("clusters", data=clusters)
print('Complete')


Complete
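
To double-check the written files, the cluster table can be read back with h5py. This is a minimal sketch, assuming the file names and the base variable from the cell above:

In [ ]:
# Optional: re-load the saved cluster table to verify the write
from h5py import File
with File(base + "_hdbclusters.hdf5", "r") as f:
    clusters_back = f["clusters"][:]
print('Re-loaded {} clusters.'.format(len(clusters_back)))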