In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import time
import warnings; warnings.filterwarnings("ignore")

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from functools import partial
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.neighbors import BallTree
from sklearn.preprocessing import LabelEncoder

from icing.core import distances
from icing.core.distances import *
from icing.externals.DbCore import parseAllele, gene_regex, junction_re
from icing.utils import io
from icing import inference

ICING tutorial


ICING is an IG clonotype inference library developed in Python.

NB: This is NOT a quickstart guide for ICING. It is intended as a detailed tutorial on how ICING works internally. If you're only interested in using ICING, please refer to the Quickstart Manual on GitHub, or to the Quickstart section at the end of this notebook.

ICING takes as input a file (tab-delimited or CSV) that contains one sequence per row. The format is the same as returned by Change-O's MakeDb.py script, which builds a single file collecting all the information that IMGT extracts from the raw FASTA sequences.

0. Data loading

Load the dataset into a single pandas dataframe called X.

The dataset MUST CONTAIN at least the following columns (NOT case-sensitive); a minimal illustrative example follows the list:

  • SEQUENCE_ID
  • V_CALL
  • J_CALL
  • JUNCTION
  • MUT (only if correct is True)
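
For illustration only, a minimal input could look like the sketch below. The column names are the required ones; the values are made-up placeholders, not real IMGT/Change-O output.

In [ ]:
# purely illustrative: a tiny DataFrame with the required columns
# (values are made up; a real file comes from Change-O's MakeDb.py)
toy = pd.DataFrame({
    "SEQUENCE_ID": ["db_ex_1_42_a", "db_ex_2_42_b"],
    "V_CALL": ["IGHV1-69*01", "IGHV1-69*01"],
    "J_CALL": ["IGHJ4*02", "IGHJ4*02"],
    "JUNCTION": ["TGTGCGAGAGATTTTGACTACTGG", "TGTGCGAGAGACTTTGACTACTGG"],
    "MUT": [0.02, 0.03],  # only required if correct=True
})
toy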

In [ ]:
db_file = '../examples/data/clones_100.100.tab'

# dialect="excel" for CSV or XLS files
# for computational reasons, let's limit the dataset to the first 1000 sequences
X = io.load_dataframe(db_file, dialect="excel-tab")[:1000] 

# turn the following off if data are real
# otherwise, assume that the "SEQUENCE_ID" field is composed as
# "[db]_[extension]_[id]_[id-true-clonotype]_[other-info]"
# See the example file for the format of the input.
X['true_clone'] = [x[3] for x in X.sequence_id.str.split('_')]

1. Preprocessing step: data shrinking

Especially in CLL patients, most of the input sequences share the same V genes AND junction. In this case, such sequences can be removed from the analysis (we only need to remember them afterwards). In other words, we can collapse repeated sequences into a single representative, whose weight is the number of sequences it represents.


In [ ]:
# group sequences by V gene set and junction
groups = X.groupby(["v_gene_set_str", "junc"]).groups.values()
idxs = np.array([elem[0] for elem in groups])  # one representative per group
weights = np.array([len(elem) for elem in groups])  # weight = group size
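
As a quick sanity check (not part of the original pipeline), every sequence should be accounted for by exactly one representative, so the weights must sum to the number of rows:

In [ ]:
# sanity check: the groups partition the dataset
# (assumes no missing values in the grouping columns)
print("%d representatives for %d sequences" % (len(idxs), X.shape[0]))
assert weights.sum() == X.shape[0]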

2. High-level group inference

The number of sequences at this point may still be very high, in particular when IGs are mutated and there is little replication. However, we rely on the fact that IG similarity is mainly constrained by junction length. Therefore, we infer high-level groups based on junction lengths.

This is a fast and efficient step. Also, by exploiting MiniBatchKMeans, we can specify an upper bound on the number of clusters we want to obtain. However, contrary to the standard KMeans algorithm, some clusters may vanish. If related IGs with very different junction lengths are expected, it is reasonable to specify a low number of clusters.

Keep in mind, however, that a low number of clusters corresponds to a higher computational workload in the next phases.


In [ ]:
n_clusters = 50

X_all = idxs.reshape(-1, 1)
kmeans = MiniBatchKMeans(n_init=100, n_clusters=min(n_clusters, X_all.shape[0]))

# cluster the representatives by junction length
lengths = X['junction_length'].values
kmeans.fit(lengths[idxs].reshape(-1, 1))
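
Because MiniBatchKMeans can drop clusters, it is worth checking how many actually survived and how the representatives are distributed among them. A quick inspection could be:

In [ ]:
# how many of the requested clusters are actually used, and their sizes
print("clusters used: %d / %d" % (len(np.unique(kmeans.labels_)), kmeans.n_clusters))
print(np.bincount(kmeans.labels_))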

3. Fine-grained group inference

Now we have high-level groups of IGs from which we have to extract clonotypes. Divide the dataset based on the labels extracted from MiniBatchKMeans. For each cluster, find the clonotypes it contains using DBSCAN.

This algorithm allows us to use a custom metric between IGs.

[ADVANCED] To develop a custom metric, see the format of icing.core.distances.distance_dataframe. If you use a custom function, you only need to pass it as the metric parameter of DBSCAN. Note that partial is required whenever the metric takes more than two parameters: to be a valid DBSCAN metric, the function must take ONLY two arguments (the two elements to compare). For this reason, the other arguments are pre-bound with partial in the following example.
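
For illustration, a hypothetical custom metric is sketched below. It is NOT part of ICING: it only assumes what DBSCAN requires, namely that after partial has bound the extra arguments, the remaining callable receives the two samples to compare (here, 1-element arrays holding row indices into X, since that is what we feed to fit_predict).

In [ ]:
# hypothetical custom metric (not ICING's distance_dataframe): a normalized
# Hamming distance on the junction, with a hard penalty for different lengths
def my_distance(df, i, j):
    # DBSCAN passes the two samples to compare: 1-element float arrays
    # holding row indices into `df`
    a, b = df.iloc[int(i[0])], df.iloc[int(j[0])]
    if len(a.junc) != len(b.junc):
        return 1.0
    return sum(c1 != c2 for c1, c2 in zip(a.junc, b.junc)) / float(len(a.junc))

# bind every argument except the two samples, as DBSCAN requires
custom_metric = partial(my_distance, X)
# DBSCAN(metric=custom_metric, algorithm='brute', ...) would then use it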


In [ ]:
dbscan = DBSCAN(min_samples=20, n_jobs=-1, algorithm='brute', eps=0.2,
                metric=partial(distance_dataframe, X, 
                    junction_dist=distances.StringDistance(model='ham'),
                    correct=True, tol=0))

dbscan_labels = np.zeros_like(kmeans.labels_).ravel()
for label in np.unique(kmeans.labels_):
    idx_row = np.where(kmeans.labels_ == label)[0]
    
    X_idx = idxs[idx_row].reshape(-1,1).astype('float64')
    weights_idx = weights[idx_row]
    
    if idx_row.size == 1:
        # a single representative forms its own cluster; nothing to run
        db_labels = np.array([0])
    else:
        db_labels = dbscan.fit_predict(X_idx, sample_weight=weights_idx)

        if len(dbscan.core_sample_indices_) < 1:
            # no core samples found: collapse the whole group into one cluster
            db_labels[:] = 0

        if -1 in db_labels:
            # DBSCAN marked some IGs as noise; assign each of them to the
            # cluster of the nearest core sample
            balltree = BallTree(
                X_idx[dbscan.core_sample_indices_],
                metric=dbscan.metric)
            noise_labels = balltree.query(
                X_idx[db_labels == -1], k=1, return_distance=False).ravel()
            # labels of the core points, indexed by the balltree result
            dbscan_noise_labels = db_labels[
                dbscan.core_sample_indices_][noise_labels]
            db_labels[db_labels == -1] = dbscan_noise_labels
    
    # offset the labels so clusters from different junction-length groups do not collide
    db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
    dbscan_labels[idx_row] = db_labels

labels = dbscan_labels

# propagate the representative labels back to all collapsed sequences
labels_ext = np.zeros(X.shape[0], dtype=int)
labels_ext[idxs] = labels
for i, list_ in enumerate(groups):
    labels_ext[list_] = labels[i]
labels = labels_ext
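
At this point labels holds one clonotype id per input sequence. A quick look at how many clonotypes were inferred and how large they are:

In [ ]:
# distribution of the inferred clonotype sizes
clone_sizes = pd.Series(labels).value_counts()
print("inferred clonotypes: %d" % clone_sizes.shape[0])
print(clone_sizes.head())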

Quickstart


All of the above-mentioned steps are integrated in ICING with a simple call to the class inference.ICINGTwoStep. The following is an example of a working script.


In [ ]:
db_file = '../examples/data/clones_100.100.tab'
correct = True
tolerance = 0
eps = 0.2

X = io.load_dataframe(db_file)[:1000]

# turn the following off if data are real
X['true_clone'] = [x[3] for x in X.sequence_id.str.split('_')] 
true_clones = LabelEncoder().fit_transform(X.true_clone.values)

ii = inference.ICINGTwoStep(
    model='nt', eps=eps, method='dbscan', verbose=True,
    kmeans_params=dict(n_init=100, n_clusters=20),
    dbscan_params=dict(min_samples=20, n_jobs=-1, algorithm='brute',
            metric=partial(distance_dataframe, X,
                junction_dist=StringDistance(model='ham'),
                correct=correct, tol=tolerance)))

tic = time.time()
labels = ii.fit_predict(X)
tac = time.time() - tic
print("\nElapsed time: %.1fs" % tac)

If you want to save the results:


In [ ]:
X['icing_clones (%s)' % '_'.join(('StringDistance', str(eps), str(tolerance),
                                  'corr' if correct else 'nocorr',
                                  "%.4f" % tac))] = labels

X.to_csv(db_file.split('/')[-1] + '_icing.csv')

How good is the result?


In [ ]:
from sklearn import metrics
true_clones = LabelEncoder().fit_transform(X.true_clone.values)

print "FMI: %.5f" % (metrics.fowlkes_mallows_score(true_clones, labels))
print "ARI: %.5f" % (metrics.adjusted_rand_score(true_clones, labels))
print "AMI: %.5f" % (metrics.adjusted_mutual_info_score(true_clones, labels))
print "NMI: %.5f" % (metrics.normalized_mutual_info_score(true_clones, labels))
print "Hom: %.5f" % (metrics.homogeneity_score(true_clones, labels))
print "Com: %.5f" % (metrics.completeness_score(true_clones, labels))
print "Vsc: %.5f" % (metrics.v_measure_score(true_clones, labels))

Is it better or worse than the result obtained by running DBSCAN on all sequences at once, without steps 1 and 2?


In [ ]:
# run DBSCAN directly on the whole dataset: no collapsing, no kmeans pre-grouping
labels = dbscan.fit_predict(np.arange(X.shape[0]).reshape(-1, 1).astype('float64'))

print("FMI: %.5f" % metrics.fowlkes_mallows_score(true_clones, labels))
print("ARI: %.5f" % metrics.adjusted_rand_score(true_clones, labels))
print("AMI: %.5f" % metrics.adjusted_mutual_info_score(true_clones, labels))
print("NMI: %.5f" % metrics.normalized_mutual_info_score(true_clones, labels))
print("Hom: %.5f" % metrics.homogeneity_score(true_clones, labels))
print("Com: %.5f" % metrics.completeness_score(true_clones, labels))
print("Vsc: %.5f" % metrics.v_measure_score(true_clones, labels))

This should be the same (or worse): we reduced the computational workload while maintaining (or improving) the result we would obtain without steps 1 and 2.