In this example, we use CellCnn to analyze a mass cytometry dataset acquired to characterize human natural killer (NK) cell diversity and associate NK cell subsets with genetic and environmental factors, namely prior Cytomegalovirus (CMV) infection [1]. This dataset comprises mass cytometry measurements of 36 markers, including 28 NK cell receptors, for PBMC samples of 20 donors with varying serology for CMV.

We will train CellCnn to identify CMV seropositivity-associated cell populations from the ungated data (after removal of dead cells and doublets). To run this example, please download the NK cell dataset and place the decompressed folder in the cellCnn/examples directory.

[1] Horowitz, A. et al. Genetic and environmental determinants of human NK cell diversity revealed by mass cytometry. Sci. Transl. Med. 5 (2013).



In [1]:

    
import os, sys, errno, glob
import numpy as np
import pandas as pd

import cellCnn
from cellCnn.utils import loadFCS, ftrans, mkdir_p, get_items
from cellCnn.model import CellCnn
from cellCnn.plotting import plot_results
from sklearn.metrics import roc_auc_score

%pylab inline









    



Using Theano backend.






    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
# define input and output directories
WDIR = os.path.join(cellCnn.__path__[0], 'examples')
FCS_DATA_PATH = os.path.join(WDIR, 'NK_cell_dataset', 'gated_alive')

# define output directory
OUTDIR = os.path.join(WDIR, 'output_NK_ungated')
mkdir_p(OUTDIR)



In [4]:

    
# look at the measured markers
data_fcs = loadFCS(glob.glob(FCS_DATA_PATH + '/*.fcs')[0], transform=None, auto_comp=False)
print data_fcs.channels









    



[u'Time', u'Cell_length', u'CD3', u'Dead', u'(La139)Dd', u'CD27', u'CD19', u'CD4', u'CD8', u'CD57', u'2DL1-S1', u'TRAIL', u'2DL2-L3-S2', u'CD16', u'CD10', u'3DL1-S1', u'CD117', u'2DS4', u'ILT2-CD85j', u'NKp46', u'NKG2D', u'NKG2C', u'2B4', u'CD33', u'CD11b', u'NKp30', u'CD122', u'3DL1', u'NKp44', u'CD127', u'2DL1', u'CD94', u'CD34', u'CCR7', u'2DL3', u'NKG2A', u'HLA-DR', u'2DL4', u'CD56', u'2DL5', u'CD25', u'DNA1', u'DNA2']



In [5]:

    
# select the relevant markers for further analysis
markers = ['CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2',
           'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C',
           '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94',
           'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25']
marker_idx = [data_fcs.channels.index(label) for label in markers]
nmark = len(markers)



In [6]:

    
# load the sample names and corresponding labels (0: CMV-, 1: CMV+), here from a CSV file
# prior CMV infection status is obtained from the original study (Horowitz et al. 2013)
csv_file = 'ungated_NK_fcs_samples_with_labels.csv'
fcs_info = np.array(pd.read_csv(csv_file, sep=','))
sample_ids = fcs_info[:, 0]
sample_labels = fcs_info[:, 1].astype(int)



In [7]:

    
# set random seed for reproducible results
np.random.seed(12345)

# cofactor for arcsinh transformation
cofactor = 5

# split the fcs files into training, validation and test set
group1 = np.where(sample_labels == 0)[0]
group2 = np.where(sample_labels == 1)[0]
l1, l2 = len(group1), len(group2)
ntrain_per_class = 7
ntest_group1 = l1 - ntrain_per_class
ntest_group2 = l2 - ntrain_per_class

# get the sample indices
train_idx1 = list(np.random.choice(group1, size=ntrain_per_class, replace=False))
test_idx1 = [i for i in group1 if i not in train_idx1]
train_idx2 = list(np.random.choice(group2, size=ntrain_per_class, replace=False))
test_idx2 = [i for i in group2 if i not in train_idx2]

# load the training samples
group1_list, group2_list = [], []
for idx in train_idx1:
    fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
    x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
    x = ftrans(x_full[:,marker_idx], cofactor)
    group1_list.append(x)

for idx in train_idx2:
    fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
    x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
    x = ftrans(x_full[:,marker_idx], cofactor)
    group2_list.append(x)

# load the test samples
t_group1_list, t_group2_list = [], []
test_phenotypes = []
for idx in test_idx1:
    fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
    x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
    x = ftrans(x_full[:,marker_idx], cofactor)
    t_group1_list.append(x)
    test_phenotypes.append(0)

for idx in test_idx2:
    fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
    x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
    x = ftrans(x_full[:,marker_idx], cofactor)
    t_group2_list.append(x)
    test_phenotypes.append(1)

# finally prepare training and vallidation data
cut = int(.8 * len(group1_list))
train_samples = group1_list[:cut] + group2_list[:cut]
train_phenotypes = [0] * len(group1_list[:cut]) + [1] * len(group2_list[:cut])
valid_samples = group1_list[cut:] + group2_list[cut:]
valid_phenotypes = [0] * len(group1_list[cut:]) + [1] * len(group2_list[cut:])
test_samples = t_group1_list + t_group2_list



In [9]:

    
# run a CellCnn analysis

model = CellCnn(ncell=3000, nsubset=1000, maxpool_percentages=[0.5, 1], verbose=0)
model.fit(train_samples=train_samples, train_phenotypes=train_phenotypes,
          valid_samples=valid_samples, valid_phenotypes=valid_phenotypes, outdir=OUTDIR)









    



Generating multi-cell inputs...
Done.
Number of filters: 4
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.78
Number of filters: 6
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.85
Number of filters: 7
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.75
Number of filters: 4
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.75
Number of filters: 7
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.78
Number of filters: 3
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.75
Number of filters: 6
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.77
Number of filters: 3
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.75
Number of filters: 5
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.78
Number of filters: 6
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.79
Number of filters: 9
Cells pooled: 15
2000/2000 [==============================] - 3s     
Best validation accuracy: 0.77
Number of filters: 5
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.90
Number of filters: 9
Cells pooled: 15
2000/2000 [==============================] - 3s     
Best validation accuracy: 0.80
Number of filters: 5
Cells pooled: 30
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.79
Number of filters: 6
Cells pooled: 15
2000/2000 [==============================] - 2s     
Best validation accuracy: 0.80






    Out[9]:





<cellCnn.model.CellCnn at 0x119713150>



In [11]:

    
# now make predictions using the trained model
train_pred = model.predict(train_samples)
valid_pred = model.predict(valid_samples)
test_pred = model.predict(test_samples)
print train_pred, train_phenotypes
print valid_pred, valid_phenotypes
print test_pred, test_phenotypes

# calculate area under the ROC curve
train_auc = roc_auc_score(train_phenotypes, train_pred[:,1])
valid_auc = roc_auc_score(valid_phenotypes, valid_pred[:,1])
test_auc = roc_auc_score(test_phenotypes, test_pred[:,1])
print train_auc, valid_auc, test_auc









    



Predictions based on multi-cell inputs containing 82324 cells.
Predictions based on multi-cell inputs containing 92403 cells.
Predictions based on multi-cell inputs containing 90075 cells.
[[ 0.74422669  0.25577331]
 [ 0.68398641  0.31601359]
 [ 0.77945113  0.22054886]
 [ 0.77746236  0.22253765]
 [ 0.83990906  0.16009095]
 [ 0.18527946  0.81472053]
 [ 0.32901263  0.67098737]
 [ 0.03633906  0.96366096]
 [ 0.10460788  0.89539214]
 [ 0.04876866  0.95123134]] [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
[[ 0.8363443   0.1636557 ]
 [ 0.64203864  0.35796137]
 [ 0.06598661  0.93401339]
 [ 0.49698047  0.50301952]] [0, 0, 1, 1]
[[ 0.68328202  0.31671797]
 [ 0.78654806  0.21345194]
 [ 0.52160385  0.47839614]
 [ 0.55448604  0.44551396]
 [ 0.08702733  0.91297267]
 [ 0.33864736  0.66135265]] [0, 0, 0, 0, 1, 1]
1.0 1.0 1.0



In [13]:

    
# plot the results of the CellCnn analysis in the output directory
plot_results(model.results, test_samples, test_phenotypes,
             markers, OUTDIR, filter_response_thres=0.2,
             group_a='CMV-', group_b='CMV+')









    



Loading the weights of consensus filters.






    Out[13]:





[(0, 0.2)]






    





<matplotlib.figure.Figure at 0x117b330d0>






    





<matplotlib.figure.Figure at 0x2797a8650>






    





<matplotlib.figure.Figure at 0x223df3d50>



In [ ]: