Classifying Tumor vs. Normal from Gene Expression

Explore whether a deep neural network can be trained as a tumor/normal binary classifier using only the Toil-recomputed TCGA, TARGET and GTEx expression data (see ingest.ipynb for details on these datasets).


In [1]:
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# Fix random seeds for reproducibility. Seeding numpy alone is not enough:
# Keras draws weight initializations via numpy, but TensorFlow ops use the
# graph-level TF seed, and some utilities use Python's `random`. Seed all
# three so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)  # TF1.x graph-level seed (this notebook predates TF2)


/opt/conda/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

In [2]:
%%time
# Load training set
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")


CPU times: user 240 ms, sys: 2.96 s, total: 3.2 s
Wall time: 3.21 s

In [3]:
# Convert tumor_normal and primary_site into numerical values for two-hot multi-class training
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column into integer codes (LabelEncoder assigns
# codes in sorted-label order), keeping the original string columns alongside
# the new *_value columns.
for source_column in ("tumor_normal", "primary_site"):
    codes = LabelEncoder().fit_transform(Y[source_column])
    Y[source_column + "_value"] = pd.Series(codes, index=Y.index)

Y.describe(include="all", percentiles=[])


Out[3]:
category disease primary_site sample_type gender study tumor_normal tumor_normal_value primary_site_value
count 19126 19126 19126 19126 19126 19126 19126 19126.000000 19126.000000
unique 93 93 46 16 3 3 2 NaN NaN
top Breast Invasive Carcinoma Breast Invasive Carcinoma Brain Primary Tumor Male TCGA Tumor NaN NaN
freq 1212 1212 1846 9185 10453 10534 10530 NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN 0.550559 20.651992
std NaN NaN NaN NaN NaN NaN NaN 0.497450 12.419634
min NaN NaN NaN NaN NaN NaN NaN 0.000000 0.000000
50% NaN NaN NaN NaN NaN NaN NaN 1.000000 19.000000
max NaN NaN NaN NaN NaN NaN NaN 1.000000 45.000000

In [4]:
# Split into stratified training and test sets based on classes (i.e. tissue type) so that we have equal
# proportions of each tissue type in the train and test sets
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits=1, so take the single (train, test) index pair directly rather
# than looping over the generator.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(X.values, Y["primary_site_value"]))

X_train, X_test = X.values[train_index], X.values[test_index]
# Use .iloc for explicit positional indexing: Y is indexed by sample id, and
# plain Series[int_array] relies on a label/position fallback that is
# deprecated in pandas.
y_train, y_test = Y["tumor_normal_value"].iloc[train_index], Y["tumor_normal_value"].iloc[test_index]
classes_train, classes_test = Y["primary_site_value"].values[train_index], Y["primary_site_value"].values[test_index]

print(X_train.shape, X_test.shape)


(15300, 58581) (3826, 58581)

In [8]:
"""
Batch normalization with a sparse layer.
"""
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

classify = [
    InputLayer(input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    
    Dense(128),
    BatchNormalization(),
    Activation('relu'),

    Dense(64, activity_regularizer=regularizers.l1(1e-5)),
    BatchNormalization(),
    Activation('relu'),
    
    Dense(1),
    Activation('sigmoid')
]

model = Sequential(classify)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

callbacks=[EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max")]

model.fit(X_train, y_train, epochs=5, batch_size=128, shuffle="batch", callbacks=callbacks)

print(model.metrics_names, model.evaluate(X_test, y_test))


Epoch 1/5
15300/15300 [==============================] - 166s 11ms/step - loss: 0.1050 - acc: 0.9761
Epoch 2/5
15300/15300 [==============================] - 49s 3ms/step - loss: 0.0350 - acc: 0.9956
Epoch 3/5
15300/15300 [==============================] - 45s 3ms/step - loss: 0.0220 - acc: 0.9992
Epoch 00003: early stopping
3826/3826 [==============================] - 3s 851us/step
['loss', 'acc'] [0.020764561297719586, 0.9968635650810246]