Classifying Tumor vs. Normal from Gene Expression

Explore whether a deep neural network can be trained as a tumor/normal binary classifier using only the Toil-recomputed TCGA, TARGET and GTEx expression data (see ingest.ipynb for details on these datasets).


In [1]:
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# Fix random seeds for reproducibility. Seeding numpy alone is not enough:
# Keras draws weight initializations via numpy, but TensorFlow ops use the
# graph-level TF seed, and some utilities use Python's `random`. Seed all
# three so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)  # TF1.x graph-level seed (this notebook predates TF2)


/opt/conda/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

In [2]:
%%time
# Load training set
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")


CPU times: user 240 ms, sys: 2.96 s, total: 3.2 s
Wall time: 3.21 s

In [3]:
# Convert tumor_normal and primary_site into numerical values for two-hot multi-class training
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column into integer codes (LabelEncoder assigns
# codes in sorted-label order), keeping the original string columns alongside
# the new *_value columns.
for source_column in ("tumor_normal", "primary_site"):
    codes = LabelEncoder().fit_transform(Y[source_column])
    Y[source_column + "_value"] = pd.Series(codes, index=Y.index)

Y.describe(include="all", percentiles=[])


Out[3]:
category disease primary_site sample_type gender study tumor_normal tumor_normal_value primary_site_value
count 19126 19126 19126 19126 19126 19126 19126 19126.000000 19126.000000
unique 93 93 46 16 3 3 2 NaN NaN
top Breast Invasive Carcinoma Breast Invasive Carcinoma Brain Primary Tumor Male TCGA Tumor NaN NaN
freq 1212 1212 1846 9185 10453 10534 10530 NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN 0.550559 20.651992
std NaN NaN NaN NaN NaN NaN NaN 0.497450 12.419634
min NaN NaN NaN NaN NaN NaN NaN 0.000000 0.000000
50% NaN NaN NaN NaN NaN NaN NaN 1.000000 19.000000
max NaN NaN NaN NaN NaN NaN NaN 1.000000 45.000000

In [4]:
# Split into stratified training and test sets based on classes (i.e. tissue type) so that we have equal
# proportions of each tissue type in the train and test sets
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits=1, so take the single (train, test) index pair directly rather
# than looping over the generator.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(X.values, Y["primary_site_value"]))

X_train, X_test = X.values[train_index], X.values[test_index]
# Use .iloc for explicit positional indexing: Y is indexed by sample id, and
# plain Series[int_array] relies on a label/position fallback that is
# deprecated in pandas.
y_train, y_test = Y["tumor_normal_value"].iloc[train_index], Y["tumor_normal_value"].iloc[test_index]
classes_train, classes_test = Y["primary_site_value"].values[train_index], Y["primary_site_value"].values[test_index]

print(X_train.shape, X_test.shape)


(15300, 58581) (3826, 58581)

In [8]:
"""
Batch normalization with a sparse layer.
"""
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

classify = [
    InputLayer(input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    
    Dense(128),
    BatchNormalization(),
    Activation('relu'),

    Dense(64, activity_regularizer=regularizers.l1(1e-5)),
    BatchNormalization(),
    Activation('relu'),
    
    Dense(1),
    Activation('sigmoid')
]

model = Sequential(classify)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

callbacks=[EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max")]

model.fit(X_train, y_train, epochs=5, batch_size=128, shuffle="batch", callbacks=callbacks)

print(model.metrics_names, model.evaluate(X_test, y_test))


Epoch 1/5
15300/15300 [==============================] - 166s 11ms/step - loss: 0.1050 - acc: 0.9761
Epoch 2/5
15300/15300 [==============================] - 49s 3ms/step - loss: 0.0350 - acc: 0.9956
Epoch 3/5
15300/15300 [==============================] - 45s 3ms/step - loss: 0.0220 - acc: 0.9992
Epoch 00003: early stopping
3826/3826 [==============================] - 3s 851us/step
['loss', 'acc'] [0.020764561297719586, 0.9968635650810246]