In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
# Seed NumPy's global random generator with a fixed value so that
# NumPy-driven randomness repeats from one run of the notebook to the next.
np.random.seed(seed=42)
In [2]:
%%time
# Load training set
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")
In [3]:
# Add integer-coded copies of the categorical label columns "tumor_normal" and
# "primary_site" to Y (as "<column>_value") for two-hot multi-class training.
from sklearn.preprocessing import LabelEncoder

# fit_transform yields one integer code per row, in row order, so the array
# can be stored in the frame directly.
Y["tumor_normal_value"] = LabelEncoder().fit_transform(Y["tumor_normal"])

# `encoder` is left bound to the encoder fitted on primary_site.
encoder = LabelEncoder()
Y["primary_site_value"] = encoder.fit_transform(Y["primary_site"])

# Cell output: summary of every column (object and numeric), with no
# explicitly requested percentiles.
Y.describe(include="all", percentiles=[])
Out[3]:
In [4]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# encoded primary site so that each tissue type appears in the same proportion
# in the training and test sets.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional row indices.
train_index, test_index = next(split.split(X.values, Y["primary_site_value"]))

X_train, X_test = X.values[train_index], X.values[test_index]

# The indices are row positions, so select with .iloc. Plain Series[int_array]
# treats integers as index labels when the index is integer-typed and otherwise
# relies on a deprecated positional fallback. The results remain pandas Series.
y_train, y_test = Y["tumor_normal_value"].iloc[train_index], Y["tumor_normal_value"].iloc[test_index]

# Tissue-type codes for the same rows, as NumPy arrays.
classes_train, classes_test = Y["primary_site_value"].values[train_index], Y["primary_site_value"].values[test_index]

print(X_train.shape, X_test.shape)
In [8]:
"""
Batch normalization with a sparse layer.
"""
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

# Binary classifier for y_train: batch-normalised inputs, two hidden blocks of
# Dense -> BatchNormalization -> ReLU, and a single sigmoid output unit.
classify = [
    InputLayer(input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    # L1 penalty on this layer's outputs (presumably the "sparse layer").
    Dense(64, activity_regularizer=regularizers.l1(1e-5)),
    BatchNormalization(),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid')
]
model = Sequential(classify)

# Request the metric as 'acc' so it is logged under exactly the key that
# EarlyStopping monitors below. Keras >= 2.3 reports a metric under the literal
# string passed to compile(), so compiling with 'accuracy' while monitoring
# 'acc' leaves the callback without its metric and it never stops training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# NOTE(review): this monitors *training* accuracy (fit() gets no validation
# data), and min_delta=0.05 only counts an epoch as an improvement if accuracy
# rises by at least 0.05 -- confirm that such an aggressive stop is intended.
callbacks = [EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max")]

model.fit(X_train, y_train, epochs=5, batch_size=128, shuffle="batch", callbacks=callbacks)

# Report loss and accuracy on the held-out test set.
print(model.metrics_names, model.evaluate(X_test, y_test))