In [3]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
# fix random seed for reproducibility
np.random.seed(42)
# See https://github.com/h5py/h5py/issues/712
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
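As a quick sanity check before loading, we can list the datasets stored in the HDF5 file. A minimal sketch, assuming the same path used in the cells below:
# List the datasets in the HDF5 file (the two keys read below should appear)
with pd.HDFStore("data/tcga_target_gtex.h5", mode="r") as store:
    print(store.keys())  # expect '/expression' and '/labels'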
In [4]:
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
X.head()
Out[4]:
In [5]:
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")
Y.head()
Out[5]:
In [6]:
# Load gene sets from downloaded MSigDB gmt file
# KEGG only for now, as it's experimentally curated (vs. computationally derived)
with open("data/c2.cp.kegg.v6.1.symbols.gmt") as f:
    gene_sets = {fields[0]: fields[2:]
                 for fields in (line.strip().split("\t") for line in f)}
print("Loaded {} gene sets".format(len(gene_sets)))
# Drop genes not in X - sort so order is the same as X_pruned.columns
gene_sets = {name: sorted([gene for gene in genes if gene in X.columns.values])
             for name, genes in gene_sets.items()}
# Find the union of all genes across all gene sets in order to filter our input columns
all_gene_set_genes = sorted(list(set().union(*[gene_set for gene_set in gene_sets.values()])))
print("Subsetting to {} genes".format(len(all_gene_set_genes)))
# Prune X to only include genes in the gene sets
X_pruned = X.drop(labels=(set(X.columns) - set(all_gene_set_genes)), axis=1, errors="ignore")
assert X_pruned["TP53"]["TCGA-ZP-A9D4-01"] == X["TP53"]["TCGA-ZP-A9D4-01"]
print("X_pruned shape", X_pruned.shape)
# Make sure the genes are the same and in the same order
assert len(all_gene_set_genes) == len(X_pruned.columns.values)
assert list(X_pruned.columns.values) == all_gene_set_genes
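To get a feel for the parsed structure, a quick spot check of one gene set (whichever key happens to come first) is cheap:
# Spot check: print the size and first few genes of one gene set
name, genes = next(iter(gene_sets.items()))
print("Example gene set {}: {} genes, e.g. {}".format(name, len(genes), genes[:5]))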
In [7]:
# Convert each label column into numerical values for one-hot multi-class training
from sklearn.preprocessing import LabelEncoder
tumor_normal_encoder = LabelEncoder()
Y["tumor_normal_value"] = pd.Series(
tumor_normal_encoder.fit_transform(Y["tumor_normal"]), index=Y.index)
primary_site_encoder = LabelEncoder()
Y["primary_site_value"] = pd.Series(
primary_site_encoder.fit_transform(Y["primary_site"]), index=Y.index)
disease_encoder = LabelEncoder()
Y["disease_value"] = pd.Series(
disease_encoder.fit_transform(Y["disease"]), index=Y.index)
Y.describe(include="all", percentiles=[])
Out[7]:
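A round trip through each encoder's inverse_transform confirms the numeric columns decode back to the original labels. A minimal sketch using the encoders fit above:
# Sanity check: encoded values decode back to the original label strings
assert (tumor_normal_encoder.inverse_transform(Y["tumor_normal_value"]) == Y["tumor_normal"]).all()
assert (primary_site_encoder.inverse_transform(Y["primary_site_value"]) == Y["primary_site"]).all()
assert (disease_encoder.inverse_transform(Y["disease_value"]) == Y["disease"]).all()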
In [8]:
# Create a multi-label one-hot output combining all three classifications
from keras.utils import np_utils
Y_onehot = np.append(
    Y["tumor_normal_value"].values.reshape(Y.shape[0], -1),
    np_utils.to_categorical(Y["primary_site_value"]), axis=1)
Y_onehot = np.append(Y_onehot,
                     np_utils.to_categorical(Y["disease_value"]), axis=1)
print(Y_onehot.shape)
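The compound vector is laid out as [tumor/normal (1 column) | primary site one-hot | disease one-hot], so the slice offsets used below (1 and 1+46) can be derived from the encoders rather than hardcoded. A minimal sketch:
# Derive the slice boundaries of the compound output from the encoders
n_sites = len(primary_site_encoder.classes_)
n_diseases = len(disease_encoder.classes_)
assert Y_onehot.shape[1] == 1 + n_sites + n_diseases
print("tumor/normal at [0], primary site at [1:{0}], disease at [{0}:]".format(1 + n_sites))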
In [9]:
# Spot check the compound one-hot output vector against the original labels
for i in range(0, 20000, 5000):
    y = Y.iloc[i]
    print("{} {} {}".format(y.tumor_normal, y.primary_site, y.disease))
    yo = Y_onehot[i]
    print(tumor_normal_encoder.classes_.tolist()[int(yo[0])],
          primary_site_encoder.classes_.tolist()[np.argmax(yo[1:1+46])],
          disease_encoder.classes_.tolist()[np.argmax(yo[1+46:])])
In [10]:
# Split into stratified training and test sets based on primary site
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X_pruned.values, Y["primary_site_value"]):
    X_train = X_pruned.values[train_index]
    X_test = X_pruned.values[test_index]
    y_train = Y_onehot[train_index]
    y_test = Y_onehot[test_index]
    primary_site_train = Y["primary_site_value"].values[train_index]
    primary_site_test = Y["primary_site_value"].values[test_index]
    disease_train = Y["disease_value"].values[train_index]
    disease_test = Y["disease_value"].values[test_index]
print(X_train.shape, X_test.shape)
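The histograms below show the stratification visually; it can also be verified numerically by comparing per-class frequencies between the splits. A minimal sketch:
# Compare per-class primary site proportions between train and test
n_classes = len(primary_site_encoder.classes_)
train_frac = np.bincount(primary_site_train, minlength=n_classes) / len(primary_site_train)
test_frac = np.bincount(primary_site_test, minlength=n_classes) / len(primary_site_test)
print("Max per-class proportion difference: {:.4f}".format(np.abs(train_frac - test_frac).max()))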
In [11]:
# Let's see how big each class is based on primary site
plt.hist(primary_site_train, alpha=0.5, label='Train')
plt.hist(primary_site_test, alpha=0.5, label='Test')
plt.legend(loc='upper right')
plt.title("Primary Site distribution between train and test")
plt.show()
# Let's see how big each class is based on disease
plt.hist(disease_train, alpha=0.5, label='Train')
plt.hist(disease_test, alpha=0.5, label='Test')
plt.legend(loc='upper right')
plt.title("Disease distribution between train and test")
plt.show()
In [12]:
%%time
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers
classify = [
    InputLayer(input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(128, activity_regularizer=regularizers.l1(1e-5), activation='relu'),
    Dropout(0.5),
    # Sigmoid output since each sample carries multiple labels (multi-label)
    Dense(y_train.shape[1], activation='sigmoid')
]
model = Sequential(classify)
print(model.summary())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
callbacks = [EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max")]
model.fit(X_train, y_train, epochs=10, batch_size=128, shuffle="batch", callbacks=callbacks)
print(model.metrics_names, model.evaluate(X_test, y_test))
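Since the compound output mixes one binary unit with two one-hot blocks, decoding is easy to get wrong. The helper below is a sketch (decode_prediction is not part of the original notebook; 46 is the number of primary site classes in this dataset) that slices a prediction into its three heads:
# Hypothetical helper: split one compound prediction into its three label heads
def decode_prediction(p, n_sites=46):
    tumor_normal = tumor_normal_encoder.classes_[int(round(p[0]))]
    primary_site = primary_site_encoder.classes_[np.argmax(p[1:1 + n_sites])]
    disease = disease_encoder.classes_[np.argmax(p[1 + n_sites:])]
    return tumor_normal, primary_site, disease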
In [13]:
# Save the model and label encodings for separate inference
os.makedirs("models", exist_ok=True)
with open("models/multi_label.params.json", "w") as f:
    f.write(json.dumps({
        "tumor_normal": tumor_normal_encoder.classes_.tolist(),
        "primary_site": primary_site_encoder.classes_.tolist(),
        "disease": disease_encoder.classes_.tolist(),
        "genes": all_gene_set_genes}))
with open("models/multi_label.model.json", "w") as f:
    f.write(model.to_json())
model.save_weights("models/multi_label.weights.h5")
In [14]:
# Load the model and predict the test set
model = keras.models.model_from_json(open("models/multi_label.model.json").read())
model.load_weights("models/multi_label.weights.h5")
params = json.loads(open("models/multi_label.params.json").read())
predictions = model.predict(X_test)
In [15]:
# Plot the distribution of predicted tumor/normal probabilities
plt.hist(predictions[:,0])
plt.show()
In [16]:
# Plot the confusion matrix for each classification
from sklearn.metrics import confusion_matrix
print("Tumor/Normal")
print(confusion_matrix(y_test[:,0], np.round(predictions[:,0])))
print("Primary Site")
plt.matshow(confusion_matrix(primary_site_test,
                             np.array([np.argmax(p) for p in predictions[:,1:1+46]])),
            cmap='jet')
plt.title("Primary Site Prediction Confusion Matrix")
plt.show()
plt.matshow(confusion_matrix(disease_test,
                             np.array([np.argmax(p) for p in predictions[:,1+46:]])),
            cmap='jet')
plt.title("Disease Prediction Confusion Matrix")
plt.show()
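Beyond confusion matrices, sklearn's classification_report gives per-class precision and recall. A minimal sketch for the primary site head (passing labels explicitly in case a class is absent from the test split):
# Per-class precision/recall for the primary site head
from sklearn.metrics import classification_report
primary_site_pred = np.argmax(predictions[:, 1:1+46], axis=1)
print(classification_report(primary_site_test, primary_site_pred,
                            labels=np.arange(len(primary_site_encoder.classes_)),
                            target_names=primary_site_encoder.classes_))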
In [17]:
# Let's eyeball a few predictions against the original labels
for i in range(0, 20000, 2000):
    print("Original: {} {} {}".format(Y.iloc[i].tumor_normal, Y.iloc[i].primary_site, Y.iloc[i].disease))
    prediction = model.predict(X_pruned.values[i:i+1])[0]
    print("Predicted:",
          tumor_normal_encoder.classes_.tolist()[int(round(prediction[0]))],
          primary_site_encoder.classes_.tolist()[np.argmax(prediction[1:1+46])],
          disease_encoder.classes_.tolist()[np.argmax(prediction[1+46:])])
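For a bulk view rather than spot checks, the whole test set can be decoded into a table using the hypothetical decode_prediction helper sketched earlier:
# Decode every test-set prediction into a readable DataFrame (sketch)
decoded = pd.DataFrame([decode_prediction(p) for p in predictions],
                       columns=["tumor_normal", "primary_site", "disease"])
print(decoded.head())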