Use gene sets from MSigDB both to prune the number of genes/features and as a source of pathway information to incorporate into the layer design.
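For reference, each line of an MSigDB .gmt file is tab-separated: the gene set name, a description/URL field, and then the member gene symbols, which is why the parser below keeps field 0 as the key and skips to field 2 for the genes. A representative line (gene list truncated, tabs shown as wide gaps) looks roughly like:

KEGG_ABC_TRANSPORTERS    http://www.broadinstitute.org/gsea/msigdb/cards/KEGG_ABC_TRANSPORTERS    ABCA1    ABCA10    ABCA12    ...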
In [1]:
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
# fix random seed for reproducibility
np.random.seed(42)
# See https://github.com/h5py/h5py/issues/712
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
In [2]:
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")
In [3]:
# Load gene sets from downloaded MSigDB gmt file
# KEGG only for now, as it is experimentally curated (vs. computationally derived)
with open("data/c2.cp.kegg.v6.1.symbols.gmt") as f:
    gene_sets = {line.strip().split("\t")[0]: line.strip().split("\t")[2:]
                 for line in f.readlines()}
print("Loaded {} gene sets".format(len(gene_sets)))
# Drop genes not in X - sort so order is the same as X_pruned.columns
gene_sets = {name: sorted([gene for gene in genes if gene in X.columns.values])
             for name, genes in gene_sets.items()}
# Find the union of all genes across all gene sets in order to filter our input columns
all_gene_set_genes = sorted(list(set().union(*[gene_set for gene_set in gene_sets.values()])))
print("Subsetting to {} genes".format(len(all_gene_set_genes)))
# Prune X to only include genes in the gene sets
X_pruned = X.drop(labels=(set(X.columns) - set(all_gene_set_genes)), axis=1, errors="ignore")
assert X_pruned["TP53"]["TCGA-ZP-A9D4-01"] == X["TP53"]["TCGA-ZP-A9D4-01"]
print("X_pruned shape", X_pruned.shape)
# Make sure the genes are the same and in the same order
assert len(all_gene_set_genes) == len(X_pruned.columns.values)
assert list(X_pruned.columns.values) == all_gene_set_genes
In [4]:
# Convert each label column into numerical values for one-hot multi-class training
from sklearn.preprocessing import LabelEncoder
tumor_normal_encoder = LabelEncoder()
Y["tumor_normal_value"] = pd.Series(
tumor_normal_encoder.fit_transform(Y["tumor_normal"]), index=Y.index)
primary_site_encoder = LabelEncoder()
Y["primary_site_value"] = pd.Series(
primary_site_encoder.fit_transform(Y["primary_site"]), index=Y.index)
disease_encoder = LabelEncoder()
Y["disease_value"] = pd.Series(
disease_encoder.fit_transform(Y["disease"]), index=Y.index)
Y.describe(include="all", percentiles=[])
Out[4]:
In [5]:
# Create a multi-label one-hot output combining all three classifications
from keras.utils import np_utils
Y_onehot = np.append(
    Y["tumor_normal_value"].values.reshape(Y.shape[0], -1),
    np_utils.to_categorical(Y["primary_site_value"]), axis=1)
Y_onehot = np.append(Y_onehot,
                     np_utils.to_categorical(Y["disease_value"]), axis=1)
print(Y_onehot.shape)
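# Sanity check on the resulting layout (the hardcoded 46 used in later cells
# is len(primary_site_encoder.classes_)): column 0 is tumor/normal, the next
# block is the primary site one-hot, and the remainder is the disease one-hot
n_sites = len(primary_site_encoder.classes_)
n_diseases = len(disease_encoder.classes_)
assert Y_onehot.shape[1] == 1 + n_sites + n_diseases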
In [6]:
# Split into stratified training and test sets based on primary site
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X_pruned.values, Y["primary_site_value"]):
    X_train = X_pruned.values[train_index]
    X_test = X_pruned.values[test_index]
    Y_train = Y.iloc[train_index]
    Y_test = Y.iloc[test_index]
    y_train = Y_onehot[train_index]
    y_test = Y_onehot[test_index]
    primary_site_train = Y["primary_site_value"].values[train_index]
    primary_site_test = Y["primary_site_value"].values[test_index]
    disease_train = Y["disease_value"].values[train_index]
    disease_test = Y["disease_value"].values[test_index]
print(X_train.shape, X_test.shape)
In [7]:
# Let's see how big each class is based on primary site
plt.hist(primary_site_train, alpha=0.5, label='Train')
plt.hist(primary_site_test, alpha=0.5, label='Test')
plt.legend(loc='upper right')
plt.title("Primary Site distribution between train and test")
plt.show()
# Let's see how big each class is based on disease
plt.hist(disease_train, alpha=0.5, label='Train')
plt.hist(disease_test, alpha=0.5, label='Test')
plt.legend(loc='upper right')
plt.title("Disease distribution between train and test")
plt.show()
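# Since the split was stratified on primary site, the train and test class
# proportions should match closely (a quick numeric check to complement the plots):
train_frac = pd.Series(primary_site_train).value_counts(normalize=True)
test_frac = pd.Series(primary_site_test).value_counts(normalize=True)
print("Max primary-site proportion gap:", (train_frac - test_frac).abs().max())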
For each pathway, build a custom input layer that extracts the expression levels for the genes in that pathway from the full input vector and feeds them into a dense sub-network ending in a single output neuron. These per-pathway outputs are then aggregated and fed into a standard set of stacked layers trained to classify tumor/normal, primary site, and disease. The hidden per-pathway neurons are named after their pathway and 'may' indicate which pathways led to a given classification.
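Schematically, the wiring built in the next cell looks like this (a sketch, with layer widths taken from the code below):

main_input (all genes) -> BatchNormalization
  -> GroupBy: one slice of columns per pathway
  -> per pathway: Dense(max(2, n_genes//4), relu) -> Dense(1, relu, name=pathway)
  -> concatenate all single-neuron pathway outputs into the "pathways" vector
  -> Dense(128, relu) -> Dropout(0.5) -> Dense(128, relu, L1 activity regularizer) -> Dropout(0.5)
  -> Dense(n_outputs, sigmoid)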
In [8]:
# Group genes by pathway
gene_groups = {name: np.searchsorted(X_pruned.columns.values, genes)
               for name, genes in gene_sets.items()}
print("Pathway KEGG_ABC_TRANSPORTERS Gene Indexes:", gene_groups["KEGG_ABC_TRANSPORTERS"])
# DEBUGGING: Prune to just 4 to speed up and debug everything from here on down
# gene_groups = {k: gene_groups[k] for k in list(gene_groups)[:4]}
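# Sanity check: np.searchsorted only yields correct positions because
# X_pruned.columns is sorted and contains every pathway gene (asserted earlier)
idx = gene_groups["KEGG_ABC_TRANSPORTERS"]
assert list(X_pruned.columns.values[idx]) == gene_sets["KEGG_ABC_TRANSPORTERS"]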
In [9]:
%%time
"""
Build pathways using custom Keras Layer
"""
from keras.models import Model, Sequential
from keras.layers import Input, Lambda, Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.layers.merge import concatenate
from keras import backend as K
from keras.engine.topology import Layer
class GroupBy(Layer):
    """
    Subset input features into multiple groups that may overlap
    groups: dict mapping each group name to a list of input feature indices
    """
    def __init__(self, groups, **kwargs):
        super(GroupBy, self).__init__(**kwargs)
        self.groups = groups

    def build(self, input_shape):
        super(GroupBy, self).build(input_shape)

    def call(self, x):
        return [K.concatenate([x[:, i:i+1] for i in indexes])
                for _, indexes in self.groups.items()]

    def compute_output_shape(self, input_shape):
        return [(None, len(indexes)) for _, indexes in self.groups.items()]
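# Standalone smoke test of GroupBy on toy data (hypothetical example; uncomment
# to run): two overlapping groups over 4 features should come back as a list
# of arrays with shapes (2, 2) and (2, 3)
# toy_in = Input(shape=(4,))
# toy_out = GroupBy({"a": [0, 1], "b": [1, 2, 3]})(toy_in)
# print([p.shape for p in Model(toy_in, toy_out).predict(np.arange(8.0).reshape(2, 4))])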
main_input = Input(shape=(X_train.shape[1],), name="main_input")
x = main_input
x = BatchNormalization()(x)
"""
Build per pathway sub-networks
"""
x = GroupBy(gene_groups)(x)
# Add a dense layer per pathway with width proportional to the number of genes in the pathway
x = [Dense(max(2, len(i)//4), activation='relu')(p)
     for p, i in zip(x, gene_groups.values())]
# Add a named single-neuron output for each pathway
x = [Dense(1, activation='relu', name=name)(p)
     for p, name in zip(x, gene_groups.keys())]
# Concatenate the sub-network outputs back into a single vector
x = keras.layers.concatenate(x, name="pathways")
"""
Add traditional stacked network for final multi-label classification
"""
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(128, activity_regularizer=regularizers.l1(1e-5),
          activation='relu')(x)
x = Dropout(0.5)(x)
main_output = Dense(y_train.shape[1], activation='sigmoid')(x)
model = Model(inputs=[main_input], outputs=[main_output])
# print(model.summary()) # Too detailed when building full set of pathways
print("Trainable params::", np.sum(
[np.product(v.shape) for v in model.trainable_weights]).value)
model.compile(loss='binary_crossentropy',
optimizer='adam', metrics=['accuracy'])
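# Equivalent count using the Keras backend helper (a sketch; K was imported above):
# print("Trainable params:", int(np.sum([K.count_params(w) for w in model.trainable_weights])))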
In [10]:
# %%time
# """
# Build pathways using a Lambda layer (an alternative to the custom GroupBy
# layer above; note: gene_set_indexes below corresponds to gene_groups)
# """
# from keras.models import Model, Sequential
# from keras.layers import Input, Lambda, Dense, BatchNormalization, Dropout
# from keras.callbacks import EarlyStopping
# from keras import regularizers
# from keras.layers.merge import concatenate
# from keras import backend as K
# import itertools
# main_input = Input(shape=(X_train.shape[1],), name="main_input")
# x = main_input
# x = BatchNormalization()(x)
# """
# Build per pathway sub-networks
# """
# # Set to a small number (~4) for debugging, set to None to build all pathways
# # max_num_pathways = 4
# max_num_pathways = None
# # Extract features/gene's for each pathway from the aggregate x input vector
# x = [Lambda(lambda e: K.concatenate([e[:, i:i+1] for i in indexes]))(x)
#      for name, indexes in itertools.islice(gene_set_indexes.items(), max_num_pathways)]
# # Add a dense layer per pathway with width proportional to the number of genes in the pathway
# x = [Dense(max(2, len(i)//4), activation='relu')(p)
#      for p, i in zip(x, gene_set_indexes.values())]
# # Add a named single-neuron output for each pathway
# x = [Dense(1, activation='relu', name=name)(p)
#      for p, name in zip(x, gene_set_indexes.keys())]
# # Concatenate binary outputs of each of sub-networks back into single vector
# x = keras.layers.concatenate(x, name="pathways")
# """
# Add traditional stacked network for final multi-label classification
# """
# x = Dense(128, activation='relu')(x)
# x = Dropout(0.5)(x)
# x = Dense(128, activity_regularizer=regularizers.l1(1e-5), activation='relu')(x)
# x = Dropout(0.5)(x)
# main_output = Dense(y_train.shape[1], activation='sigmoid')(x)
# model = Model(inputs=[main_input], outputs=[main_output])
# # print(model.summary()) # Too detailed when building full set of pathways
# print("Trainable params::", np.sum(
# [np.product(v.shape) for v in model.trainable_weights]).value)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
In [11]:
%%time
callbacks = [EarlyStopping(monitor='acc', min_delta=0.05, patience=2, verbose=2, mode="max")]
model.fit(X_train, y_train, epochs=10, batch_size=128, shuffle="batch", callbacks=callbacks)
print(model.metrics_names, model.evaluate(X_test, y_test))
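# The single 'accuracy' above is element-wise over the whole concatenated output
# vector; a per-head breakdown is more informative (a sketch, assuming the layout
# [0]=tumor/normal, [1:47]=primary site, [47:]=disease built in cell In [5]):
preds = model.predict(X_test)
print("tumor/normal acc:", np.mean((preds[:, 0] > 0.5) == (y_test[:, 0] > 0.5)))
print("primary site acc:", np.mean(preds[:, 1:47].argmax(axis=1) == y_test[:, 1:47].argmax(axis=1)))
print("disease acc:", np.mean(preds[:, 47:].argmax(axis=1) == y_test[:, 47:].argmax(axis=1)))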
In [12]:
%%time
# Predict all test samples
predictions = model.predict(X_test)
# Build a model to access the activations of the pathway layer
pathway_model = Model(inputs=model.layers[0].input, outputs=model.get_layer("pathways").output)
pathway_predictions = pathway_model.predict(X_test)
In [13]:
# First let's look at the distribution of our binary tumor/normal predictions
plt.hist(predictions[:,0])
plt.show()
In [14]:
# Create a new dataframe of the original and predicted test labels
Y_test_predictions = Y_test.copy()
Y_test_predictions["predicted_tumor_normal"] = [
    "Normal ({:0.2f})".format(p) if p < 0.5 else "Tumor ({:0.2f})".format(p)
    for p in predictions[:, 0]]
# Top 3 primary sites (columns 1..46 of the output are the primary site block)
labels = primary_site_encoder.classes_.tolist()
Y_test_predictions["predicted_primary_site"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[1:1+46][i])
               for i in p[1:1+46].argsort()[-3:][::-1]])
    for i, p in enumerate(predictions)]
# Top 3 diseases (the remaining columns; note [1+46:] rather than [1+46:-1],
# which would silently drop the last disease class)
labels = disease_encoder.classes_.tolist()
Y_test_predictions["predicted_disease"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[1+46:][i])
               for i in p[1+46:].argsort()[-3:][::-1]])
    for i, p in enumerate(predictions)]
# Top 3 pathway activations
labels = list(gene_sets.keys())
Y_test_predictions["predicted_pathways"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[i]) for i in p.argsort()[-3:][::-1]])
    for i, p in enumerate(pathway_predictions)]
Y_test_predictions.to_csv("models/Y_test_predictions.tsv", sep="\t")
Y_test_predictions.head()
Out[14]:
In [15]:
# Plot confusion matrix for primary site
import sklearn.metrics
import matplotlib.ticker as ticker
confusion_matrix = sklearn.metrics.confusion_matrix(
    primary_site_test, np.array([np.argmax(p[1:1+46]) for p in predictions]))
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
cax = ax.matshow(confusion_matrix, cmap=plt.cm.gray)
ax.set_xticklabels(primary_site_encoder.classes_.tolist(), rotation=90)
ax.set_yticklabels(primary_site_encoder.classes_.tolist())
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel("Primary Site Confusion Matrix for Holdout Test Data")
plt.show()
In [16]:
# Show only where there are errors
row_sums = confusion_matrix.sum(axis=1, keepdims=True)
norm_conf_mx = confusion_matrix / row_sums
np.fill_diagonal(norm_conf_mx, 0)
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
cax = ax.matshow(norm_conf_mx, cmap=plt.cm.gray)
ax.set_xticklabels(primary_site_encoder.classes_.tolist(), rotation=90)
ax.set_yticklabels(primary_site_encoder.classes_.tolist())
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel("Primary Site Prediction Errors")
plt.show()
In [17]:
# Plot confusion matrix for disease
import sklearn.metrics
import matplotlib.ticker as ticker
# Note: [1+46:] covers the full disease block ([1+46:-1] would drop the last class)
confusion_matrix = sklearn.metrics.confusion_matrix(
    disease_test, np.array([np.argmax(p[1+46:]) for p in predictions]))
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
cax = ax.matshow(confusion_matrix, cmap=plt.cm.gray)
ax.set_xticklabels(disease_encoder.classes_.tolist(), rotation=90)
ax.set_yticklabels(disease_encoder.classes_.tolist())
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel("Disease Confusion Matrix for Holdout Test Data")
plt.show()
In [18]:
# Show only where there are errors
row_sums = confusion_matrix.sum(axis=1, keepdims=True)
norm_conf_mx = confusion_matrix / row_sums
np.fill_diagonal(norm_conf_mx, 0)
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
cax = ax.matshow(norm_conf_mx, cmap=plt.cm.gray)
ax.set_xticklabels(disease_encoder.classes_.tolist(), rotation=90)
ax.set_yticklabels(disease_encoder.classes_.tolist())
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel("Disease Prediction Errors")
plt.show()
In [19]:
# Load and prepare for predicting
X_treehouse = pd.read_hdf("data/treehouse.h5", "expression")
Y_treehouse = pd.read_hdf("data/treehouse.h5", "labels")
X_treehouse_pruned = X_treehouse.drop(labels=(set(X_treehouse.columns) - set(all_gene_set_genes)),
                                      axis=1, errors="ignore")
In [20]:
# Load clinical Treehouse data
import glob
treehouse_clinical = pd.read_csv(
    "/treehouse/archive/compendium/v4/clin.essential.final_v4_20170420.tsv",
    sep="\t", index_col=0)
# Extract top 3 pathways from Treehouse GSEA Top 5 tertiary output
top_gsea_paths = {
    id: sorted(
        glob.glob(
            "/treehouse/archive/downstream/{}/tertiary/treehouse-protocol*/compendium*/gsea_*_top5"
            .format(id)))
    for id in Y_treehouse.index.values}
top_gsea_pathways = pd.DataFrame({
    id: pd.read_csv(path[-1], sep="\t", header=6, nrows=10).sort_values(
        by=["k/K"], axis=0, ascending=False)["Gene Set Name"][0:3].values
    for id, path in top_gsea_paths.items() if path}).T
top_gsea_pathways.head()
Out[20]:
In [21]:
%%time
# Predict all Treehouse samples
predictions = model.predict(X_treehouse_pruned)
# Build a model to access the activations of the pathway layer
pathway_model = Model(inputs=model.layers[0].input, outputs=model.get_layer("pathways").output)
pathway_predictions = pathway_model.predict(X_treehouse_pruned)
In [22]:
# Add predictions to our label dataframe
Y_treehouse["predicted_tumor_normal"] = [
    "Normal ({:0.2f})".format(p) if p < 0.5 else "Tumor ({:0.2f})".format(p)
    for p in predictions[:, 0]]
labels = primary_site_encoder.classes_.tolist()
Y_treehouse["predicted_primary_site"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[1:1+46][i])
               for i in p[1:1+46].argsort()[-3:][::-1]])
    for i, p in enumerate(predictions)]
# As above, [1+46:] covers the full disease block
labels = disease_encoder.classes_.tolist()
Y_treehouse["predicted_disease"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[1+46:][i])
               for i in p[1+46:].argsort()[-3:][::-1]])
    for i, p in enumerate(predictions)]
labels = list(gene_sets.keys())
Y_treehouse["predicted_pathways"] = [
    ", ".join(["{} ({:0.2f})".format(labels[i], p[i]) for i in p.argsort()[-3:][::-1]])
    for i, p in enumerate(pathway_predictions)]
Y_treehouse.head()
Out[22]:
In [23]:
# Merge Treehouse clinical data and top GSEA pathways and write out
Y_treehouse.join(
    [treehouse_clinical.Anatomical_location, top_gsea_pathways]
).to_csv("models/Y_treehouse_predictions.tsv", sep="\t")