Classify Treehouse

Load models trained in other notebooks and evaluate how well they perform on the Treehouse samples.


In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import matplotlib.pyplot as pyplot
from sklearn import preprocessing

# Seed NumPy's global random number generator so repeated runs are reproducible.
np.random.seed(seed=42)

# Disable HDF5 file locking for this process.
# See https://github.com/h5py/h5py/issues/712
os.environ.update(HDF5_USE_FILE_LOCKING="FALSE")


/opt/conda/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

Load Datasets


In [70]:
%%time
X = pd.read_hdf("data/tcga_target_gtex.h5", "expression")
Y = pd.read_hdf("data/tcga_target_gtex.h5", "labels")

X_treehouse = pd.read_hdf("data/treehouse.h5", "expression")
Y_treehouse = pd.read_hdf("data/treehouse.h5", "labels")


CPU times: user 0 ns, sys: 13.7 s, total: 13.7 s
Wall time: 15.5 s

Primary Site Classifier


In [31]:
# Load the model
# Rebuild the network from its JSON architecture description, then attach the
# saved weights. The files are opened with `with` so the handles are closed
# deterministically instead of being left to the garbage collector.
with open("models/primary_site.model.json") as model_file:
    model = keras.models.model_from_json(model_file.read())
model.load_weights("models/primary_site.weights.h5")

# Parameters saved alongside the model; later cells read params["genes"]
# (used to prune the expression columns) and params["labels"] (used to name
# the predicted classes).
with open("models/primary_site.params.json") as params_file:
    params = json.load(params_file)

In [37]:
# Let's run it on the training set just to make sure we haven't lost something...
# (`preprocessing` is imported from sklearn in the notebook's top import cell.)

# One-hot encode the primary site labels.
# NOTE(review): assumes the class order produced by fitting the binarizer here
# matches the order the model was trained with (params["labels"]) — TODO confirm.
encoder = preprocessing.LabelBinarizer()
y_onehot = encoder.fit_transform(Y.primary_site.values)

# Prune X to only include genes in the gene sets (original column order is kept)
X_pruned = X.loc[:, X.columns.isin(params["genes"])]

# The model was rebuilt from JSON + weights, so it must be compiled before evaluate().
# NOTE(review): with loss='binary_crossentropy', Keras may resolve 'accuracy' to a
# per-output binary accuracy, which can look optimistic for one-hot targets — confirm
# this is the metric that was used at training time.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Returns [loss, accuracy]; left as the last expression so the notebook displays it.
model.evaluate(X_pruned, y_onehot)


19126/19126 [==============================] - 2s 103us/step
Out[37]:
[0.07240446106504723, 0.9799294959211106]

In [38]:
# Now let's try on Treehouse

# Prune X_treehouse to only include genes in the gene sets.
# The mask is built from X_treehouse's own columns: the previous version derived
# the columns to drop from X.columns with errors="ignore", so any Treehouse column
# that was absent from X and not in the gene set was silently kept.
# NOTE(review): the model presumably expects the same genes in the same order as
# X_pruned; this keeps X_treehouse's own column order — verify the two frames agree.
X_treehouse_pruned = X_treehouse.loc[:, X_treehouse.columns.isin(params["genes"])]

In [66]:
# Summarise each sample's prediction as its three highest-scoring classes,
# best first, formatted as "Label(score)" and joined with ", ".
top_predictions = []
for probabilities in model.predict(X_treehouse_pruned):
    # argsort is ascending: take the last three indices, then reverse them
    best_first = probabilities.argsort()[-3:][::-1]
    top_predictions.append(
        ", ".join("{}({:0.2f})".format(params["labels"][i], probabilities[i]) for i in best_first))

# Assigned positionally: row k of the predictions goes to row k of Y_treehouse.
Y_treehouse["primary_site_predicted"] = top_predictions

# Show the first three samples as a spot check
Y_treehouse.primary_site_predicted.head(3)


Out[66]:
id
TH01_0051_S01                Kidney(0.17), Lung(0.15), Brain(0.06)
TH01_0053_S01    White blood cell(0.25), Brain(0.16), Kidney(0.15)
TH01_0054_S01       Lung(0.14), Skin(0.14), White blood cell(0.10)
Name: primary_site_predicted, dtype: object

In [67]:
# Persist the Treehouse labels table (including any columns added above) as a tab-separated file.
Y_treehouse.to_csv(path_or_buf="models/treehouse_predictions.tsv", sep="\t")

Treehouse Pathways

Load the predictions from the pathway model, enrich them with the pathways and disease from the tertiary protocol, and analyze the results.


In [5]:
# Read the tab-separated predictions table.
# NOTE(review): this rebinds `Y`, which the earlier cells use for the training
# labels; re-running those cells after this one will see this table instead.
Y = pd.read_csv(filepath_or_buffer="models/Y_treehouse_predictions.tsv", sep="\t")

# Preview the first rows
Y.head()


Out[5]:
id age_in_years gender disease predicted_tumor_normal predicted_primary_site predicted_disease predicted_pathways
0 TH01_0051_S01 NaN Not Reported Hepatoblastoma Tumor (0.77) Kidney (0.17), Liver (0.10), Lung (0.05) Kidney Clear Cell Carcinoma (0.12), Liver Hepa... KEGG_ABC_TRANSPORTERS (2.85), KEGG_LEUKOCYTE_T...
1 TH01_0053_S01 NaN Not Reported Acute Myeloid Leukemia Tumor (0.68) White blood cell (0.29), Blood (0.22), Lung (0... Acute Myeloid Leukemia (0.22), Whole Blood (0.... KEGG_CELL_ADHESION_MOLECULES_CAMS (3.85), KEGG...
2 TH01_0054_S01 NaN Not Reported Acute Lymphoblastic Leukemia Tumor (0.59) White blood cell (0.29), Blood (0.25), Stomach... Acute Myeloid Leukemia (0.20), Whole Blood (0.... KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS (3.61), ...
3 TH01_0055_S01 NaN Not Reported Glioma Tumor (0.59) Brain (0.64), Kidney (0.05), Esophagus (0.05) Brain Lower Grade Glioma (0.21), Head & Neck S... KEGG_CALCIUM_SIGNALING_PATHWAY (4.15), KEGG_GL...
4 TH01_0061_S01 NaN Not Reported Germ Cell Tumor Tumor (0.77) Kidney (0.14), Lung (0.09), Skin (0.07) Kidney Clear Cell Carcinoma (0.08), Lung Squam... KEGG_BIOSYNTHESIS_OF_UNSATURATED_FATTY_ACIDS (...

In [ ]:


In [67]:
# Look up the tertiary-protocol conf.json for a single Treehouse sample.
# (`glob` and `json` are imported in the notebook's top import cell.)

# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept so anything already relying on it keeps working — consider renaming.
id = "TH01_0051_S01"

# The path pattern is filled in with the sample id defined above. The previous
# version formatted it with `y.id`, but no `y` is defined, so the cell raised
# NameError. Matches are sorted so the choice of conf_path[0] is deterministic.
conf_path = sorted(glob.glob(
    "/treehouse/archive/downstream/{}/tertiary/treehouse-protocol*/compendium*/conf.json".format(id)))

# Parse the first matching conf.json; `conf` stays None when nothing matches.
conf = None
if conf_path:
    with open(conf_path[0]) as conf_file:
        conf = json.load(conf_file)

    # Not every conf.json carries a disease entry, so only print it when present.
    if "disease" in conf["info"]:
        print(conf["info"]["disease"])

In [63]:
# Preview the first rows of the `clinical` table.
# NOTE(review): `clinical` is not assigned in any cell visible in this notebook,
# so it must be defined before this cell runs — TODO confirm where it is loaded.
clinical.head()


Out[63]:
id Disease Dataset Gender Anatomical_location Sample_type Stage_all_cancers Grade_all_cancers Age_at_dx Ped_AYA Histology_all_cancers Subcategory Race Ethnicity
0 TH01_0053_S01 acute lymphoblastic leukemia TH NaN not noted NaN NaN NaN NaN yes NaN NaN NaN NaN
1 TH01_0054_S01 lymphoblastic leukemia TH NaN not noted NaN NaN NaN NaN yes b-cell T1, relapse NaN NaN
2 TH01_0055_S01 glioma TH NaN not noted NaN NaN unknown NaN yes astrocytoma NaN NaN NaN
3 TH01_0061_S01 germ cell tumor TH NaN not noted NaN NaN NaN NaN yes mixed NaN NaN NaN
4 TH01_0062_S01 acute lymphoblastic leukemia TH NaN not noted NaN NaN NaN NaN yes NaN NaN NaN NaN

In [30]:
# Display `conf`, expected to hold the parsed tertiary-protocol conf.json for one
# sample; it must have been loaded by an earlier cell.
conf


Out[30]:
{'cohort_pathsafe_name': 'v4',
 'dir': {'base': '/data',
  'cohort': '/data/references/compendium/v4',
  'cohort_clinical': '/data/references/compendium/v4/clinical',
  'cohortbase': '/data/references/compendium',
  'gene_expression_plots_dir': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/expression_plots',
  'ref': '/data/references/external',
  'sample': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01',
  'samplebase': '/data/notebooks/e-t-k/protocol_batches/thops74/output'},
 'file': {'5_out': {'genes_pc_up': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/genes_TH01_0051_S01_pc_up',
   'genes_pd_up': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/genes_TH01_0051_S01_pd_up'},
  '7_out': {'all_gene_aggregation': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/allGeneAggregation.txt',
   'druggable_gene_aggregation': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/druggableGeneAggregation.txt',
   'gene_set_aggregation': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/GeneSetAggregation.txt',
   'gene_set_details_per_list': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/GeneSetDetailsPerList.txt'},
  'automated_leads_identified': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/automatedLeadsIdentified.tsv',
  'biomart_hugo_entrez_mapping_file': '/data/references/external/bioMart_Hugo_Entrez_conversionTable.naExcluded.2017-02-03_04.48.56PM.txt',
  'cohort_expression': '/data/references/compendium/v4/cohort.hd5',
  'cohort_expression_tsv': '/data/references/compendium/v4/expression.tsv',
  'cohort_percentiles': '/data/references/compendium/v4/percentiles.hd5',
  'cohort_samples_v_diseases': '/data/references/compendium/v4/cohort.diseases.by.samples.tsv',
  'cohort_xy_coords': '/data/references/compendium/v4/xy_coords.tsv',
  'conf': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/conf.json',
  'ensembl_hugo_mapping_file': '/data/references/external/EnsGeneID_Hugo_Observed_Conversions.txt',
  'ensembl_id_list': '/data/references/external/ensembl_ids.txt',
  'essential_clinical_tsv': '/data/references/compendium/v4/clinical.tsv',
  'flag_analysis_failed': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/ANALYSIS_FAILED',
  'gsea_credentials': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/gsea_creds.json',
  'log': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/log.txt',
  'mss_clin_and_mutations': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/basicClinAndMutationsPerMSSOf_TH01_0051_S01.tsv',
  'mss_multi_genes': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/multiplyMutatedGenesPerMSSOf_TH01_0051_S01.tsv',
  'mss_multi_mutations': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/multiplyAppearingMutationsPerMSSOf_TH01_0051_S01.tsv',
  'neighbor_diseases': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/neighbor_diseases.tsv',
  'outlier_results': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/outlier_results_TH01_0051_S01',
  'pancan_filtered_genes': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pancan.filtered-genes.txt',
  'pancan_high_threshold': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pancan.high-threshold.txt',
  'pancan_low_threshold': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pancan.low-threshold.txt',
  'pancan_median': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pancan.median.txt',
  'pancan_samples': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pancan_samples.txt',
  'pandis_filtered_genes': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pandis.filtered-genes.txt',
  'pandis_high_threshold': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pandis.high-threshold.txt',
  'pandis_low_threshold': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pandis.low-threshold.txt',
  'pandis_median': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pandis.median.txt',
  'pandisease_samples': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/pandisease_samples.txt',
  'rsem_genes.results': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/rsem_genes.results',
  'rsem_genes.results_header': '/data/references/external/rsem_genes.results.header.txt',
  'tpm_hugo': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/rsem.genes.tpm.hugo.tab',
  'tpm_hugo_norm_uniq': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/rsem.genes.tpm.hugo.log2plus1.dedupe.tab',
  'tumormap_command': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/tumormap_command.json',
  'tumormap_log': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/tumormap_log.txt',
  'tumormap_n_of_1_expression': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/rsem.genes.tpm.hugo.log2plus1.dedupe.tab',
  'tumormap_report': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/TH01_0051_S01.tumormap_report.txt',
  'tumormap_results': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/tumormap_results.txt'},
 'info': {'cohort_tumormap_only_samples': [],
  'cohort_tumormap_url': 'https://tumormap.ucsc.edu/?p=CKCC/v4',
  'cohort_zero_threshold': '0',
  'id_for_tumormap': 'TH01_0051_S01',
  'iqr_multiplier': 1.5,
  'logging_config': {'filename': '/data/notebooks/e-t-k/protocol_batches/thops74/output/TH01_0051_S01/log.txt',
   'format': '%(message)s',
   'level': 20},
  'proportion_unexpressed_filter_cutoff': 0.8,
  'tumormap_docker_image': 'ucschexmap/compute:0.0.1',
  'variance_filter_cutoff': 0.2},
 'md5': {'cohort_hd5': '37df8954bdcf028db174196dad4487b8',
  'cohort_tsv': '33631f699c4028642b7315a18aa7688c',
  'rsem_genes.results': '2bfdf5464cd29b427c87273ad6654116'},
 'medbook_sample_prefix': '',
 'ref_file': {'TCGA_non_silent_cancer_mutations_by_sample': '/data/references/external/TCGA_NonSilentMutationsInCancerGenesBySample.txt',
  'curated_pathways_druggable_genes': '/data/references/external/tertiary-references/curatedPathwaysContainingFDA_druggableGenes.txt',
  'druggable_genes_by_category': '/data/references/external/tertiary-references/genesByCategory.txt',
  'msigdb_pathway_file': '/data/references/external/msigdb.v5.2.symbols.gmt'},
 'sample_id': 'TH01_0051_S01',
 'tumormap_cohort_name': 'v4'}