In [1]:
    
import xenaPython as xena
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
    
In [2]:
    
%matplotlib inline
%load_ext rpy2.ipython
    
In [18]:
    
def accessXenaData(hub, data_set, mode):
    '''return the patient IDs and probe names used for a given UCSC Xena data set
    Args:
        hub (str):      Which Xena hub are data stored in?
        data_set (str): Name of Xena data set
    Returns:
        samples (dict): Patient Identifiers
        genes (dict):   Analysis probe or gene names
    '''
    if mode == "dev":
        samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)][0:100]
        genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)][0:100]
    elif mode == "actual":
        samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)]
        genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)]
    print("Done with accessing Xena data")
    return samples, genes
def matchEnsemblData(biomart_df, xena_genes):
    '''from a list of Xena probe names, find genomic coordinate data in Ensembl
    Args:
        biomart_df_csv (.csv):      start with previously generated data frame; alternately, run using R magic
                                    because biomart python API no longer supported
        xena_genes (list):          List of genes from Xena data
    Returns:
        xena_df (DataFrame):        Ensembl data for accessed xena data
    '''
    # match Xena probe names with genomic data from Ensembl  
    xena_df = pd.DataFrame(xena_genes, columns=['Identifiers'])
    xena_df = pd.merge(xena_df, biomart_df, left_on='Identifiers', right_on='hgnc_symbol')
    xena_df.drop('Identifiers', axis=1, inplace=True)
    
    # re-index
    hgnc_symbol = np.asarray(xena_df['hgnc_symbol'])
    a = hgnc_symbol
    indices = np.setdiff1d(np.arange(len(a)), np.unique(a, return_index=True)[1])
    xena_df.drop(xena_df.index[indices], inplace=True)
    print("Done with matching data to ensembl info")
    return xena_df
def accessIndexedSharedXenaData(hub, CNV_dataset, RNA_dataset, ensembl_data):
    ''' Return data frames for two data sets on Xena with matched genomic coordinates for shared samples and genes
    Args:
        hub (url):                  url name where data sets stored in Xena
        CNV_dataset (str):          name of Xena data set containing copy number data
        RNA_dataset (str):          name or Xena data set containing expression data
        ensembl_data (.csv):        start with previously generated data frame; alternately, run using R magic
                                    because biomart python API no longer supported
    Returns:
        RNA_shared_df (DataFrame):  expression data with matched genomic coordinates for shared patients, genes
        CNV_shared_df (DataFrame):  copy number data with matched genomic coordinates for shared patients, genes
    '''
    # determine patients and genes accessible in Xena
    CNV_samples, CNV_probes = accessXenaData(hub, CNV_dataset, mode)
    RNA_samples, RNA_probes = accessXenaData(hub, RNA_dataset, mode)
    # get genomic coordinates from ensembl for Xena genes
    CNV_L_df = matchEnsemblData(ensembl_data, CNV_probes)
    RNA_L_df = matchEnsemblData(ensembl_data, RNA_probes)
    # multi-index Xena data with ensembl gene coordinates
    CNV_df = analysisMultiIndexDataFrame(CNV_L_df, 'CNV', CNV_samples, CNV_dataset, CNV_probes, hub)
    RNA_df = analysisMultiIndexDataFrame(RNA_L_df, 'RNA', RNA_samples, RNA_dataset, RNA_probes, hub)
    # combine data frame with expression data and with copy number data
    all_data_df = pd.merge(CNV_df, RNA_df, left_index=True, right_index=True)
    all_data_df.index.names=['samples']
    # remaking RNA and CNV data frame from entire table to ensure only matched samples (n=528) used
    RNA_shared_df=all_data_df.T.xs('RNA', level='analysis_type').T
    CNV_shared_df=all_data_df.T.xs('CNV', level='analysis_type').T
    print("Done with accessing and indexing RNA and CNV data")
    return RNA_shared_df, CNV_shared_df
def analysisMultiIndexDataFrame(ensembl_df, type_of_analysis, analysis_samples, analysis_dataset, analysis_genes, hub):
    '''generate a multi-index data frame of Xena measured values combined with Ensembl coordinates
    Args:
        ensembl_df (DataFrame):             DataFrame of select genomic coordinate values from Ensembl.
        type_of_analysis (str):             What do Xena measurements correspond to? (e.g., "CNV", "RNA")
        analysis_samples (dict):            Which patient samples have measured values in Xena?
        analysis_dataset (DataFrame):       Xena values DataFrame.
        analysis_genes (dict):              Which probes have measured values in Xena?
    Returns:
        values_plus_metadata (DataFrame):   A multi-indexed Xena df with genomic coordinates as indices.
    '''
    # from Ensembl, take hgnc_symbol, chromosome, and gene start/end
    hgnc_symbol = np.asarray(ensembl_df['hgnc_symbol'])
    chromosome_name = np.asarray(ensembl_df['chromosome_name'])
    start_position = np.asarray(ensembl_df['start_position'])
    end_position = np.asarray(ensembl_df['end_position'])
    analysis_type = np.array(len(hgnc_symbol) * [type_of_analysis])
    # create Ensembl multi-index
    columns = pd.MultiIndex.from_arrays(
        [hgnc_symbol, chromosome_name, start_position, end_position, analysis_type],
        names=['hgnc_symbol','chromosome_name', 'start_position','end_position','analysis_type'])
    index = analysis_samples
    # Access xena data
    data = pd.DataFrame(xena.xenaAPI.Probes_values (hub, analysis_dataset, analysis_samples, analysis_genes)).T
    data = pd.DataFrame(data.values.astype(float), columns = analysis_genes)
    # forward fill na values
    data.fillna(method='ffill', inplace=True)
    # drop genes with no Xena data
    orphan_genes = list(set(analysis_genes) - set(hgnc_symbol))
    data_2 = data.drop(orphan_genes, axis=1)
    values_plus_metadata = pd.DataFrame(data_2.values, index=index, columns=columns)
    return values_plus_metadata
    
In [4]:
    
%%R
library(biomaRt)
library(plyr)
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
#=======================================================================#
# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# query biomart
get_chr_genes <- function(chr_num, mart_server){
  subset(
    unique(
      getBM(
        attributes=c(
          'ensembl_gene_id',
          'ensembl_transcript_id',
          'hgnc_symbol',
          'chromosome_name',
          'start_position',
          'end_position'
        ),
        filters = 'chromosome_name', values =chr_num, mart = mart_server
      )[-c(1:2)]
    ),
    hgnc_symbol != ""
  )
}
N <- 23
chr_L <- vector("list", N)
for( i in 1:21) {
  chr <- get_chr_genes(i, ensembl)
  chr_L[[i]] <- chr
}
chr_X <- get_chr_genes("X", ensembl)
chr_Y <- get_chr_genes("Y", ensembl)
chr_L[[22]] <- chr_X
chr_L[[23]] <- chr_Y
#=======================================================================#
chr_df <- ldply(chr_L, data.frame)
    
In [6]:
    
%Rpull chr_df
    
In [10]:
    
hub = "https://tcga.xenahubs.net"
CNV_dataset = "TCGA.PANCAN.sampleMap/Gistic2_CopyNumber_Gistic2_all_data_by_genes" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FGistic2_CopyNumber_Gistic2_all_data_by_genes&host=https%3A%2F%2Ftcga.xenahubs.net
mutation_dataset = "TCGA.PANCAN.sampleMap/PANCAN_mutation_gene" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FPANCAN_mutation_gene&host=https%3A%2F%2Ftcga.xenahubs.net
ensembl_data = chr_df
mode = 'dev' # change from 'dev' to 'actual' when running on entire data sets.
# 'dev' is to shorten computing time and only using the first 100 genes of each data set
    
In [25]:
    
mutation_shared_df, CNV_shared_df = accessIndexedSharedXenaData(hub, CNV_dataset, mutation_dataset, ensembl_data)
    
    
In [61]:
    
# specify genes of interest. Will become buggy if only 1 gene is selected
mutation_genes_of_interest = ['A1CF', 'A2M']
copy_number_genes_of_interest = ['A1CF', 'A2M', 'A2ML1', 'A2MP1']
m_df = mutation_shared_df[mutation_genes_of_interest]
c_df = CNV_shared_df[copy_number_genes_of_interest]
g_df = m_df.join(c_df, lsuffix='_mutation_status', rsuffix='_copy_number_status')
g_df
    
    Out[61]:
In [63]:
    
sns.clustermap(g_df)
    
    Out[63]: