In [1]:
import xenaPython as xena
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
%matplotlib inline
%load_ext rpy2.ipython
In [18]:
def accessXenaData(hub, data_set, mode):
'''return the patient IDs and probe names used for a given UCSC Xena data set
Args:
hub (str): Which Xena hub are data stored in?
data_set (str): Name of Xena data set
Returns:
samples (dict): Patient Identifiers
genes (dict): Analysis probe or gene names
'''
if mode == "dev":
samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)][0:100]
genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)][0:100]
elif mode == "actual":
samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)]
genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)]
print("Done with accessing Xena data")
return samples, genes
def matchEnsemblData(biomart_df, xena_genes):
'''from a list of Xena probe names, find genomic coordinate data in Ensembl
Args:
biomart_df_csv (.csv): start with previously generated data frame; alternately, run using R magic
because biomart python API no longer supported
xena_genes (list): List of genes from Xena data
Returns:
xena_df (DataFrame): Ensembl data for accessed xena data
'''
# match Xena probe names with genomic data from Ensembl
xena_df = pd.DataFrame(xena_genes, columns=['Identifiers'])
xena_df = pd.merge(xena_df, biomart_df, left_on='Identifiers', right_on='hgnc_symbol')
xena_df.drop('Identifiers', axis=1, inplace=True)
# re-index
hgnc_symbol = np.asarray(xena_df['hgnc_symbol'])
a = hgnc_symbol
indices = np.setdiff1d(np.arange(len(a)), np.unique(a, return_index=True)[1])
xena_df.drop(xena_df.index[indices], inplace=True)
print("Done with matching data to ensembl info")
return xena_df
def accessIndexedSharedXenaData(hub, CNV_dataset, RNA_dataset, ensembl_data):
''' Return data frames for two data sets on Xena with matched genomic coordinates for shared samples and genes
Args:
hub (url): url name where data sets stored in Xena
CNV_dataset (str): name of Xena data set containing copy number data
RNA_dataset (str): name or Xena data set containing expression data
ensembl_data (.csv): start with previously generated data frame; alternately, run using R magic
because biomart python API no longer supported
Returns:
RNA_shared_df (DataFrame): expression data with matched genomic coordinates for shared patients, genes
CNV_shared_df (DataFrame): copy number data with matched genomic coordinates for shared patients, genes
'''
# determine patients and genes accessible in Xena
CNV_samples, CNV_probes = accessXenaData(hub, CNV_dataset, mode)
RNA_samples, RNA_probes = accessXenaData(hub, RNA_dataset, mode)
# get genomic coordinates from ensembl for Xena genes
CNV_L_df = matchEnsemblData(ensembl_data, CNV_probes)
RNA_L_df = matchEnsemblData(ensembl_data, RNA_probes)
# multi-index Xena data with ensembl gene coordinates
CNV_df = analysisMultiIndexDataFrame(CNV_L_df, 'CNV', CNV_samples, CNV_dataset, CNV_probes, hub)
RNA_df = analysisMultiIndexDataFrame(RNA_L_df, 'RNA', RNA_samples, RNA_dataset, RNA_probes, hub)
# combine data frame with expression data and with copy number data
all_data_df = pd.merge(CNV_df, RNA_df, left_index=True, right_index=True)
all_data_df.index.names=['samples']
# remaking RNA and CNV data frame from entire table to ensure only matched samples (n=528) used
RNA_shared_df=all_data_df.T.xs('RNA', level='analysis_type').T
CNV_shared_df=all_data_df.T.xs('CNV', level='analysis_type').T
print("Done with accessing and indexing RNA and CNV data")
return RNA_shared_df, CNV_shared_df
def analysisMultiIndexDataFrame(ensembl_df, type_of_analysis, analysis_samples, analysis_dataset, analysis_genes, hub):
'''generate a multi-index data frame of Xena measured values combined with Ensembl coordinates
Args:
ensembl_df (DataFrame): DataFrame of select genomic coordinate values from Ensembl.
type_of_analysis (str): What do Xena measurements correspond to? (e.g., "CNV", "RNA")
analysis_samples (dict): Which patient samples have measured values in Xena?
analysis_dataset (DataFrame): Xena values DataFrame.
analysis_genes (dict): Which probes have measured values in Xena?
Returns:
values_plus_metadata (DataFrame): A multi-indexed Xena df with genomic coordinates as indices.
'''
# from Ensembl, take hgnc_symbol, chromosome, and gene start/end
hgnc_symbol = np.asarray(ensembl_df['hgnc_symbol'])
chromosome_name = np.asarray(ensembl_df['chromosome_name'])
start_position = np.asarray(ensembl_df['start_position'])
end_position = np.asarray(ensembl_df['end_position'])
analysis_type = np.array(len(hgnc_symbol) * [type_of_analysis])
# create Ensembl multi-index
columns = pd.MultiIndex.from_arrays(
[hgnc_symbol, chromosome_name, start_position, end_position, analysis_type],
names=['hgnc_symbol','chromosome_name', 'start_position','end_position','analysis_type'])
index = analysis_samples
# Access xena data
data = pd.DataFrame(xena.xenaAPI.Probes_values (hub, analysis_dataset, analysis_samples, analysis_genes)).T
data = pd.DataFrame(data.values.astype(float), columns = analysis_genes)
# forward fill na values
data.fillna(method='ffill', inplace=True)
# drop genes with no Xena data
orphan_genes = list(set(analysis_genes) - set(hgnc_symbol))
data_2 = data.drop(orphan_genes, axis=1)
values_plus_metadata = pd.DataFrame(data_2.values, index=index, columns=columns)
return values_plus_metadata
In [4]:
%%R
library(biomaRt)
library(plyr)
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
#=======================================================================#
# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# query biomart
get_chr_genes <- function(chr_num, mart_server){
subset(
unique(
getBM(
attributes=c(
'ensembl_gene_id',
'ensembl_transcript_id',
'hgnc_symbol',
'chromosome_name',
'start_position',
'end_position'
),
filters = 'chromosome_name', values =chr_num, mart = mart_server
)[-c(1:2)]
),
hgnc_symbol != ""
)
}
N <- 23
chr_L <- vector("list", N)
for( i in 1:21) {
chr <- get_chr_genes(i, ensembl)
chr_L[[i]] <- chr
}
chr_X <- get_chr_genes("X", ensembl)
chr_Y <- get_chr_genes("Y", ensembl)
chr_L[[22]] <- chr_X
chr_L[[23]] <- chr_Y
#=======================================================================#
chr_df <- ldply(chr_L, data.frame)
In [6]:
%Rpull chr_df
In [10]:
hub = "https://tcga.xenahubs.net"
CNV_dataset = "TCGA.PANCAN.sampleMap/Gistic2_CopyNumber_Gistic2_all_data_by_genes" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FGistic2_CopyNumber_Gistic2_all_data_by_genes&host=https%3A%2F%2Ftcga.xenahubs.net
mutation_dataset = "TCGA.PANCAN.sampleMap/PANCAN_mutation_gene" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FPANCAN_mutation_gene&host=https%3A%2F%2Ftcga.xenahubs.net
ensembl_data = chr_df
mode = 'dev' # change from 'dev' to 'actual' when running on entire data sets.
# 'dev' is to shorten computing time and only using the first 100 genes of each data set
In [25]:
mutation_shared_df, CNV_shared_df = accessIndexedSharedXenaData(hub, CNV_dataset, mutation_dataset, ensembl_data)
In [61]:
# specify genes of interest. Will become buggy if only 1 gene is selected
mutation_genes_of_interest = ['A1CF', 'A2M']
copy_number_genes_of_interest = ['A1CF', 'A2M', 'A2ML1', 'A2MP1']
m_df = mutation_shared_df[mutation_genes_of_interest]
c_df = CNV_shared_df[copy_number_genes_of_interest]
g_df = m_df.join(c_df, lsuffix='_mutation_status', rsuffix='_copy_number_status')
g_df
Out[61]:
In [63]:
sns.clustermap(g_df)
Out[63]: