=====================================
Parameters
----------
hub : "https://..." Xena hub where data located
CNV_dataset : xena data set with copy number data
RNA_dataset : xena data set with gene expression data
Returns
----------
*_samples : List of patient IDs in data set.
*_probes : List of gene names in data set.
*_jointplot : Jointplot figure.
In [1]:
import xenaPython as xena
import seaborn as sns
import numpy as np
import scipy as scipy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import rpy2
In [2]:
%matplotlib inline
%load_ext rpy2.ipython
In [3]:
def accessXenaData(hub, data_set):
samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)]
genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)]
# genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)][start:stop]
return samples, genes
def matchEnsemblData(biomart_df, xena_genes):
xena_df = pd.DataFrame(xena_genes, columns=['Identifiers'])
xena_df = pd.merge(xena_df, biomart_df, left_on='Identifiers', right_on='hgnc_symbol')
xena_df.drop('Identifiers', axis=1, inplace=True)
hgnc_symbol = np.asarray(xena_df['hgnc_symbol'])
a = hgnc_symbol
indices = np.setdiff1d(np.arange(len(a)), np.unique(a, return_index=True)[1])
xena_df.drop(xena_df.index[indices], inplace=True)
return xena_df
def analysisMultiIndexDataFrame(ensembl_df, type_of_analysis, analysis_samples, analysis_dataset, analysis_genes):
hgnc_symbol = np.asarray(ensembl_df['hgnc_symbol'])
chromosome_name = np.asarray(ensembl_df['chromosome_name'])
start_position = np.asarray(ensembl_df['start_position'])
end_position = np.asarray(ensembl_df['end_position'])
analysis_type = np.array(len(hgnc_symbol) * [type_of_analysis])
columns = pd.MultiIndex.from_arrays(
[hgnc_symbol, chromosome_name, start_position, end_position, analysis_type],
names=['hgnc_symbol','chromosome_name', 'start_position','end_position','analysis_type'])
index = analysis_samples
data = pd.DataFrame(xena.xenaAPI.Probes_values (hub, analysis_dataset, analysis_samples, analysis_genes)).T
data = pd.DataFrame(data.values.astype(float), columns = analysis_genes)
data.fillna(method='ffill', inplace=True)
data_columns = data.columns.tolist()
orphan_genes = list(set(analysis_genes) - set(hgnc_symbol))
data_2 = data.drop(orphan_genes, axis=1)
values_plus_metadata = pd.DataFrame(data_2.values, index=index, columns=columns)
return values_plus_metadata
In [4]:
hub = "https://tcga.xenahubs.net"
CNV_dataset = "TCGA.OV.sampleMap/Gistic2_CopyNumber_Gistic2_all_data_by_genes"
RNA_dataset = "TCGA.OV.sampleMap/AgilentG4502A_07_3"
Using Rmagic to query Biomart in R and return dataframe with genomic coordinate data.
In [5]:
%%R
library(biomaRt)
library(plyr)
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
#=======================================================================#
# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# query biomart
get_chr_genes <- function(chr_num, mart_server){
subset(
unique(
getBM(
attributes=c(
'ensembl_gene_id',
'ensembl_transcript_id',
'hgnc_symbol',
'chromosome_name',
'start_position',
'end_position'
),
filters = 'chromosome_name', values =chr_num, mart = mart_server
)[-c(1:2)]
),
hgnc_symbol != ""
)
}
N <- 23
chr_L <- vector("list", N)
for( i in 1:21) {
chr <- get_chr_genes(i, ensembl)
chr_L[[i]] <- chr
}
chr_X <- get_chr_genes("X", ensembl)
chr_Y <- get_chr_genes("Y", ensembl)
chr_L[[22]] <- chr_X
chr_L[[23]] <- chr_Y
#=======================================================================#
chr_df <- ldply(chr_L, data.frame)
In [6]:
%Rpull chr_df
In [27]:
CNV_samples, CNV_probes = accessXenaData(hub, CNV_dataset)
RNA_samples, RNA_probes = accessXenaData(hub, RNA_dataset)
CNV_L_df = matchEnsemblData(chr_df, CNV_probes)
RNA_L_df = matchEnsemblData(chr_df, RNA_probes)
CNV_df = analysisMultiIndexDataFrame(CNV_L_df, 'CNV', CNV_samples, CNV_dataset, CNV_probes)
RNA_df = analysisMultiIndexDataFrame(RNA_L_df, 'RNA', RNA_samples, RNA_dataset, RNA_probes)
all_data_df = pd.merge(CNV_df, RNA_df, left_index=True, right_index=True)
all_data_df.index.names=['samples']
all_data_df.columns=all_data_df.columns.droplevel([u'chromosome_name', u'start_position', u'end_position'])
all_data_df=all_data_df.T.unstack([u'hgnc_symbol']).T
all_data_df=all_data_df.dropna()
In [21]:
sns.axes_style('white')
Out[21]:
In [28]:
Fig1C_dist_hex=sns.jointplot("CNV", "RNA", data=all_data_df, kind='hex', color='g')
In [29]:
Fig1C_dist_reg=sns.jointplot("CNV", "RNA", data=all_data_df, kind='scatter', color='k')
In [30]:
overplot_both_df=all_data_df[((all_data_df.CNV < -0.5) | (all_data_df.CNV > 0.5)) & ((all_data_df.RNA < -1.0) | (all_data_df.RNA > 1.0))]
overplot_both_df=sns.jointplot("CNV", "RNA", data=overplot_both_df, kind='hex', color='g')
In [31]:
overplot_normal_copy_df=all_data_df[((all_data_df.CNV > -0.5) & (all_data_df.CNV < 0.5)) & ((all_data_df.RNA < -2.0) | (all_data_df.RNA > 2.0))]
overplot_normal_copy_df=sns.jointplot("CNV", "RNA", data=overplot_normal_copy_df, kind='hex', color='g')