Jointplot correlation of expression~copy_number

Access gene expression and gene copy number data using UCSC Xena python API and generate jointplots showing expression~CN.

=====================================

Author: Thomas Silvers

Date: 20170917

Parameters
----------

hub         :   "https://..." Xena hub where data located

CNV_dataset :   xena data set with copy number data

RNA_dataset :   xena data set with gene expression data

Returns
----------

*_samples   :   List of patient IDs in data set.

*_probes    :   List of gene names in data set.

*_jointplot :   Jointplot figure.

In [1]:
import xenaPython as xena
import seaborn as sns
import numpy as np
import scipy as scipy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import rpy2

In [2]:
%matplotlib inline
%load_ext rpy2.ipython

In [3]:
def accessXenaData(hub, data_set):
    samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)]
    genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)]
    # genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)][start:stop]
    return samples, genes

def matchEnsemblData(biomart_df, xena_genes):
    xena_df = pd.DataFrame(xena_genes, columns=['Identifiers'])
    xena_df = pd.merge(xena_df, biomart_df, left_on='Identifiers', right_on='hgnc_symbol')
    xena_df.drop('Identifiers', axis=1, inplace=True)
    hgnc_symbol = np.asarray(xena_df['hgnc_symbol'])
    a = hgnc_symbol
    indices = np.setdiff1d(np.arange(len(a)), np.unique(a, return_index=True)[1])
    xena_df.drop(xena_df.index[indices], inplace=True)
    return xena_df

def analysisMultiIndexDataFrame(ensembl_df, type_of_analysis, analysis_samples, analysis_dataset, analysis_genes):
    hgnc_symbol = np.asarray(ensembl_df['hgnc_symbol'])
    chromosome_name = np.asarray(ensembl_df['chromosome_name'])
    start_position = np.asarray(ensembl_df['start_position'])
    end_position = np.asarray(ensembl_df['end_position'])
    analysis_type = np.array(len(hgnc_symbol) * [type_of_analysis])
    columns = pd.MultiIndex.from_arrays(
        [hgnc_symbol, chromosome_name, start_position, end_position, analysis_type],
        names=['hgnc_symbol','chromosome_name', 'start_position','end_position','analysis_type'])
    index = analysis_samples
    data = pd.DataFrame(xena.xenaAPI.Probes_values (hub, analysis_dataset, analysis_samples, analysis_genes)).T
    data = pd.DataFrame(data.values.astype(float), columns = analysis_genes)
    data.fillna(method='ffill', inplace=True)
    data_columns = data.columns.tolist()
    orphan_genes = list(set(analysis_genes) - set(hgnc_symbol))
    data_2 = data.drop(orphan_genes, axis=1)
    values_plus_metadata = pd.DataFrame(data_2.values, index=index, columns=columns)
    return values_plus_metadata

In [4]:
hub = "https://tcga.xenahubs.net"
CNV_dataset = "TCGA.OV.sampleMap/Gistic2_CopyNumber_Gistic2_all_data_by_genes"
RNA_dataset = "TCGA.OV.sampleMap/AgilentG4502A_07_3"

Using Rmagic to query Biomart in R and return dataframe with genomic coordinate data.


In [5]:
%%R

library(biomaRt)
library(plyr)

ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")

#=======================================================================#

# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# query biomart
get_chr_genes <- function(chr_num, mart_server){
  subset(
    unique(
      getBM(
        attributes=c(
          'ensembl_gene_id',
          'ensembl_transcript_id',
          'hgnc_symbol',
          'chromosome_name',
          'start_position',
          'end_position'
        ),
        filters = 'chromosome_name', values =chr_num, mart = mart_server
      )[-c(1:2)]
    ),
    hgnc_symbol != ""
  )
}

N <- 23
chr_L <- vector("list", N)

for( i in 1:21) {
  chr <- get_chr_genes(i, ensembl)
  chr_L[[i]] <- chr
}

chr_X <- get_chr_genes("X", ensembl)
chr_Y <- get_chr_genes("Y", ensembl)

chr_L[[22]] <- chr_X
chr_L[[23]] <- chr_Y

#=======================================================================#

chr_df <- ldply(chr_L, data.frame)

In [6]:
%Rpull chr_df

In [27]:
CNV_samples, CNV_probes = accessXenaData(hub, CNV_dataset)
RNA_samples, RNA_probes = accessXenaData(hub, RNA_dataset)

CNV_L_df = matchEnsemblData(chr_df, CNV_probes)
RNA_L_df = matchEnsemblData(chr_df, RNA_probes)

CNV_df = analysisMultiIndexDataFrame(CNV_L_df, 'CNV', CNV_samples, CNV_dataset, CNV_probes)
RNA_df = analysisMultiIndexDataFrame(RNA_L_df, 'RNA', RNA_samples, RNA_dataset, RNA_probes)

all_data_df = pd.merge(CNV_df, RNA_df, left_index=True, right_index=True)
all_data_df.index.names=['samples']
all_data_df.columns=all_data_df.columns.droplevel([u'chromosome_name', u'start_position', u'end_position'])
all_data_df=all_data_df.T.unstack([u'hgnc_symbol']).T
all_data_df=all_data_df.dropna()

In [21]:
sns.axes_style('white')


Out[21]:
{'axes.axisbelow': True,
 'axes.edgecolor': '.15',
 'axes.facecolor': 'white',
 'axes.grid': False,
 'axes.labelcolor': '.15',
 'axes.linewidth': 1.25,
 'figure.facecolor': 'white',
 'font.family': ['sans-serif'],
 'font.sans-serif': ['Arial',
  'DejaVu Sans',
  'Liberation Sans',
  'Bitstream Vera Sans',
  'sans-serif'],
 'grid.color': '.8',
 'grid.linestyle': '-',
 'image.cmap': 'rocket',
 'legend.frameon': False,
 'legend.numpoints': 1,
 'legend.scatterpoints': 1,
 'lines.solid_capstyle': 'round',
 'text.color': '.15',
 'xtick.color': '.15',
 'xtick.direction': 'out',
 'xtick.major.size': 0,
 'xtick.minor.size': 0,
 'ytick.color': '.15',
 'ytick.direction': 'out',
 'ytick.major.size': 0,
 'ytick.minor.size': 0}

Jointplot of all data points as hex to avoid overplotting

Used over scatter to avoid overplotting


In [28]:
Fig1C_dist_hex=sns.jointplot("CNV", "RNA", data=all_data_df, kind='hex', color='g')


Jointplot of all data points as scatter

Used under hex plot to to show variation


In [29]:
Fig1C_dist_reg=sns.jointplot("CNV", "RNA", data=all_data_df, kind='scatter', color='k')



In [30]:
overplot_both_df=all_data_df[((all_data_df.CNV < -0.5) | (all_data_df.CNV > 0.5)) & ((all_data_df.RNA < -1.0) | (all_data_df.RNA > 1.0))]
overplot_both_df=sns.jointplot("CNV", "RNA", data=overplot_both_df, kind='hex', color='g')



In [31]:
overplot_normal_copy_df=all_data_df[((all_data_df.CNV > -0.5) & (all_data_df.CNV < 0.5)) & ((all_data_df.RNA < -2.0) | (all_data_df.RNA > 2.0))]
overplot_normal_copy_df=sns.jointplot("CNV", "RNA", data=overplot_normal_copy_df, kind='hex', color='g')