Ok, so basically I want to see that my genes are more frequently co-deleted when p53 is wt compared to p53 null/inactivation


In [1]:
import xenaPython as xena
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext rpy2.ipython

In [18]:
def accessXenaData(hub, data_set, mode):

    '''return the patient IDs and probe names used for a given UCSC Xena data set

    Args:
        hub (str):      Which Xena hub are data stored in?
        data_set (str): Name of Xena data set

    Returns:
        samples (dict): Patient Identifiers
        genes (dict):   Analysis probe or gene names
    '''
    if mode == "dev":
        samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)][0:100]
        genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)][0:100]
    elif mode == "actual":
        samples = [x.encode('UTF8') for x in xena.xenaAPI.dataset_samples(hub, data_set)]
        genes = [x.encode('UTF8') for x in xena.xenaAPI.dataset_fields(hub, data_set)]

    print("Done with accessing Xena data")

    return samples, genes



def matchEnsemblData(biomart_df, xena_genes):

    '''from a list of Xena probe names, find genomic coordinate data in Ensembl

    Args:
        biomart_df_csv (.csv):      start with previously generated data frame; alternately, run using R magic
                                    because biomart python API no longer supported
        xena_genes (list):          List of genes from Xena data

    Returns:
        xena_df (DataFrame):        Ensembl data for accessed xena data
    '''
    # match Xena probe names with genomic data from Ensembl  
    xena_df = pd.DataFrame(xena_genes, columns=['Identifiers'])
    xena_df = pd.merge(xena_df, biomart_df, left_on='Identifiers', right_on='hgnc_symbol')
    xena_df.drop('Identifiers', axis=1, inplace=True)
    
    # re-index
    hgnc_symbol = np.asarray(xena_df['hgnc_symbol'])
    a = hgnc_symbol
    indices = np.setdiff1d(np.arange(len(a)), np.unique(a, return_index=True)[1])
    xena_df.drop(xena_df.index[indices], inplace=True)

    print("Done with matching data to ensembl info")

    return xena_df



def accessIndexedSharedXenaData(hub, CNV_dataset, RNA_dataset, ensembl_data):
    ''' Return data frames for two data sets on Xena with matched genomic coordinates for shared samples and genes

    Args:
        hub (url):                  url name where data sets stored in Xena
        CNV_dataset (str):          name of Xena data set containing copy number data
        RNA_dataset (str):          name or Xena data set containing expression data
        ensembl_data (.csv):        start with previously generated data frame; alternately, run using R magic
                                    because biomart python API no longer supported

    Returns:
        RNA_shared_df (DataFrame):  expression data with matched genomic coordinates for shared patients, genes
        CNV_shared_df (DataFrame):  copy number data with matched genomic coordinates for shared patients, genes
    '''

    # determine patients and genes accessible in Xena
    CNV_samples, CNV_probes = accessXenaData(hub, CNV_dataset, mode)
    RNA_samples, RNA_probes = accessXenaData(hub, RNA_dataset, mode)

    # get genomic coordinates from ensembl for Xena genes
    CNV_L_df = matchEnsemblData(ensembl_data, CNV_probes)
    RNA_L_df = matchEnsemblData(ensembl_data, RNA_probes)

    # multi-index Xena data with ensembl gene coordinates
    CNV_df = analysisMultiIndexDataFrame(CNV_L_df, 'CNV', CNV_samples, CNV_dataset, CNV_probes, hub)
    RNA_df = analysisMultiIndexDataFrame(RNA_L_df, 'RNA', RNA_samples, RNA_dataset, RNA_probes, hub)

    # combine data frame with expression data and with copy number data
    all_data_df = pd.merge(CNV_df, RNA_df, left_index=True, right_index=True)
    all_data_df.index.names=['samples']

    # remaking RNA and CNV data frame from entire table to ensure only matched samples (n=528) used
    RNA_shared_df=all_data_df.T.xs('RNA', level='analysis_type').T
    CNV_shared_df=all_data_df.T.xs('CNV', level='analysis_type').T

    print("Done with accessing and indexing RNA and CNV data")

    return RNA_shared_df, CNV_shared_df



def analysisMultiIndexDataFrame(ensembl_df, type_of_analysis, analysis_samples, analysis_dataset, analysis_genes, hub):

    '''generate a multi-index data frame of Xena measured values combined with Ensembl coordinates

    Args:
        ensembl_df (DataFrame):             DataFrame of select genomic coordinate values from Ensembl.
        type_of_analysis (str):             What do Xena measurements correspond to? (e.g., "CNV", "RNA")
        analysis_samples (dict):            Which patient samples have measured values in Xena?
        analysis_dataset (DataFrame):       Xena values DataFrame.
        analysis_genes (dict):              Which probes have measured values in Xena?

    Returns:
        values_plus_metadata (DataFrame):   A multi-indexed Xena df with genomic coordinates as indices.
    '''
    # from Ensembl, take hgnc_symbol, chromosome, and gene start/end
    hgnc_symbol = np.asarray(ensembl_df['hgnc_symbol'])
    chromosome_name = np.asarray(ensembl_df['chromosome_name'])
    start_position = np.asarray(ensembl_df['start_position'])
    end_position = np.asarray(ensembl_df['end_position'])
    analysis_type = np.array(len(hgnc_symbol) * [type_of_analysis])

    # create Ensembl multi-index
    columns = pd.MultiIndex.from_arrays(
        [hgnc_symbol, chromosome_name, start_position, end_position, analysis_type],
        names=['hgnc_symbol','chromosome_name', 'start_position','end_position','analysis_type'])
    index = analysis_samples

    # Access xena data
    data = pd.DataFrame(xena.xenaAPI.Probes_values (hub, analysis_dataset, analysis_samples, analysis_genes)).T
    data = pd.DataFrame(data.values.astype(float), columns = analysis_genes)

    # forward fill na values
    data.fillna(method='ffill', inplace=True)

    # drop genes with no Xena data
    orphan_genes = list(set(analysis_genes) - set(hgnc_symbol))
    data_2 = data.drop(orphan_genes, axis=1)
    values_plus_metadata = pd.DataFrame(data_2.values, index=index, columns=columns)

    return values_plus_metadata

Use "magic" to access R from Python script and pull important gene location data from Ensembl


In [4]:
%%R

library(biomaRt)
library(plyr)

ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")

#=======================================================================#

# define biomart object
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# query biomart
get_chr_genes <- function(chr_num, mart_server){
  subset(
    unique(
      getBM(
        attributes=c(
          'ensembl_gene_id',
          'ensembl_transcript_id',
          'hgnc_symbol',
          'chromosome_name',
          'start_position',
          'end_position'
        ),
        filters = 'chromosome_name', values =chr_num, mart = mart_server
      )[-c(1:2)]
    ),
    hgnc_symbol != ""
  )
}

N <- 23
chr_L <- vector("list", N)

for( i in 1:21) {
  chr <- get_chr_genes(i, ensembl)
  chr_L[[i]] <- chr
}

chr_X <- get_chr_genes("X", ensembl)
chr_Y <- get_chr_genes("Y", ensembl)

chr_L[[22]] <- chr_X
chr_L[[23]] <- chr_Y

#=======================================================================#

chr_df <- ldply(chr_L, data.frame)

In [6]:
%Rpull chr_df

In [10]:
hub = "https://tcga.xenahubs.net"

CNV_dataset = "TCGA.PANCAN.sampleMap/Gistic2_CopyNumber_Gistic2_all_data_by_genes" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FGistic2_CopyNumber_Gistic2_all_data_by_genes&host=https%3A%2F%2Ftcga.xenahubs.net

mutation_dataset = "TCGA.PANCAN.sampleMap/PANCAN_mutation_gene" # TCGA Pan-Cancer (PANCAN), see below
# https://xenabrowser.net/datapages/?dataset=TCGA.PANCAN.sampleMap%2FPANCAN_mutation_gene&host=https%3A%2F%2Ftcga.xenahubs.net

ensembl_data = chr_df

mode = 'dev' # change from 'dev' to 'actual' when running on entire data sets.
# 'dev' is to shorten computing time and only using the first 100 genes of each data set

In [25]:
mutation_shared_df, CNV_shared_df = accessIndexedSharedXenaData(hub, CNV_dataset, mutation_dataset, ensembl_data)


Done with accessing Xena data
Done with accessing Xena data
Done with matching data to ensembl info
Done with matching data to ensembl info
Done with accessing and indexing RNA and CNV data

In [61]:
# specify genes of interest. Will become buggy if only 1 gene is selected

mutation_genes_of_interest = ['A1CF', 'A2M']
copy_number_genes_of_interest = ['A1CF', 'A2M', 'A2ML1', 'A2MP1']

m_df = mutation_shared_df[mutation_genes_of_interest]

c_df = CNV_shared_df[copy_number_genes_of_interest]

g_df = m_df.join(c_df, lsuffix='_mutation_status', rsuffix='_copy_number_status')


g_df


Out[61]:
hgnc_symbol A1CF_mutation_status A2M_mutation_status A1CF_copy_number_status A2M_copy_number_status A2ML1 A2MP1
chromosome_name 10 12 10 12 12 12
start_position 50799409 9067664 50799409 9067664 8822472 9228533
end_position 50885675 9116229 50885675 9116229 8887001 9275817
samples
TCGA-AR-A1AH-01 0.0 0.0 0.058 0.037 0.037 0.037
TCGA-C8-A1HL-01 0.0 0.0 0.009 0.004 0.004 0.004
TCGA-A2-A3XX-01 0.0 0.0 0.081 -0.263 -0.263 -0.263

In [63]:
sns.clustermap(g_df)


Out[63]:
<seaborn.matrix.ClusterGrid at 0x11ff38810>