TP53 Phenocopying Events

In the following script, we visualize and determine if specific gene events phenocopy TP53 loss of function. We focus on MDM2, MDM4, and PPM1D amplifications, CDKN2A deletions, and ATM, CHEK1, CHEK2, ATR, and RPS6KA3 mutations.

We first filter out samples with TP53 mutations and deep deletions as well as hypermutated samples. We then visualize the difference of the TP53 classifier score across tumors with the given events above. We perform t-tests to determine if the classifier scores are significantly different in the direction of a priori assumptions.


In [1]:
import os
import pandas as pd
import numpy as np
from decimal import Decimal
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
np.random.seed(123)

In [3]:
sns.set(style='whitegrid')
sns.set_context('paper', rc={'font.size':11, 'axes.titlesize':11, 'axes.labelsize':16,
                             'xtick.labelsize':11, 'ytick.labelsize':11})

In [4]:
def get_cohens_d(pos_samples, neg_samples):
    """
    Find the effect size of aberrant vs. wildtype genes according to classifier score
    Modified from:
    https://github.com/AllenDowney/CompStats/blob/master/effect_size.ipynb

    Arguments:
    pos_samples, neg_samples - aberrant or wildtype event dataframe storing classifier scores

    Output:
    Returns the Cohen's d effect size estimate statistic for two status groups
    """
    pos_samples = pos_samples['weight']
    neg_samples = neg_samples['weight']
    
    mean_diff = pos_samples.mean() - neg_samples.mean()
    n_pos, n_neg = len(pos_samples), len(neg_samples)

    var_pos = pos_samples.var()
    var_neg = neg_samples.var()

    sd_pool = np.sqrt((n_pos * var_pos + n_neg * var_neg) / (n_pos + n_neg))
    cohen_d = mean_diff / sd_pool
    return cohen_d

In [5]:
# genes of interest
amplifications = ['MDM2', 'MDM4', 'PPM1D']
deletions = ['CDKN2A']
mutations = ['ATM', 'ATR', 'CHEK1', 'CHEK2', 'CREBBP', 'RPS6KA3']

In [6]:
# Load mutation, copy number, and sample freeze data
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
gistic_file = os.path.join('..', 'data', 'raw', 'pancan_GISTIC_threshold.tsv')
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')

mutation_df = pd.read_table(mut_file, index_col=0)
copy_df = pd.read_table(gistic_file, index_col=0)
sample_freeze_df = pd.read_table(sample_freeze_file)

In [7]:
# The prediction file stores the TP53 classifier scores for all samples
prediction_file = os.path.join('..', 'classifiers', 'TP53', 'tables',
                               'mutation_classification_scores.tsv')
prediction_df = pd.read_table(prediction_file, index_col=0)
prediction_df.head(2)


Out[7]:
log10_mut total_status weight TP53 TP53_loss PATIENT_BARCODE DISEASE SUBTYPE hypermutated include ID.1 Tumor_Sample_Barcode Hugo_Symbol HGVSc HGVSp Variant_Classification
ID
TCGA-02-0047-01 1.812913 0.0 0.723395 0.0 0.0 TCGA-02-0047 GBM IDHwt 0.0 1.0 TCGA-02-0047-01 NaN NaN NaN NaN Wild-Type
TCGA-02-0055-01 1.707570 1.0 0.698910 1.0 0.0 TCGA-02-0055 GBM IDHwt 0.0 1.0 TCGA-02-0055-01 TCGA-02-0055-01A-01D-1490-08 TP53 c.646G>A p.Val216Met Missense_Mutation

In [8]:
# Process and subset the GISTIC copy number data
# We do not use the preprocessed copy number data because we are also 
# interested in intermediate copy number events
copy_df.drop(['Locus ID', 'Cytoband'], axis=1, inplace=True)
copy_df.columns = copy_df.columns.str[0:15]

copy_df = copy_df.T
copy_df = copy_df.reindex(sorted(sample_freeze_df['SAMPLE_BARCODE']))
copy_df = copy_df.fillna(0)
copy_df = copy_df.astype(int)
copy_df.head(2)


Out[8]:
Gene Symbol ACAP3 ACTRT2 AGRN ANKRD65 ATAD3A ATAD3B ATAD3C AURKAIP1 B3GALT6 C1orf159 ... RP13-228J13.1 SMIM9 SNORA36A SNORA56 TMLHE VBP1 DDX11L16|chrX IL9R|chrX SPRY3|chrX VAMP7|chrX
TCGA-02-0047-01 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-0055-01 1 1 1 1 1 1 1 1 1 1 ... -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

2 rows × 25128 columns

Subset corresponding data type according to predicted phenocopying mechanism


In [9]:
del_phenocopy_df = copy_df.loc[:, deletions]
del_phenocopy_df = del_phenocopy_df.reset_index().melt(id_vars=['index'])
del_phenocopy_df.columns = ['sample_id', 'gene', 'value']
del_phenocopy_df = del_phenocopy_df.assign(event = 'Deletion')

amp_phenocopy_df = copy_df.loc[:, amplifications]
amp_phenocopy_df = amp_phenocopy_df.reset_index().melt(id_vars=['index'])
amp_phenocopy_df.columns = ['sample_id', 'gene', 'value']
amp_phenocopy_df = amp_phenocopy_df.assign(event = 'Amplification')

mut_phenocopy_df = mutation_df.loc[:, mutations]
mut_phenocopy_df = mut_phenocopy_df.reset_index().melt(id_vars=['SAMPLE_BARCODE'])
mut_phenocopy_df.columns = ['sample_id', 'gene', 'value']
mut_phenocopy_df = mut_phenocopy_df.assign(event = 'Mutation')

In [10]:
# Combine the gene events of interest
event_df = pd.concat([del_phenocopy_df, amp_phenocopy_df, mut_phenocopy_df], axis=0)

# Add predictions and covariate information for each sample in the event_df
pred_cols = ['weight', 'total_status', 'DISEASE', 'SUBTYPE', 'hypermutated']
prediction_sub_df = prediction_df.loc[:, pred_cols]
combined_df = event_df.merge(prediction_sub_df, how='left', left_on='sample_id', right_index=True)
print(combined_df.shape)
combined_df.head(2)


(96660, 9)
Out[10]:
sample_id gene value event weight total_status DISEASE SUBTYPE hypermutated
0 TCGA-02-0047-01 CDKN2A -2 Deletion 0.723395 0.0 GBM IDHwt 0.0
1 TCGA-02-0055-01 CDKN2A 0 Deletion 0.698910 1.0 GBM IDHwt 0.0

In [11]:
# Filter TP53 mutants and hypermutated samples
plot_ready_df = combined_df[combined_df['total_status'] == 0.0]
plot_ready_df = plot_ready_df[plot_ready_df['hypermutated'] == 0.0]
print(plot_ready_df.shape)


(54100, 9)

How many samples were filtered?


In [12]:
# The same samples have equivalent representation across genes
combined_df['gene'].value_counts()


Out[12]:
RPS6KA3    9666
PPM1D      9666
MDM2       9666
ATM        9666
MDM4       9666
CDKN2A     9666
CREBBP     9666
ATR        9666
CHEK1      9666
CHEK2      9666
Name: gene, dtype: int64

In [13]:
# Subset to a gene and observe filtering TP53 inactive tumors
tp53_status_df = combined_df[combined_df['gene'] == 'CDKN2A']
tp53_status = tp53_status_df['total_status'].value_counts()
print('Filter {} TP53 mutated or deep deleted samples out of {} total samples'
      .format(tp53_status[1], tp53_status_df.shape[0]))


Filter 4037 TP53 mutated or deep deleted samples out of 9666 total samples

In [14]:
# Observe effects of hypermutated tumor filtering
hyper_df = tp53_status_df[tp53_status_df['total_status'] == 0.0]
hyper_status = hyper_df['hypermutated'].value_counts()           
print('Filter additional {} hypermutated samples out of the remaining {} total samples'
      .format(hyper_status[1], hyper_df.shape[0]))

use_sample_df = hyper_df[hyper_df['hypermutated'] == 0]
print('Final sample size: n = {}'.format(use_sample_df.shape[0]))


Filter additional 219 hypermutated samples out of the remaining 5629 total samples
Final sample size: n = 5410

In [15]:
# Confirm sample size
plot_ready_df['gene'].value_counts()


Out[15]:
RPS6KA3    5410
PPM1D      5410
CDKN2A     5410
CREBBP     5410
ATR        5410
MDM2       5410
CHEK1      5410
ATM        5410
CHEK2      5410
MDM4       5410
Name: gene, dtype: int64

Perform visualizations and include t-test results


In [16]:
# Mutation plotting parameters
x1_mut, x2_mut = -0.2, 0.2
y_mut, h_mut = 1.01, 0.03
output_results = []
#sns.set(style='white')
for gene in set(plot_ready_df['gene']):
    gene_df = plot_ready_df[plot_ready_df['gene'] == gene]
    event = gene_df['event'].tolist()[0]
    
    if event == 'Mutation':
        plt.rcParams['figure.figsize']=(3, 4)
        use_palette = {0: '#d8b365', 1: '#5ab4ac'}
        legend_label_key = {0: 'Wild-Type',  1: 'Mutant'}
        
        # Perform an independent t-test against:
        # Mutant vs. Wild-type weights
        mutant_weight = gene_df[gene_df['value'] == 1]

    else:
        plt.rcParams['figure.figsize']=(6, 4)
        use_palette = {-2: '#2c7bb6', -1: '#abd9e9', 0: '#fee08b', 1: '#fdae61', 2: '#d7191c'}
        legend_label_key = {-2: 'Deep Deletion', -1: 'Mild Deletion', 0: 'Wild-Type',
                            1: 'Mild Amplification', 2: 'High Amplification'}

        # Perform an independent t-test against:
        # Amplification vs. Wild-type weights
        if gene in ['CDKN2A']:
            mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]
            x1_amp, x2_amp = -0.25, 0
            y_amp, h_mut = 1.01, 0.03
            y_text_amp = -0.125
            
            
            deep_del_weight = gene_df[gene_df['value'] == -2]
            mild_del_weight = gene_df[gene_df['value'] == -1]
            
            t_del_results = ttest_ind(a = mild_del_weight['weight'],
                                      b = deep_del_weight['weight'], equal_var = False)
            
            p_val_del = t_del_results.pvalue
            if p_val_del > 0.05:
                add_text_del = 'NS'
            else:
                add_text_del = "{:.2E}".format(Decimal(p_val_del))
        
        else:
            mutant_weight = gene_df[(gene_df['value'] == 1) | (gene_df['value'] == 2)]
            x1_amp, x2_amp = 0, 0.25
            y_amp, h_mut = 1.01, 0.03
            y_text_amp = 0.125
        
        if gene == 'MDM2':
            x1_amp, x2_amp = -0.1, 0.2
            y_text_amp = 0.05

    wt_weight = gene_df[gene_df['value'] == 0]

    # Output t-test results
    t_results = ttest_ind(a = wt_weight['weight'],
                          b = mutant_weight['weight'], equal_var = False)

    p_val = t_results.pvalue
    if p_val > 0.05:
        add_text = 'NS'
    else:
        add_text = "{:.2E}".format(Decimal(p_val))
    
    # Get Cohen's D effect size between the groups
    effect_size = get_cohens_d(mutant_weight, wt_weight)
    log_p = np.round(-1 * np.log10(p_val), decimals=1)
    effect = "{:.2}".format(Decimal(effect_size))
    gene_results = [gene, 'TP53', event, log_p, add_text, effect]
    output_results.append(gene_results)
    
    # Generate Plots
    ax = sns.boxplot(x="gene", y="weight", hue='value',
                     data=gene_df,
                     palette=use_palette,
                     fliersize=0)
    
    handles, labels = ax.get_legend_handles_labels()

    ax = sns.stripplot(x='gene', y='weight', hue='value',
                       data=gene_df, 
                       dodge=True,
                       palette=use_palette,
                       edgecolor='black',
                       jitter=0.3,
                       size=2,
                       alpha=0.75)
    
    # Set Legend labels
    use_labels = []
    for lab in labels:
        use_labels.append(legend_label_key[int(lab)])

    # Get x axis tick labels
    x_axis_tick_labels = []
    for key, value in legend_label_key.items():
        n = gene_df['value'].value_counts()
        try:
            n = n[key]
            result = 'n = {}\n{}'.format(n, value)
            x_axis_tick_labels.append(result)
        except:
            next
    
    l = plt.legend(handles, use_labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fancybox=True)
    l.set_title('Status')
    ax.axes.set_ylim(-0.003, 1.2)
    ax.set_yticklabels(['', 0, 0.2, 0.4, 0.6, 0.8, 1, ''])
    ax.set_xticklabels('')
    ax.set_ylabel('TP53 Classifier Score')
    ax.set_xlabel('')
    ax.legend_.remove()
    ax.set_title(gene, size=16)
    plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)
    plt.tight_layout()
    
    if event == 'Mutation':
        plt.plot([x1_mut, x1_mut, x2_mut, x2_mut], [y_mut, y_mut + h_mut, y_mut + h_mut, y_mut],
                 lw=1.2, c='black')
        plt.text(0, y_mut + h_mut, add_text, ha='center', va='bottom', color="black")
    else:
        plt.plot([x1_amp, x1_amp, x2_amp, x2_amp], [y_amp, y_amp + h_mut, y_amp + h_mut, y_amp],
                  lw=1.2, c='black')
        plt.text(y_text_amp, y_amp + h_mut, add_text, ha='center', va='bottom', color="black")
        
        if gene == 'CDKN2A':
            y_add = 0.1
            plt.plot([x1_amp - 0.075, x1_amp - 0.075, x2_amp - 0.16, x2_amp - 0.16],
                     [y_amp + y_add, y_amp + h_mut + y_add, y_amp + h_mut + y_add, y_amp + y_add],
                      lw=1.2, c='black')
            plt.text(y_text_amp - 0.1, y_amp + h_mut + 0.1,
                     add_text_del, ha='center', va='bottom', color="black")
    
    phenocopy_fig_file = os.path.join('..', 'classifiers', 'TP53', 'figures',
                                      '{}_gene_phenocopy.pdf'.format(gene))
    plt.savefig(phenocopy_fig_file, bbox_extra_artists=(l,), bbox_inches='tight')
    plt.close()

In [17]:
output_df = pd.DataFrame(output_results, columns=['gene', 'gene_b', 'event',
                                                  'neg_logp', 'p', 'effect_size'])
output_file = os.path.join('..', 'results', 'tp53_phenocopy_results.tsv')
output_df.sort_values(by='gene').to_csv(output_file, sep='\t', index=False)