In the following script, we visualize and determine if specific gene events phenocopy TP53 loss of function. We focus on MDM2, MDM4, and PPM1D amplifications, CDKN2A deletions, and ATM, CHEK1, CHEK2, ATR, and RPS6KA3 mutations.
We first filter out samples with TP53 mutations and deep deletions as well as hypermutated samples. We then visualize the difference of the TP53 classifier score across tumors with the given events above. We perform t-tests to determine if the classifier scores are significantly different in the direction of a priori assumptions.
In [1]:
import os
import pandas as pd
import numpy as np
from decimal import Decimal
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
np.random.seed(123)
In [3]:
sns.set(style='whitegrid')
sns.set_context('paper', rc={'font.size':11, 'axes.titlesize':11, 'axes.labelsize':16,
'xtick.labelsize':11, 'ytick.labelsize':11})
In [4]:
def get_cohens_d(pos_samples, neg_samples):
"""
Find the effect size of aberrant vs. wildtype genes according to classifier score
Modified from:
https://github.com/AllenDowney/CompStats/blob/master/effect_size.ipynb
Arguments:
pos_samples, neg_samples - aberrant or wildtype event dataframe storing classifier scores
Output:
Returns the Cohen's d effect size estimate statistic for two status groups
"""
pos_samples = pos_samples['weight']
neg_samples = neg_samples['weight']
mean_diff = pos_samples.mean() - neg_samples.mean()
n_pos, n_neg = len(pos_samples), len(neg_samples)
var_pos = pos_samples.var()
var_neg = neg_samples.var()
sd_pool = np.sqrt((n_pos * var_pos + n_neg * var_neg) / (n_pos + n_neg))
cohen_d = mean_diff / sd_pool
return cohen_d
In [5]:
# genes of interest
amplifications = ['MDM2', 'MDM4', 'PPM1D']
deletions = ['CDKN2A']
mutations = ['ATM', 'ATR', 'CHEK1', 'CHEK2', 'CREBBP', 'RPS6KA3']
In [6]:
# Load mutation, copy number, and sample freeze data
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
gistic_file = os.path.join('..', 'data', 'raw', 'pancan_GISTIC_threshold.tsv')
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
mutation_df = pd.read_table(mut_file, index_col=0)
copy_df = pd.read_table(gistic_file, index_col=0)
sample_freeze_df = pd.read_table(sample_freeze_file)
In [7]:
# The prediction file stores the TP53 classifier scores for all samples
prediction_file = os.path.join('..', 'classifiers', 'TP53', 'tables',
'mutation_classification_scores.tsv')
prediction_df = pd.read_table(prediction_file, index_col=0)
prediction_df.head(2)
Out[7]:
In [8]:
# Process and subset the GISTIC copy number data
# We do not use the preprocessed copy number data because we are also
# interested in intermediate copy number events
copy_df.drop(['Locus ID', 'Cytoband'], axis=1, inplace=True)
copy_df.columns = copy_df.columns.str[0:15]
copy_df = copy_df.T
copy_df = copy_df.reindex(sorted(sample_freeze_df['SAMPLE_BARCODE']))
copy_df = copy_df.fillna(0)
copy_df = copy_df.astype(int)
copy_df.head(2)
Out[8]:
In [9]:
del_phenocopy_df = copy_df.loc[:, deletions]
del_phenocopy_df = del_phenocopy_df.reset_index().melt(id_vars=['index'])
del_phenocopy_df.columns = ['sample_id', 'gene', 'value']
del_phenocopy_df = del_phenocopy_df.assign(event = 'Deletion')
amp_phenocopy_df = copy_df.loc[:, amplifications]
amp_phenocopy_df = amp_phenocopy_df.reset_index().melt(id_vars=['index'])
amp_phenocopy_df.columns = ['sample_id', 'gene', 'value']
amp_phenocopy_df = amp_phenocopy_df.assign(event = 'Amplification')
mut_phenocopy_df = mutation_df.loc[:, mutations]
mut_phenocopy_df = mut_phenocopy_df.reset_index().melt(id_vars=['SAMPLE_BARCODE'])
mut_phenocopy_df.columns = ['sample_id', 'gene', 'value']
mut_phenocopy_df = mut_phenocopy_df.assign(event = 'Mutation')
In [10]:
# Combine the gene events of interest
event_df = pd.concat([del_phenocopy_df, amp_phenocopy_df, mut_phenocopy_df], axis=0)
# Add predictions and covariate information for each sample in the event_df
pred_cols = ['weight', 'total_status', 'DISEASE', 'SUBTYPE', 'hypermutated']
prediction_sub_df = prediction_df.loc[:, pred_cols]
combined_df = event_df.merge(prediction_sub_df, how='left', left_on='sample_id', right_index=True)
print(combined_df.shape)
combined_df.head(2)
Out[10]:
In [11]:
# Filter TP53 mutants and hypermutated samples
plot_ready_df = combined_df[combined_df['total_status'] == 0.0]
plot_ready_df = plot_ready_df[plot_ready_df['hypermutated'] == 0.0]
print(plot_ready_df.shape)
In [12]:
# The same samples have equivalent representation across genes
combined_df['gene'].value_counts()
Out[12]:
In [13]:
# Subset to a gene and observe filtering TP53 inactive tumors
tp53_status_df = combined_df[combined_df['gene'] == 'CDKN2A']
tp53_status = tp53_status_df['total_status'].value_counts()
print('Filter {} TP53 mutated or deep deleted samples out of {} total samples'
.format(tp53_status[1], tp53_status_df.shape[0]))
In [14]:
# Observe effects of hypermutated tumor filtering
hyper_df = tp53_status_df[tp53_status_df['total_status'] == 0.0]
hyper_status = hyper_df['hypermutated'].value_counts()
print('Filter additional {} hypermutated samples out of the remaining {} total samples'
.format(hyper_status[1], hyper_df.shape[0]))
use_sample_df = hyper_df[hyper_df['hypermutated'] == 0]
print('Final sample size: n = {}'.format(use_sample_df.shape[0]))
In [15]:
# Confirm sample size
plot_ready_df['gene'].value_counts()
Out[15]:
In [16]:
# Mutation plotting parameters
x1_mut, x2_mut = -0.2, 0.2
y_mut, h_mut = 1.01, 0.03
output_results = []
#sns.set(style='white')
for gene in set(plot_ready_df['gene']):
gene_df = plot_ready_df[plot_ready_df['gene'] == gene]
event = gene_df['event'].tolist()[0]
if event == 'Mutation':
plt.rcParams['figure.figsize']=(3, 4)
use_palette = {0: '#d8b365', 1: '#5ab4ac'}
legend_label_key = {0: 'Wild-Type', 1: 'Mutant'}
# Perform an independent t-test against:
# Mutant vs. Wild-type weights
mutant_weight = gene_df[gene_df['value'] == 1]
else:
plt.rcParams['figure.figsize']=(6, 4)
use_palette = {-2: '#2c7bb6', -1: '#abd9e9', 0: '#fee08b', 1: '#fdae61', 2: '#d7191c'}
legend_label_key = {-2: 'Deep Deletion', -1: 'Mild Deletion', 0: 'Wild-Type',
1: 'Mild Amplification', 2: 'High Amplification'}
# Perform an independent t-test against:
# Amplification vs. Wild-type weights
if gene in ['CDKN2A']:
mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]
x1_amp, x2_amp = -0.25, 0
y_amp, h_mut = 1.01, 0.03
y_text_amp = -0.125
deep_del_weight = gene_df[gene_df['value'] == -2]
mild_del_weight = gene_df[gene_df['value'] == -1]
t_del_results = ttest_ind(a = mild_del_weight['weight'],
b = deep_del_weight['weight'], equal_var = False)
p_val_del = t_del_results.pvalue
if p_val_del > 0.05:
add_text_del = 'NS'
else:
add_text_del = "{:.2E}".format(Decimal(p_val_del))
else:
mutant_weight = gene_df[(gene_df['value'] == 1) | (gene_df['value'] == 2)]
x1_amp, x2_amp = 0, 0.25
y_amp, h_mut = 1.01, 0.03
y_text_amp = 0.125
if gene == 'MDM2':
x1_amp, x2_amp = -0.1, 0.2
y_text_amp = 0.05
wt_weight = gene_df[gene_df['value'] == 0]
# Output t-test results
t_results = ttest_ind(a = wt_weight['weight'],
b = mutant_weight['weight'], equal_var = False)
p_val = t_results.pvalue
if p_val > 0.05:
add_text = 'NS'
else:
add_text = "{:.2E}".format(Decimal(p_val))
# Get Cohen's D effect size between the groups
effect_size = get_cohens_d(mutant_weight, wt_weight)
log_p = np.round(-1 * np.log10(p_val), decimals=1)
effect = "{:.2}".format(Decimal(effect_size))
gene_results = [gene, 'TP53', event, log_p, add_text, effect]
output_results.append(gene_results)
# Generate Plots
ax = sns.boxplot(x="gene", y="weight", hue='value',
data=gene_df,
palette=use_palette,
fliersize=0)
handles, labels = ax.get_legend_handles_labels()
ax = sns.stripplot(x='gene', y='weight', hue='value',
data=gene_df,
dodge=True,
palette=use_palette,
edgecolor='black',
jitter=0.3,
size=2,
alpha=0.75)
# Set Legend labels
use_labels = []
for lab in labels:
use_labels.append(legend_label_key[int(lab)])
# Get x axis tick labels
x_axis_tick_labels = []
for key, value in legend_label_key.items():
n = gene_df['value'].value_counts()
try:
n = n[key]
result = 'n = {}\n{}'.format(n, value)
x_axis_tick_labels.append(result)
except:
next
l = plt.legend(handles, use_labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fancybox=True)
l.set_title('Status')
ax.axes.set_ylim(-0.003, 1.2)
ax.set_yticklabels(['', 0, 0.2, 0.4, 0.6, 0.8, 1, ''])
ax.set_xticklabels('')
ax.set_ylabel('TP53 Classifier Score')
ax.set_xlabel('')
ax.legend_.remove()
ax.set_title(gene, size=16)
plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)
plt.tight_layout()
if event == 'Mutation':
plt.plot([x1_mut, x1_mut, x2_mut, x2_mut], [y_mut, y_mut + h_mut, y_mut + h_mut, y_mut],
lw=1.2, c='black')
plt.text(0, y_mut + h_mut, add_text, ha='center', va='bottom', color="black")
else:
plt.plot([x1_amp, x1_amp, x2_amp, x2_amp], [y_amp, y_amp + h_mut, y_amp + h_mut, y_amp],
lw=1.2, c='black')
plt.text(y_text_amp, y_amp + h_mut, add_text, ha='center', va='bottom', color="black")
if gene == 'CDKN2A':
y_add = 0.1
plt.plot([x1_amp - 0.075, x1_amp - 0.075, x2_amp - 0.16, x2_amp - 0.16],
[y_amp + y_add, y_amp + h_mut + y_add, y_amp + h_mut + y_add, y_amp + y_add],
lw=1.2, c='black')
plt.text(y_text_amp - 0.1, y_amp + h_mut + 0.1,
add_text_del, ha='center', va='bottom', color="black")
phenocopy_fig_file = os.path.join('..', 'classifiers', 'TP53', 'figures',
'{}_gene_phenocopy.pdf'.format(gene))
plt.savefig(phenocopy_fig_file, bbox_extra_artists=(l,), bbox_inches='tight')
plt.close()
In [17]:
output_df = pd.DataFrame(output_results, columns=['gene', 'gene_b', 'event',
'neg_logp', 'p', 'effect_size'])
output_file = os.path.join('..', 'results', 'tp53_phenocopy_results.tsv')
output_df.sort_values(by='gene').to_csv(output_file, sep='\t', index=False)