The Pathway Working Group from TCGA PanCancerAtlas curated variants in Ras Pathway genes by their expert-predicted oncogenicity status (either oncogenic or unconfirmed). Here, we output three sets of files
In [1]:
import os
import csv
import numpy as np
import pandas as pd
from decimal import Decimal
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
%matplotlib inline
plt.style.use('seaborn-notebook')
In [3]:
sns.set(style="whitegrid")
sns.set_context("paper", rc={"font.size":14, "axes.titlesize":15, "axes.labelsize":20,
'xtick.labelsize':14, 'ytick.labelsize':14})
In [4]:
np.random.seed(123)
In [5]:
def assign_curation(aa_row, df='ras_variant'):
"""
Determines if the specific amino acid mutation is cataloged as Oncogenic
in the Ras pathway or not. To be used as a call to pandas.DataFrame().apply()
Arguments:
aa_row - a row that has gene and mutation information
df - the type of dataframe called, can either be `ras_variant` or `sample`
depending on the type of dataframe to assign curation
Output:
Oncogenicity Status for the given amino acid mutation
"""
gene = aa_row['Hugo_Symbol']
if df == 'ras_variant':
aa = aa_row.name
elif df == 'sample':
aa = aa_row['HGVSp']
ras_sub = ras_variant_df[ras_variant_df.index == gene]
if len(ras_sub) == 0:
return 'Not Ras'
if aa in ras_sub['mutation'].tolist():
return 'Oncogenic'
else:
return 'Unconfirmed'
In [6]:
# Set File names
aa_mutation_scores_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'amino_acid_mutation_scores.tsv')
ras_variant_curation_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'Ras_pathway_variant_oncogenicity_data.tsv')
sample_mutation_scores_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'mutation_classification_scores.tsv')
# Output Files
aa_out_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'RAS_oncogenecity_predictions.tsv')
In [7]:
# Load Ras variant curation file
ras_variant_df = pd.read_table(ras_variant_curation_file, index_col=0)
ras_variant_df.head(2)
Out[7]:
In [8]:
# Add oncogenecity designation to each amino acid mutation and write to file
aa_scores = pd.read_table(aa_mutation_scores_file, index_col=0)
aa_onco_df = aa_scores.assign(designation = aa_scores.apply(assign_curation, axis=1))
aa_onco_df.to_csv(aa_out_file, sep='\t')
aa_onco_df.head(2)
Out[8]:
In [9]:
# Add curation to specific mutation scores
mut_scores_df = pd.read_table(sample_mutation_scores_file, index_col=0)
mut_scores_df = (
mut_scores_df[mut_scores_df['Variant_Classification']
.isin(['Missense_Mutation', 'Nonsense_Mutation'])]
)
ras_pathway_scores_df = mut_scores_df[mut_scores_df['Hugo_Symbol'].isin(ras_variant_df.index.tolist())]
ras_pathway_scores_df = (
ras_pathway_scores_df.assign(
curation = ras_pathway_scores_df.apply(lambda x:
assign_curation(x, df='sample'), axis=1))
)
ras_pathway_scores_df.head(2)
Out[9]:
In [10]:
# Separate out samples with Oncogenic curation status
oncogenic_sample_df = ras_pathway_scores_df[ras_pathway_scores_df['curation'] == 'Oncogenic']
unconfirmed_sample_df = (
ras_pathway_scores_df[~ras_pathway_scores_df.index.isin(oncogenic_sample_df.index)]
)
filtered_df = pd.concat([oncogenic_sample_df, unconfirmed_sample_df])
In [11]:
# Generate and Save plots
plt.rcParams['figure.figsize']=(3.5, 4)
t_test_results = []
x1, x2 = 0, 1
for gene in set(mut_scores_df['Hugo_Symbol']):
gene_mutation_score_df = filtered_df[filtered_df['Hugo_Symbol'] == gene]
gene_mutation_score_df = gene_mutation_score_df.dropna(axis=0, subset=['weight'])
fig_name = os.path.join('..', 'figures', 'variants', 'variant_prediction_{}.pdf'.format(gene))
try:
# perform an independent t-test for prediction scores by oncogenicity
oncogenic_scores = (
gene_mutation_score_df.loc[
gene_mutation_score_df['curation'] == 'Oncogenic', 'weight']
)
unconfirmed_scores = (
gene_mutation_score_df.loc[
gene_mutation_score_df['curation'] == 'Unconfirmed', 'weight']
)
t_results = ttest_ind(a = oncogenic_scores,
b = unconfirmed_scores, equal_var = False)
add_result = [gene, t_results.pvalue, t_results.statistic]
t_test_results.append(add_result)
# Setup p value annotation
max_val = gene_mutation_score_df['weight'].max()
y, h = max_val + 0.06, 0.05
# Plot
ax = sns.stripplot(x='curation', y='weight', data=gene_mutation_score_df,
palette = {'Oncogenic': "seagreen", 'Unconfirmed': 'goldenrod'},
jitter=0.35, size=3.25, alpha=0.65)
ax.axes.set_ylim(0, max_val + 0.2)
ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1, ''])
ax.set_ylabel('Ras Classifier Score')
ax.set_xlabel(gene)
plt.axhline(0.5, color='grey', linestyle='dashed', linewidth=2)
# Only display t-test bars if there are two classes of data
if len(oncogenic_scores) != 0 and len(unconfirmed_scores) != 0:
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='black')
plt.text(.5, y+h, "{:.2E}".format(Decimal(t_results.pvalue)),
ha='center', va='bottom', color="black")
plt.tight_layout()
plt.savefig(fig_name)
plt.close()
except:
next
In [12]:
# Write out full t-test results
t_test_file = os.path.join('..', 'results', 'ras_t_test_oncogenicity.tsv')
with open(t_test_file, 'w') as csvfile:
onco_writer = csv.writer(csvfile, delimiter='\t')
onco_writer.writerow(['gene', 'p_value', 't_statistic'])
for t_ in t_test_results:
onco_writer.writerow(t_)
In [13]:
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
classifier_scores_file = os.path.join('..', 'classifiers', 'RAS',
'classifier_decisions.tsv')
mutation_df = pd.read_table(mut_file, index_col=0)
classifier_scores_df = pd.read_table(classifier_scores_file, index_col=0)
In [14]:
genes = ['TP53', 'SUZ12']
y = mutation_df[genes]
In [15]:
plot_df = y.merge(classifier_scores_df, left_index=True, right_index=True)
In [16]:
def plot_ras_score(df, alt_gene):
ax = sns.boxplot(x="total_status", y="weight", data=plot_df,
hue=alt_gene, palette = {0: "whitesmoke", 1: 'gainsboro'},
fliersize=0)
ax = sns.stripplot(x='total_status', y='weight', hue=alt_gene,
data=plot_df,
dodge=True, edgecolor='gray',
palette = {1: "seagreen", 0: 'goldenrod'},
jitter=0.25, size=2, alpha=0.65)
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[2:4], ['Wild-Type', 'Mutant'], bbox_to_anchor=(.63, 0.2), loc=2, borderaxespad=0.)
l.set_title(alt_gene)
ax.axes.set_ylim(0, 1.3)
ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1, ''])
ax.set_xticklabels(['Ras WT', 'Ras Mutant'])
ax.set_ylabel('Ras Classifier Score')
ax.set_xlabel('Ras Status')
ax.legend
plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)
# Ras mutant vs. Ras wildtype
ras_mutant = plot_df[plot_df['total_status'] == 1]
ras_wt = plot_df[plot_df['total_status'] == 0]
# Also interested in alt_gene status within Ras mutant samples
alt_gene_mutant = ras_mutant[ras_mutant[alt_gene] == 1]
alt_gene_wt = ras_mutant[ras_mutant[alt_gene] == 0]
# Also interested in alt_gene status within Ras wildtype samples
alt_gene_mutant_raswt = ras_wt[ras_wt[alt_gene] == 1]
alt_gene_wt_raswt = ras_wt[ras_wt[alt_gene] == 0]
# Output t-test results
t_results_ras = ttest_ind(a = ras_mutant['weight'],
b = ras_wt['weight'], equal_var = False)
t_results_alt = ttest_ind(a = alt_gene_mutant['weight'],
b = alt_gene_wt['weight'], equal_var = False)
t_results_alt_raswt = ttest_ind(a = alt_gene_mutant_raswt['weight'],
b = alt_gene_wt_raswt['weight'], equal_var = False)
# Add Ras T-Test Results
plt.plot([x1, x1, x2, x2], [y1, y1+h, y1+h, y1], lw=1.2, c='black')
plt.text(.5, y1+h, t_results_ras.pvalue,
ha='center', va='bottom', color="black")
# Add alt_gene mut t-test results
plt.plot([x3, x3, x4, x4], [y2, y2+h, y2+h, y2], lw=1.2, c='black')
plt.text(0, y2+h, "{:.2E}".format(Decimal(t_results_alt_raswt.pvalue)),
ha='center', va='bottom', color="black")
# Add alt_gene wild-type t-test results
plt.plot([x5, x5, x6, x6], [y2, y2+h, y2+h, y2], lw=1.2, c='black')
plt.text(1, y2+h, "{:.2E}".format(Decimal(t_results_alt.pvalue)),
ha='center', va='bottom', color="black")
plt.tight_layout()
In [17]:
# Plot Results
x1, x2 = 0, 1
x3, x4 = -0.2, 0.2
x5, x6 = 0.8, 1.2
y1, y2, h = 1.17, 1, 0.03
plt.rcParams['figure.figsize']=(3.5, 4)
In [18]:
plot_ras_score(plot_df, 'SUZ12')
base_path = os.path.join('..', 'classifiers', 'RAS', 'figures')
fig_file = os.path.join(base_path, 'suz12_cooccurence.png')
plt.savefig(fig_file)
In [19]:
plot_ras_score(plot_df, 'TP53')
fig_file = os.path.join(base_path, 'tp53_cooccurence.png')
plt.savefig(fig_file)