Visualizing Ras Classification Scores by Oncogenicity Status

The Pathway Working Group from TCGA PanCancerAtlas curated variants in Ras Pathway genes by their expert-predicted oncogenicity status (either oncogenic or unconfirmed). Here, we output three sets of files

Oncogenicity scores by Ras classifier score dataframe
Swarm plots of Ras classifier scores by Oncogenicity status
Co-occurence plots of Ras by TP53 and SUZ12 status



In [1]:

    
import os
import csv
import numpy as np
import pandas as pd
from decimal import Decimal
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
%matplotlib inline
plt.style.use('seaborn-notebook')



In [3]:

    
sns.set(style="whitegrid")
sns.set_context("paper", rc={"font.size":14, "axes.titlesize":15, "axes.labelsize":20,
                             'xtick.labelsize':14, 'ytick.labelsize':14})



In [4]:

    
np.random.seed(123)



In [5]:

    
def assign_curation(aa_row, df='ras_variant'):
    """
    Determines if the specific amino acid mutation is cataloged as Oncogenic 
    in the Ras pathway or not. To be used as a call to pandas.DataFrame().apply() 
    
    Arguments:
      aa_row - a row that has gene and mutation information
      df - the type of dataframe called, can either be `ras_variant` or `sample`
           depending on the type of dataframe to assign curation
    
    Output:
      Oncogenicity Status for the given amino acid mutation
    """
    gene = aa_row['Hugo_Symbol']
    if df == 'ras_variant':
        aa = aa_row.name
    elif df == 'sample':
        aa = aa_row['HGVSp']
    ras_sub = ras_variant_df[ras_variant_df.index == gene]
    if len(ras_sub) == 0:
        return 'Not Ras'
    
    if aa in ras_sub['mutation'].tolist():
        return 'Oncogenic'
    else:
        return 'Unconfirmed'



In [6]:

    
# Set File names
aa_mutation_scores_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
                                       'amino_acid_mutation_scores.tsv')
ras_variant_curation_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
                                         'Ras_pathway_variant_oncogenicity_data.tsv')
sample_mutation_scores_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
                                           'mutation_classification_scores.tsv')

# Output Files
aa_out_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
                           'RAS_oncogenecity_predictions.tsv')



In [7]:

    
# Load Ras variant curation file
ras_variant_df = pd.read_table(ras_variant_curation_file, index_col=0)
ras_variant_df.head(2)









    Out[7]:







  
    
      
      mutation
      gene_type
      MutSig
      hotspots
      copy_number_fusion
      GISTIC_amp
      GISTIC_del
      McCormick
    
    
      Gene
      
      
      
      
      
      
      
      
    
  
  
    
      ALK
      fusion gene
      OG
      1
      some hotspots & known mutations
      fusion gene
      NaN
      ;
      yes
    
    
      ALK
      p.Asp1203Tyr
      OG
      1
      some hotspots & known mutations
      fusion gene
      NaN
      ;
      yes



In [8]:

    
# Add oncogenecity designation to each amino acid mutation and write to file
aa_scores = pd.read_table(aa_mutation_scores_file, index_col=0)

aa_onco_df = aa_scores.assign(designation = aa_scores.apply(assign_curation, axis=1))
aa_onco_df.to_csv(aa_out_file, sep='\t')

aa_onco_df.head(2)









    Out[8]:







  
    
      
      Variant_Classification
      Hugo_Symbol
      Mean
      SD
      count
      low_CI
      high_CI
      designation
    
    
      HGVSp
      
      
      
      
      
      
      
      
    
  
  
    
      p.Val600Glu
      Missense_Mutation
      BRAF
      0.379832
      0.222155
      453
      0.360466
      0.401139
      Oncogenic
    
    
      p.Gly12Asp
      Missense_Mutation
      KRAS
      0.819980
      0.126311
      166
      0.800135
      0.838195
      Oncogenic



In [9]:

    
# Add curation to specific mutation scores
mut_scores_df = pd.read_table(sample_mutation_scores_file, index_col=0)
mut_scores_df = (
    mut_scores_df[mut_scores_df['Variant_Classification']
                  .isin(['Missense_Mutation', 'Nonsense_Mutation'])]
    )
ras_pathway_scores_df = mut_scores_df[mut_scores_df['Hugo_Symbol'].isin(ras_variant_df.index.tolist())]
ras_pathway_scores_df = (
    ras_pathway_scores_df.assign(
        curation = ras_pathway_scores_df.apply(lambda x:
                                               assign_curation(x, df='sample'), axis=1))
    )
ras_pathway_scores_df.head(2)









    Out[9]:







  
    
      
      log10_mut
      total_status
      weight
      NRAS
      HRAS
      KRAS
      HRAS_gain
      KRAS_gain
      NRAS_gain
      PATIENT_BARCODE
      ...
      SUBTYPE
      hypermutated
      include
      ID.1
      Tumor_Sample_Barcode
      Hugo_Symbol
      HGVSc
      HGVSp
      Variant_Classification
      curation
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-02-2485-01
      1.748188
      0.0
      0.467431
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      TCGA-02-2485
      ...
      IDHwt
      0.0
      0.0
      TCGA-02-2485-01
      TCGA-02-2485-01A-01D-1494-08
      EGFR
      c.866C>A
      p.Ala289Asp
      Missense_Mutation
      Oncogenic
    
    
      TCGA-04-1362-01
      1.838849
      0.0
      0.484370
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      TCGA-04-1362
      ...
      Not_Applicable
      0.0
      1.0
      TCGA-04-1362-01
      TCGA-04-1362-01A-01W-0492-08
      SOS1
      c.3691C>G
      p.Leu1231Val
      Missense_Mutation
      Unconfirmed
    
  

2 rows × 21 columns



In [10]:

    
# Separate out samples with Oncogenic curation status
oncogenic_sample_df = ras_pathway_scores_df[ras_pathway_scores_df['curation'] == 'Oncogenic']
unconfirmed_sample_df = (
    ras_pathway_scores_df[~ras_pathway_scores_df.index.isin(oncogenic_sample_df.index)]
    )

filtered_df = pd.concat([oncogenic_sample_df, unconfirmed_sample_df])



In [11]:

    
# Generate and Save plots
plt.rcParams['figure.figsize']=(3.5, 4)
t_test_results = []
x1, x2 = 0, 1

for gene in set(mut_scores_df['Hugo_Symbol']):
    gene_mutation_score_df = filtered_df[filtered_df['Hugo_Symbol'] == gene]
    gene_mutation_score_df = gene_mutation_score_df.dropna(axis=0, subset=['weight'])
    fig_name = os.path.join('..', 'figures', 'variants', 'variant_prediction_{}.pdf'.format(gene))
    try:
        # perform an independent t-test for prediction scores by oncogenicity
        oncogenic_scores = (
            gene_mutation_score_df.loc[
                gene_mutation_score_df['curation'] == 'Oncogenic', 'weight']
            )
        unconfirmed_scores = (
            gene_mutation_score_df.loc[
                gene_mutation_score_df['curation'] == 'Unconfirmed', 'weight']
            )

        t_results = ttest_ind(a = oncogenic_scores,
                              b = unconfirmed_scores, equal_var = False)
        add_result = [gene, t_results.pvalue, t_results.statistic]
        t_test_results.append(add_result)
        
        # Setup p value annotation
        max_val = gene_mutation_score_df['weight'].max()
        y, h = max_val + 0.06, 0.05
        
        # Plot
        ax = sns.stripplot(x='curation', y='weight', data=gene_mutation_score_df, 
                           palette = {'Oncogenic': "seagreen", 'Unconfirmed': 'goldenrod'},
                           jitter=0.35, size=3.25, alpha=0.65)
        ax.axes.set_ylim(0, max_val + 0.2)
        ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1, ''])
        ax.set_ylabel('Ras Classifier Score')
        ax.set_xlabel(gene)
        plt.axhline(0.5, color='grey', linestyle='dashed', linewidth=2)
        
        # Only display t-test bars if there are two classes of data
        if len(oncogenic_scores) != 0 and len(unconfirmed_scores) != 0:
            plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='black')
            plt.text(.5, y+h, "{:.2E}".format(Decimal(t_results.pvalue)),
                     ha='center', va='bottom', color="black")
    
        plt.tight_layout()
        plt.savefig(fig_name)
        plt.close()
    
    except:
        next









    



/home/gway/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:3146: RuntimeWarning: Degrees of freedom <= 0 for slice
  **kwargs)
/home/gway/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)



In [12]:

    
# Write out full t-test results
t_test_file = os.path.join('..', 'results', 'ras_t_test_oncogenicity.tsv')
with open(t_test_file, 'w') as csvfile:
    onco_writer = csv.writer(csvfile, delimiter='\t')
    onco_writer.writerow(['gene', 'p_value', 't_statistic'])
    for t_ in t_test_results:
        onco_writer.writerow(t_)

Investigate cooccurrence of Ras mutations with TP53 and SUZ12



In [13]:

    
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
classifier_scores_file = os.path.join('..', 'classifiers', 'RAS',
                                      'classifier_decisions.tsv')

mutation_df = pd.read_table(mut_file, index_col=0)
classifier_scores_df = pd.read_table(classifier_scores_file, index_col=0)



In [14]:

    
genes = ['TP53', 'SUZ12']
y = mutation_df[genes]



In [15]:

    
plot_df = y.merge(classifier_scores_df, left_index=True, right_index=True)



In [16]:

    
def plot_ras_score(df, alt_gene):
    ax = sns.boxplot(x="total_status", y="weight", data=plot_df,
                 hue=alt_gene, palette = {0: "whitesmoke", 1: 'gainsboro'},
                 fliersize=0)
    ax = sns.stripplot(x='total_status', y='weight', hue=alt_gene,
                       data=plot_df, 
                       dodge=True, edgecolor='gray',
                       palette = {1: "seagreen", 0: 'goldenrod'},
                       jitter=0.25, size=2, alpha=0.65)
    handles, labels = ax.get_legend_handles_labels()
    l = plt.legend(handles[2:4], ['Wild-Type', 'Mutant'], bbox_to_anchor=(.63, 0.2), loc=2, borderaxespad=0.)
    l.set_title(alt_gene)
    ax.axes.set_ylim(0, 1.3)
    ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1, ''])
    ax.set_xticklabels(['Ras WT', 'Ras Mutant'])
    ax.set_ylabel('Ras Classifier Score')
    ax.set_xlabel('Ras Status')
    ax.legend
    plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)
    
    # Ras mutant vs. Ras wildtype
    ras_mutant = plot_df[plot_df['total_status'] == 1]
    ras_wt = plot_df[plot_df['total_status'] == 0]

    # Also interested in alt_gene status within Ras mutant samples
    alt_gene_mutant = ras_mutant[ras_mutant[alt_gene] == 1]
    alt_gene_wt = ras_mutant[ras_mutant[alt_gene] == 0]

    # Also interested in alt_gene status within Ras wildtype samples
    alt_gene_mutant_raswt = ras_wt[ras_wt[alt_gene] == 1]
    alt_gene_wt_raswt = ras_wt[ras_wt[alt_gene] == 0]
    
    # Output t-test results
    t_results_ras = ttest_ind(a = ras_mutant['weight'],
                              b = ras_wt['weight'], equal_var = False)

    t_results_alt = ttest_ind(a = alt_gene_mutant['weight'],
                              b = alt_gene_wt['weight'], equal_var = False)
    
    t_results_alt_raswt = ttest_ind(a = alt_gene_mutant_raswt['weight'],
                                    b = alt_gene_wt_raswt['weight'], equal_var = False)
    
    # Add Ras T-Test Results
    plt.plot([x1, x1, x2, x2], [y1, y1+h, y1+h, y1], lw=1.2, c='black')
    plt.text(.5, y1+h, t_results_ras.pvalue,
             ha='center', va='bottom', color="black")

    # Add alt_gene mut t-test results
    plt.plot([x3, x3, x4, x4], [y2, y2+h, y2+h, y2], lw=1.2, c='black')
    plt.text(0, y2+h, "{:.2E}".format(Decimal(t_results_alt_raswt.pvalue)),
             ha='center', va='bottom', color="black")
    
    # Add alt_gene wild-type t-test results
    plt.plot([x5, x5, x6, x6], [y2, y2+h, y2+h, y2], lw=1.2, c='black')
    plt.text(1, y2+h, "{:.2E}".format(Decimal(t_results_alt.pvalue)),
             ha='center', va='bottom', color="black")
    plt.tight_layout()



In [17]:

    
# Plot Results
x1, x2 = 0, 1
x3, x4 = -0.2, 0.2
x5, x6 = 0.8, 1.2
y1, y2, h = 1.17, 1, 0.03
plt.rcParams['figure.figsize']=(3.5, 4)



In [18]:

    
plot_ras_score(plot_df, 'SUZ12')
base_path = os.path.join('..', 'classifiers', 'RAS', 'figures')
fig_file = os.path.join(base_path, 'suz12_cooccurence.png')
plt.savefig(fig_file)



In [19]:

    
plot_ras_score(plot_df, 'TP53')
fig_file = os.path.join(base_path, 'tp53_cooccurence.png')
plt.savefig(fig_file)

	mutation	gene_type	MutSig	hotspots	copy_number_fusion	GISTIC_amp	GISTIC_del	McCormick
Gene
ALK	fusion gene	OG	1	some hotspots & known mutations	fusion gene	NaN	;	yes
ALK	p.Asp1203Tyr	OG	1	some hotspots & known mutations	fusion gene	NaN	;	yes

	Variant_Classification	Hugo_Symbol	Mean	SD	count	low_CI	high_CI	designation
HGVSp
p.Val600Glu	Missense_Mutation	BRAF	0.379832	0.222155	453	0.360466	0.401139	Oncogenic
p.Gly12Asp	Missense_Mutation	KRAS	0.819980	0.126311	166	0.800135	0.838195	Oncogenic

	log10_mut	total_status	weight	NRAS	HRAS	KRAS	HRAS_gain	KRAS_gain	NRAS_gain	PATIENT_BARCODE	...	SUBTYPE	hypermutated	include	ID.1	Tumor_Sample_Barcode	Hugo_Symbol	HGVSc	HGVSp	Variant_Classification	curation
ID
TCGA-02-2485-01	1.748188	0.0	0.467431	0.0	0.0	0.0	0.0	0.0	0.0	TCGA-02-2485	...	IDHwt	0.0	0.0	TCGA-02-2485-01	TCGA-02-2485-01A-01D-1494-08	EGFR	c.866C>A	p.Ala289Asp	Missense_Mutation	Oncogenic
TCGA-04-1362-01	1.838849	0.0	0.484370	0.0	0.0	0.0	0.0	0.0	0.0	TCGA-04-1362	...	Not_Applicable	0.0	1.0	TCGA-04-1362-01	TCGA-04-1362-01A-01W-0492-08	SOS1	c.3691C>G	p.Leu1231Val	Missense_Mutation	Unconfirmed