In [1]:
import os
import sys
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
# Get the current working directory
cwd = os.getcwd()

# Ensure that the path is starting in the scripts directory
if not cwd.split('/')[-1] == 'scripts':
    sys.path.append(os.path.join(cwd, 'scripts'))

In [3]:
def get_gene_auroc(x, w):
    score = roc_auc_score(x, w, average='weighted')
    return(score)

def get_gene_auprc(x, w):
    score = average_precision_score(x, w, average='weighted')
    return(score)

In [4]:
ras_folder = os.path.join('..', 'classifiers', 'RAS')

In [5]:
# Load Datasets
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')
copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')

mutation_df = pd.read_table(mut_file, index_col=0)
sample_freeze = pd.read_table(sample_freeze_file, index_col=0)
copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
copy_gain_df = pd.read_table(copy_gain_file, index_col=0)

In [6]:
# Load Ras Pathway Genes
ras_genes_file = os.path.join('..', 'data', 'ras_genes.csv')
ras_genes_df = pd.read_table(ras_genes_file)

In [7]:
# Load classifier weights
ras_decision_file = os.path.join(ras_folder, 'classifier_decisions.tsv')
ras_decisions_df = pd.read_table(ras_decision_file)
ras_decisions_df.head()


Out[7]:
SAMPLE_BARCODE log10_mut total_status weight NRAS KRAS HRAS HRAS_gain KRAS_gain NRAS_gain PATIENT_BARCODE DISEASE SUBTYPE hypermutated include
0 TCGA-02-0047-01 1.812913 0 0.357117 0 0 0 0 0 0 TCGA-02-0047 GBM IDHwt 0 0
1 TCGA-02-0055-01 1.707570 0 0.530723 0 0 0 0 0 0 TCGA-02-0055 GBM IDHwt 0 0
2 TCGA-02-2483-01 1.662758 0 0.642091 0 0 0 0 0 0 TCGA-02-2483 GBM IDHmut-non-codel 0 0
3 TCGA-02-2485-01 1.748188 0 0.467431 0 0 0 0 0 0 TCGA-02-2485 GBM IDHwt 0 0
4 TCGA-02-2486-01 1.755875 0 0.361267 0 0 0 0 0 0 TCGA-02-2486 GBM IDHwt 0 0

In [8]:
ras_mutations_df = mutation_df[ras_genes_df['genes']]

# Add status to the Y matrix depending on if the gene is a tumor suppressor
# or an oncogene. An oncogene can be activated with copy number gains, but
# a tumor suppressor is inactivated with copy number loss
oncogene = ras_genes_df[ras_genes_df['og_tsg'] == 'OG']
tumor_suppressor = ras_genes_df[ras_genes_df['og_tsg'] == 'TSG']

# Subset copy number information
ras_copy_gain_sub_df = copy_gain_df[oncogene['genes']]
ras_copy_loss_sub_df = copy_loss_df[tumor_suppressor['genes']]

# Combine Copy Number data
ras_copy_df = pd.concat([ras_copy_gain_sub_df, ras_copy_loss_sub_df], axis=1)

In [9]:
ras_status_df = ras_mutations_df + ras_copy_df
ras_status_df[ras_status_df == 2] = 1

In [10]:
subset_columns = ['SAMPLE_BARCODE', 'DISEASE', 'weight', 'total_status', 'log10_mut',
                  'hypermutated', 'include']
ras_decisions_subset_df = ras_decisions_df[subset_columns]
ras_full_status_df = ras_status_df.merge(ras_decisions_subset_df, left_index=True,
                                         right_on='SAMPLE_BARCODE')
ras_full_status_df.index = ras_full_status_df['SAMPLE_BARCODE']

In [11]:
# Remove hyper mutated samples
burden_filter = ras_full_status_df['hypermutated'] == 0
burden_filter = burden_filter & ras_full_status_df['log10_mut'] < 5 * ras_full_status_df['log10_mut'].std()
ras_full_status_df = ras_full_status_df[burden_filter]
ras_full_status_df.head(3)


Out[11]:
ALK ARAF BRAF EGFR ERBB2 ERBB3 ERBB4 FGFR1 FGFR2 FGFR3 ... ERRFI1 NF1 RASA1 SAMPLE_BARCODE DISEASE weight total_status log10_mut hypermutated include
SAMPLE_BARCODE
TCGA-02-0047-01 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 TCGA-02-0047-01 GBM 0.357117 0 1.812913 0 0
TCGA-02-0055-01 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 TCGA-02-0055-01 GBM 0.530723 0 1.707570 0 0
TCGA-02-2483-01 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 TCGA-02-2483-01 GBM 0.642091 0 1.662758 0 0

3 rows × 45 columns


In [12]:
full_auroc = (
    ras_full_status_df[ras_genes_df['genes']]
    .apply(lambda x: get_gene_auroc(x, ras_full_status_df['weight']))
    )

full_auprc = (
    ras_full_status_df[ras_genes_df['genes']]
    .apply(lambda x: get_gene_auprc(x, ras_full_status_df['weight']))
    )

In [13]:
# Remove Ras positive samples, and recalculate metrics
remove_ras_status = ras_full_status_df[ras_full_status_df['total_status'] == 0]
remove_ras_status_df = remove_ras_status[ras_genes_df['genes']]
remove_ras_status_df = remove_ras_status_df.drop(['KRAS', 'HRAS', 'NRAS'], axis=1)
full_auroc_remove = remove_ras_status_df.apply(lambda x: get_gene_auroc(x, w=remove_ras_status['weight']))
full_auprc_remove = remove_ras_status_df.apply(lambda x: get_gene_auprc(x, w=remove_ras_status['weight']))

In [14]:
# Get output metrics for Ras classification
output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)
output_ras_metrics = output_ras_metrics * 100  # To get percent
output_ras_metrics = output_ras_metrics - 50  # Subtract 50 from AUROC only

# Combine with AUPRC
output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,
                                full_auprc_remove * 100], axis=1, sort=False)
output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']

# Fill removed Ras metrics with included metrics
output_ras_metrics['no_ras_auroc'] = (
    output_ras_metrics['no_ras_auroc'].fillna(output_ras_metrics['ras_auroc'])
    )
output_ras_metrics['no_ras_auprc'] = (
    output_ras_metrics['no_ras_auprc'].fillna(output_ras_metrics['ras_auprc'])
    )

# Write results to file
tables_folder = os.path.join(ras_folder, 'tables')
if not os.path.exists(tables_folder):
    os.makedirs(tables_folder)

ras_metric_file = os.path.join(ras_folder, 'tables', 'ras_metrics_pathwaymapper.txt')
output_ras_metrics.to_csv(ras_metric_file, sep='\t')

output_ras_metrics.head()


Out[14]:
ras_auroc no_ras_auroc ras_auprc no_ras_auprc
ALK 10.336711 9.806159 5.136642 4.880183
ARAF 9.596890 9.838528 2.657702 2.588441
BRAF -5.287740 -4.301018 8.447411 12.584116
EGFR -4.273536 -3.678272 5.194929 5.384128
ERBB2 8.547825 10.703101 7.275319 7.803525

In [15]:
# Display Ras pathway metrics
all_samples_ras_pathway_status = ras_full_status_df[ras_genes_df['genes']].max(axis=1)
print('Ras Pathway Performance Summary: All Ras Genes')
print('AUROC:')
print(roc_auc_score(all_samples_ras_pathway_status,
                    ras_full_status_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(all_samples_ras_pathway_status,
                              ras_full_status_df['weight'], average='weighted'))


Ras Pathway Performance Summary: All Ras Genes
AUROC:
0.5905870096512793
AUPRC:
0.7106392434670601

In [16]:
print('Ras Pathway Performance Summary: KRAS, NRAS, HRAS')
print('AUROC:')
print(roc_auc_score(ras_full_status_df['total_status'],
                    ras_full_status_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(ras_full_status_df['total_status'],
                              ras_full_status_df['weight'], average='weighted'))


Ras Pathway Performance Summary: KRAS, NRAS, HRAS
AUROC:
0.8672490043723222
AUPRC:
0.6123556579784297

In [17]:
print('Ras Pathway Performance Summary: Held Out Samples')
held_out_ras_df = ras_full_status_df[ras_full_status_df['include'] == 0]
print('AUROC:')
print(roc_auc_score(held_out_ras_df['total_status'],
                    held_out_ras_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(held_out_ras_df['total_status'],
                              held_out_ras_df['weight'], average='weighted'))


Ras Pathway Performance Summary: Held Out Samples
AUROC:
0.7521227354642891
AUPRC:
0.23966565191694897

Visualize Distribution of AUROC and AUPRC for all genes


In [18]:
# Subset mutation file by samples
sub_full_mutation_df = mutation_df[burden_filter]
low_mutation_count_filter = (
    sub_full_mutation_df.sum()
    [sub_full_mutation_df.sum() >= 10].sort_values(ascending=False).index
    )
sub_full_mutation_df = sub_full_mutation_df[low_mutation_count_filter]
sub_full_mutation_df.head()


Out[18]:
TP53 TTN MUC16 PIK3CA CSMD3 RYR2 LRP1B SYNE1 FLG USH2A ... C17orf50 C17orf105 C17orf103 MBD3L2 DKFZP761K2322 RP11-434D2.3 FXYD4 AC110619.2 RP11-386M24.4 C15orf40
SAMPLE_BARCODE
TCGA-02-0047-01 0 0 0 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-0055-01 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-2483-01 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-2485-01 1 0 1 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-2486-01 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 19372 columns


In [19]:
# Get Metrics for All Genes
all_auprc = sub_full_mutation_df.apply(lambda x: get_gene_auprc(x, w = ras_full_status_df['weight']))
all_auroc = sub_full_mutation_df.apply(lambda x: get_gene_auroc(x, w = ras_full_status_df['weight']))

In [20]:
# Process file and save results
all_gene_metrics_file = os.path.join(ras_folder, 'tables', 'all_gene_metric_ranks.tsv')

all_genes_auprc_df = pd.DataFrame(all_auprc.sort_values(ascending=False), columns=['auprc'])
all_genes_auroc_df = pd.DataFrame(all_auroc.sort_values(ascending=False), columns=['auroc'])

all_genes_auprc_df = all_genes_auprc_df.assign(auprc_rank = list(range(0, all_genes_auprc_df.shape[0])))
all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))

all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)
all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1

all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,
                                                              left_on='index', right_index=True)

all_genes_metrics_df.columns = ['Gene', 'AUPRC', 'AUPRC Rank', 'ras', 'AUROC', 'AUROC Rank']
all_genes_metrics_df.to_csv(all_gene_metrics_file, sep='\t', index=False)
all_genes_metrics_df.head(10)


Out[20]:
Gene AUPRC AUPRC Rank ras AUROC AUROC Rank
0 KRAS 0.480921 0 1 0.917009 0
1 TTN 0.374206 1 0 0.559094 18591
2 TP53 0.359681 2 0 0.456115 19329
3 MUC16 0.266557 3 0 0.591954 16894
4 APC 0.250905 4 0 0.773767 47
5 LRP1B 0.182983 5 0 0.596804 16396
6 CSMD3 0.172012 6 0 0.567850 18312
7 RYR2 0.171406 7 0 0.588802 17195
8 PIK3CA 0.168235 8 0 0.577421 17938
9 SYNE1 0.162610 9 0 0.570606 18222