notebook.community

Edit and run



In [1]:

    
import os
import sys
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, average_precision_score



In [2]:

    
# Get the current working directory
cwd = os.getcwd()

# Ensure that the path is starting in the scripts directory
if not cwd.split('/')[-1] == 'scripts':
    sys.path.append(os.path.join(cwd, 'scripts'))



In [3]:

    
def get_gene_auroc(x, w):
    score = roc_auc_score(x, w, average='weighted')
    return(score)

def get_gene_auprc(x, w):
    score = average_precision_score(x, w, average='weighted')
    return(score)



In [4]:

    
ras_folder = os.path.join('..', 'classifiers', 'RAS')



In [5]:

    
# Load Datasets
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')
copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')

mutation_df = pd.read_table(mut_file, index_col=0)
sample_freeze = pd.read_table(sample_freeze_file, index_col=0)
copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
copy_gain_df = pd.read_table(copy_gain_file, index_col=0)



In [6]:

    
# Load Ras Pathway Genes
ras_genes_file = os.path.join('..', 'data', 'ras_genes.csv')
ras_genes_df = pd.read_table(ras_genes_file)



In [7]:

    
# Load classifier weights
ras_decision_file = os.path.join(ras_folder, 'classifier_decisions.tsv')
ras_decisions_df = pd.read_table(ras_decision_file)
ras_decisions_df.head()









    Out[7]:







  
    
      
      SAMPLE_BARCODE
      log10_mut
      total_status
      weight
      NRAS
      KRAS
      HRAS
      HRAS_gain
      KRAS_gain
      NRAS_gain
      PATIENT_BARCODE
      DISEASE
      SUBTYPE
      hypermutated
      include
    
  
  
    
      0
      TCGA-02-0047-01
      1.812913
      0
      0.357117
      0
      0
      0
      0
      0
      0
      TCGA-02-0047
      GBM
      IDHwt
      0
      0
    
    
      1
      TCGA-02-0055-01
      1.707570
      0
      0.530723
      0
      0
      0
      0
      0
      0
      TCGA-02-0055
      GBM
      IDHwt
      0
      0
    
    
      2
      TCGA-02-2483-01
      1.662758
      0
      0.642091
      0
      0
      0
      0
      0
      0
      TCGA-02-2483
      GBM
      IDHmut-non-codel
      0
      0
    
    
      3
      TCGA-02-2485-01
      1.748188
      0
      0.467431
      0
      0
      0
      0
      0
      0
      TCGA-02-2485
      GBM
      IDHwt
      0
      0
    
    
      4
      TCGA-02-2486-01
      1.755875
      0
      0.361267
      0
      0
      0
      0
      0
      0
      TCGA-02-2486
      GBM
      IDHwt
      0
      0



In [8]:

    
ras_mutations_df = mutation_df[ras_genes_df['genes']]

# Add status to the Y matrix depending on if the gene is a tumor suppressor
# or an oncogene. An oncogene can be activated with copy number gains, but
# a tumor suppressor is inactivated with copy number loss
oncogene = ras_genes_df[ras_genes_df['og_tsg'] == 'OG']
tumor_suppressor = ras_genes_df[ras_genes_df['og_tsg'] == 'TSG']

# Subset copy number information
ras_copy_gain_sub_df = copy_gain_df[oncogene['genes']]
ras_copy_loss_sub_df = copy_loss_df[tumor_suppressor['genes']]

# Combine Copy Number data
ras_copy_df = pd.concat([ras_copy_gain_sub_df, ras_copy_loss_sub_df], axis=1)



In [9]:

    
ras_status_df = ras_mutations_df + ras_copy_df
ras_status_df[ras_status_df == 2] = 1



In [10]:

    
subset_columns = ['SAMPLE_BARCODE', 'DISEASE', 'weight', 'total_status', 'log10_mut',
                  'hypermutated', 'include']
ras_decisions_subset_df = ras_decisions_df[subset_columns]
ras_full_status_df = ras_status_df.merge(ras_decisions_subset_df, left_index=True,
                                         right_on='SAMPLE_BARCODE')
ras_full_status_df.index = ras_full_status_df['SAMPLE_BARCODE']



In [11]:

    
# Remove hyper mutated samples
burden_filter = ras_full_status_df['hypermutated'] == 0
burden_filter = burden_filter & ras_full_status_df['log10_mut'] < 5 * ras_full_status_df['log10_mut'].std()
ras_full_status_df = ras_full_status_df[burden_filter]
ras_full_status_df.head(3)









    Out[11]:







  
    
      
      ALK
      ARAF
      BRAF
      EGFR
      ERBB2
      ERBB3
      ERBB4
      FGFR1
      FGFR2
      FGFR3
      ...
      ERRFI1
      NF1
      RASA1
      SAMPLE_BARCODE
      DISEASE
      weight
      total_status
      log10_mut
      hypermutated
      include
    
    
      SAMPLE_BARCODE
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-02-0047-01
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      TCGA-02-0047-01
      GBM
      0.357117
      0
      1.812913
      0
      0
    
    
      TCGA-02-0055-01
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      1
      0
      TCGA-02-0055-01
      GBM
      0.530723
      0
      1.707570
      0
      0
    
    
      TCGA-02-2483-01
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      TCGA-02-2483-01
      GBM
      0.642091
      0
      1.662758
      0
      0
    
  

3 rows × 45 columns



In [12]:

    
full_auroc = (
    ras_full_status_df[ras_genes_df['genes']]
    .apply(lambda x: get_gene_auroc(x, ras_full_status_df['weight']))
    )

full_auprc = (
    ras_full_status_df[ras_genes_df['genes']]
    .apply(lambda x: get_gene_auprc(x, ras_full_status_df['weight']))
    )



In [13]:

    
# Remove Ras positive samples, and recalculate metrics
remove_ras_status = ras_full_status_df[ras_full_status_df['total_status'] == 0]
remove_ras_status_df = remove_ras_status[ras_genes_df['genes']]
remove_ras_status_df = remove_ras_status_df.drop(['KRAS', 'HRAS', 'NRAS'], axis=1)
full_auroc_remove = remove_ras_status_df.apply(lambda x: get_gene_auroc(x, w=remove_ras_status['weight']))
full_auprc_remove = remove_ras_status_df.apply(lambda x: get_gene_auprc(x, w=remove_ras_status['weight']))



In [14]:

    
# Get output metrics for Ras classification
output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)
output_ras_metrics = output_ras_metrics * 100  # To get percent
output_ras_metrics = output_ras_metrics - 50  # Subtract 50 from AUROC only

# Combine with AUPRC
output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,
                                full_auprc_remove * 100], axis=1, sort=False)
output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']

# Fill removed Ras metrics with included metrics
output_ras_metrics['no_ras_auroc'] = (
    output_ras_metrics['no_ras_auroc'].fillna(output_ras_metrics['ras_auroc'])
    )
output_ras_metrics['no_ras_auprc'] = (
    output_ras_metrics['no_ras_auprc'].fillna(output_ras_metrics['ras_auprc'])
    )

# Write results to file
tables_folder = os.path.join(ras_folder, 'tables')
if not os.path.exists(tables_folder):
    os.makedirs(tables_folder)

ras_metric_file = os.path.join(ras_folder, 'tables', 'ras_metrics_pathwaymapper.txt')
output_ras_metrics.to_csv(ras_metric_file, sep='\t')

output_ras_metrics.head()









    Out[14]:







  
    
      
      ras_auroc
      no_ras_auroc
      ras_auprc
      no_ras_auprc
    
  
  
    
      ALK
      10.336711
      9.806159
      5.136642
      4.880183
    
    
      ARAF
      9.596890
      9.838528
      2.657702
      2.588441
    
    
      BRAF
      -5.287740
      -4.301018
      8.447411
      12.584116
    
    
      EGFR
      -4.273536
      -3.678272
      5.194929
      5.384128
    
    
      ERBB2
      8.547825
      10.703101
      7.275319
      7.803525



In [15]:

    
# Display Ras pathway metrics
all_samples_ras_pathway_status = ras_full_status_df[ras_genes_df['genes']].max(axis=1)
print('Ras Pathway Performance Summary: All Ras Genes')
print('AUROC:')
print(roc_auc_score(all_samples_ras_pathway_status,
                    ras_full_status_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(all_samples_ras_pathway_status,
                              ras_full_status_df['weight'], average='weighted'))









    



Ras Pathway Performance Summary: All Ras Genes
AUROC:
0.5905870096512793
AUPRC:
0.7106392434670601



In [16]:

    
print('Ras Pathway Performance Summary: KRAS, NRAS, HRAS')
print('AUROC:')
print(roc_auc_score(ras_full_status_df['total_status'],
                    ras_full_status_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(ras_full_status_df['total_status'],
                              ras_full_status_df['weight'], average='weighted'))









    



Ras Pathway Performance Summary: KRAS, NRAS, HRAS
AUROC:
0.8672490043723222
AUPRC:
0.6123556579784297



In [17]:

    
print('Ras Pathway Performance Summary: Held Out Samples')
held_out_ras_df = ras_full_status_df[ras_full_status_df['include'] == 0]
print('AUROC:')
print(roc_auc_score(held_out_ras_df['total_status'],
                    held_out_ras_df['weight'], average='weighted'))
print('AUPRC:')
print(average_precision_score(held_out_ras_df['total_status'],
                              held_out_ras_df['weight'], average='weighted'))









    



Ras Pathway Performance Summary: Held Out Samples
AUROC:
0.7521227354642891
AUPRC:
0.23966565191694897

Visualize Distribution of AUROC and AUPRC for all genes



In [18]:

    
# Subset mutation file by samples
sub_full_mutation_df = mutation_df[burden_filter]
low_mutation_count_filter = (
    sub_full_mutation_df.sum()
    [sub_full_mutation_df.sum() >= 10].sort_values(ascending=False).index
    )
sub_full_mutation_df = sub_full_mutation_df[low_mutation_count_filter]
sub_full_mutation_df.head()









    Out[18]:







  
    
      
      TP53
      TTN
      MUC16
      PIK3CA
      CSMD3
      RYR2
      LRP1B
      SYNE1
      FLG
      USH2A
      ...
      C17orf50
      C17orf105
      C17orf103
      MBD3L2
      DKFZP761K2322
      RP11-434D2.3
      FXYD4
      AC110619.2
      RP11-386M24.4
      C15orf40
    
    
      SAMPLE_BARCODE
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-02-0047-01
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      TCGA-02-0055-01
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      TCGA-02-2483-01
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      TCGA-02-2485-01
      1
      0
      1
      1
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      TCGA-02-2486-01
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 19372 columns



In [19]:

    
# Get Metrics for All Genes
all_auprc = sub_full_mutation_df.apply(lambda x: get_gene_auprc(x, w = ras_full_status_df['weight']))
all_auroc = sub_full_mutation_df.apply(lambda x: get_gene_auroc(x, w = ras_full_status_df['weight']))



In [20]:

    
# Process file and save results
all_gene_metrics_file = os.path.join(ras_folder, 'tables', 'all_gene_metric_ranks.tsv')

all_genes_auprc_df = pd.DataFrame(all_auprc.sort_values(ascending=False), columns=['auprc'])
all_genes_auroc_df = pd.DataFrame(all_auroc.sort_values(ascending=False), columns=['auroc'])

all_genes_auprc_df = all_genes_auprc_df.assign(auprc_rank = list(range(0, all_genes_auprc_df.shape[0])))
all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))

all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)
all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1

all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,
                                                              left_on='index', right_index=True)

all_genes_metrics_df.columns = ['Gene', 'AUPRC', 'AUPRC Rank', 'ras', 'AUROC', 'AUROC Rank']
all_genes_metrics_df.to_csv(all_gene_metrics_file, sep='\t', index=False)
all_genes_metrics_df.head(10)

	Gene	AUPRC	AUPRC Rank	ras	AUROC	AUROC Rank
0	KRAS	0.480921	0	1	0.917009	0
1	TTN	0.374206	1	0	0.559094	18591
2	TP53	0.359681	2	0	0.456115	19329
3	MUC16	0.266557	3	0	0.591954	16894
4	APC	0.250905	4	0	0.773767	47
5	LRP1B	0.182983	5	0	0.596804	16396
6	CSMD3	0.172012	6	0	0.567850	18312
7	RYR2	0.171406	7	0	0.588802	17195
8	PIK3CA	0.168235	8	0	0.577421	17938
9	SYNE1	0.162610	9	0	0.570606	18222

	SAMPLE_BARCODE	log10_mut	weight	PATIENT_BARCODE	DISEASE	SUBTYPE
0	TCGA-02-0047-01	1.812913	0.357117	TCGA-02-0047	GBM	IDHwt
1	TCGA-02-0055-01	1.707570	0.530723	TCGA-02-0055	GBM	IDHwt
2	TCGA-02-2483-01	1.662758	0.642091	TCGA-02-2483	GBM	IDHmut-non-codel
3	TCGA-02-2485-01	1.748188	0.467431	TCGA-02-2485	GBM	IDHwt
4	TCGA-02-2486-01	1.755875	0.361267	TCGA-02-2486	GBM	IDHwt

	ALK	ARAF	BRAF	EGFR	ERBB2	ERBB3	ERBB4	FGFR1	FGFR2	FGFR3	...	ERRFI1	NF1	RASA1	SAMPLE_BARCODE	DISEASE	weight	total_status	log10_mut	hypermutated	include
SAMPLE_BARCODE
TCGA-02-0047-01	0	0	0	0	0	0	0	0	0	0	...	0	0	0	TCGA-02-0047-01	GBM	0.357117	0	1.812913	0	0
TCGA-02-0055-01	0	0	0	0	0	0	0	0	0	0	...	0	1	0	TCGA-02-0055-01	GBM	0.530723	0	1.707570	0	0
TCGA-02-2483-01	0	0	0	0	0	0	0	0	1	0	...	0	0	0	TCGA-02-2483-01	GBM	0.642091	0	1.662758	0	0

	ras_auroc	no_ras_auroc	ras_auprc	no_ras_auprc
ALK	10.336711	9.806159	5.136642	4.880183
ARAF	9.596890	9.838528	2.657702	2.588441
BRAF	-5.287740	-4.301018	8.447411	12.584116
EGFR	-4.273536	-3.678272	5.194929	5.384128
ERBB2	8.547825	10.703101	7.275319	7.803525