Processing TP53 Exon-Exon Junction Data

Gregory Way 2017

Reading in junction and sample files and merging with TP53 mutation data.


In [1]:
import os
import pandas as pd

In [2]:
def get_rail_id(row):
    """
    Extract specific rail_ids from complex data structure that assigns rail_ids
    (Sample IDs) to snaptron_ids (exon-exon junctions). Designed to be used
    as a pd.DataFrame().apply() function.

    Arguments:
    row - a row in the junction dataframe

    Output: a list of sample IDs with the specific snaptron ID
    """

    row = row['samples'].split(',')

    all_sample_ids = []
    for sample_id in row:
        if sample_id != '':
            sample_id = sample_id.split(':')[0]
            all_sample_ids.append(sample_id)
    return(all_sample_ids)

First, load several files required for processing


In [3]:
# Load Sample File
sample_file = 'samples.tsv.gz'

dictionary_df = (
    pd.read_table(sample_file, low_memory=False)
    .loc[:, ['rail_id', 'gdc_cases.samples.submitter_id']]
)
dictionary_df.head(2)


Out[3]:
rail_id gdc_cases.samples.submitter_id
0 0 TCGA-CD-8534-01A
1 1 TCGA-ER-A19A-06A

In [4]:
# Load junctions file
junction_file = 'tp53_junctions.txt.gz'

junction_df = pd.read_table(junction_file)
junction_df.head(2)


Out[4]:
DataSource:Type snaptron_id chromosome start end length strand annotated left_motif right_motif left_annotated right_annotated samples samples_count coverage_sum coverage_avg coverage_median source_dataset_id
0 TCGA:I 13874804 chr17 7170387 7668636 498250 - 0 GC AG 0 0 ,1978:1 1 1 1.0 1.0 4
1 TCGA:I 13874806 chr17 7170396 7668645 498250 + 0 AT AC 0 0 ,11160:1 1 1 1.0 1.0 4

In [5]:
# Load mutation classification scores file
file = os.path.join('..', '..', 'classifiers', 'TP53',
                    'tables', 'mutation_classification_scores.tsv')

mut_scores_df = pd.read_table(file, index_col=0)
mut_scores_df.head(2)


Out[5]:
log10_mut total_status weight TP53 TP53_loss PATIENT_BARCODE DISEASE SUBTYPE hypermutated include ID.1 Tumor_Sample_Barcode Hugo_Symbol HGVSc HGVSp Variant_Classification
ID
TCGA-02-0047-01 1.812913 0.0 0.723395 0.0 0.0 TCGA-02-0047 GBM IDHwt 0.0 1.0 TCGA-02-0047-01 NaN NaN NaN NaN Wild-Type
TCGA-02-0055-01 1.707570 1.0 0.698910 1.0 0.0 TCGA-02-0055 GBM IDHwt 0.0 1.0 TCGA-02-0055-01 TCGA-02-0055-01A-01D-1490-08 TP53 c.646G>A p.Val216Met Missense_Mutation

In [6]:
# Load raw mutation file
file = os.path.join('..', '..', 'data', 'raw', 'mc3.v0.2.8.PUBLIC.maf.gz')

raw_mut_df = pd.read_table(file)
raw_mut_df.head()


/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[6]:
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type ... ExAC_AF_NFE ExAC_AF_OTH ExAC_AF_SAS GENE_PHENO FILTER COSMIC CENTERS CONTEXT DBVS NCALLERS
0 TACC2 0 . GRCh37 10 123810032 123810032 + Missense_Mutation SNP ... . . . . PASS SITE|p.T38M|c.113C>T|3 MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS GGACACGCCCG by1000G 5
1 JAKMIP3 0 . GRCh37 10 133967449 133967449 + Silent SNP ... . . . . PASS NONE MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS CTGGACGAGGA byFrequency 5
2 PANX3 0 . GRCh37 11 124489539 124489539 + Missense_Mutation SNP ... . . . . PASS SITE|p.R296Q|c.887G>A|3 MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS ATGTCGGTGGG . 5
3 SPI1 0 . GRCh37 11 47380512 47380512 + Missense_Mutation SNP ... . . . . PASS NONE RADIA|MUSE GGCTGGGGACA . 2
4 NAALAD2 0 . GRCh37 11 89868837 89868837 + Missense_Mutation SNP ... . . . . PASS SITE|p.R65C|c.193C>T|4 MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS TTCTTCGGTAA . 5

5 rows × 114 columns


In [7]:
# Load binary mutation file
file = os.path.join('..', '..', 'data', 'pancan_mutation_freeze.tsv')

mut_df = pd.read_table(file, index_col=0)
mut_df.head(2)


Out[7]:
5S_rRNA A1BG A1CF A2M A2ML1 A3GALT2 A4GALT A4GNT AAAS AACS ... ZYX ZZEF1 ZZZ3 hsa-mir-1199 hsa-mir-150 hsa-mir-3171 hsa-mir-466 hsa-mir-5195 hsa-mir-6080 hsa-mir-7162
SAMPLE_BARCODE
TCGA-02-0047-01 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TCGA-02-0055-01 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 20938 columns

Next, select the samples with the specific mutation of interest

Also make sure that we only select samples in which this silent mutation is the only TP53 mutation present.


In [8]:
# Subset mutation file to samples with c375GT TP53 mutations
silent_mut_df = (
    raw_mut_df.query('Hugo_Symbol == "TP53"')
    .query('HGVSc == "c.375G>T"')
)

# Obtain the samples with the specific mutation
silent_mut_samples = silent_mut_df.Tumor_Sample_Barcode.str.slice(start=0, stop=15)
print(len(silent_mut_samples))

# From these, remove samples that also have a different mutation in TP53
only_silent_mut_samples = (
    mut_df.reindex(silent_mut_samples)
    .loc[:, 'TP53']
)

only_silent_mut_samples = (
    only_silent_mut_samples.loc[only_silent_mut_samples == 0]
    .index
    .tolist()
)

print(len(only_silent_mut_samples))

# Select those samples in which we have mutation classification scores
mut_silent_scores_df = (
    mut_scores_df
    .loc[mut_scores_df.index.isin(only_silent_mut_samples), :]
)

print(mut_silent_scores_df.shape)
mut_silent_scores_df.head(2)


31
20
(20, 16)
Out[8]:
log10_mut total_status weight TP53 TP53_loss PATIENT_BARCODE DISEASE SUBTYPE hypermutated include ID.1 Tumor_Sample_Barcode Hugo_Symbol HGVSc HGVSp Variant_Classification
ID
TCGA-05-4415-01 2.271842 0.0 0.642171 0.0 0.0 TCGA-05-4415 LUAD Not_Applicable 0.0 1.0 TCGA-05-4415-01 TCGA-05-4415-01A-22D-1855-08 TP53 c.375G>T p.%3D Silent
TCGA-18-3417-01 2.409933 0.0 0.719642 0.0 0.0 TCGA-18-3417 LUSC Not_Applicable 0.0 1.0 TCGA-18-3417-01 TCGA-18-3417-01A-01D-1441-08 TP53 c.375G>T p.%3D Silent

Finally, process the junctions file and output the results for plotting


In [9]:
# Process and output junction file
out_file = 'tp53_junctions_with_mutations.csv.gz'

junctions_process_df = (
    junction_df.assign(diff_start = abs(7675994 - junction_df['start']),
                       diff_end = abs(7675994 - junction_df['end']))
    .sort_values(by = "diff_start")
)

junctions_process_df = (
    junctions_process_df
    .assign(rail_id = junctions_process_df.apply(get_rail_id, axis=1))
    .set_index(['snaptron_id', 'start', 'end', 'length', 'strand',
                'left_motif', 'right_motif', 'samples_count',
                'coverage_sum', 'coverage_avg', 'coverage_median',
                'diff_start', 'diff_end'])['rail_id']
    .apply(pd.Series)
    .stack()
    .reset_index()
)

junctions_process_df[0] = junctions_process_df[0].astype(int)

junctions_process_df = (
    junctions_process_df
    .merge(dictionary_df, left_on=0, right_on='rail_id')
)

junctions_process_df = (
    junctions_process_df
    .assign(
        tcga_id = junctions_process_df['gdc_cases.samples.submitter_id']
        .str
        .slice(start=0, stop=15)
    )
    .merge(mut_scores_df, left_on='tcga_id', right_index=True)
)

junctions_process_df.to_csv(out_file, compression='gzip')
junctions_process_df.head(2)


Out[9]:
snaptron_id start end length strand left_motif right_motif samples_count coverage_sum coverage_avg ... DISEASE SUBTYPE hypermutated include ID.1 Tumor_Sample_Barcode Hugo_Symbol HGVSc HGVSp Variant_Classification
13 13945731 7676009 7676046 38 - GT AG 1 1 1.0 ... OV Not_Applicable 0.0 0.0 TCGA-25-2404-01 TCGA-25-2404-01A-01W-0799-08 TP53 c.641A>G p.His214Arg Missense_Mutation
14 13945751 7676064 7676130 67 + GT AG 6 6 1.0 ... OV Not_Applicable 0.0 0.0 TCGA-25-2404-01 TCGA-25-2404-01A-01W-0799-08 TP53 c.641A>G p.His214Arg Missense_Mutation

2 rows × 34 columns