Processing TP53 Exon-Exon Junction Data

Gregory Way 2017

Reading in junction and sample files and merging with TP53 mutation data.



In [1]:

    
import os
import pandas as pd



In [2]:

    
def get_rail_id(row):
    """
    Extract specific rail_ids from complex data structure that assigns rail_ids
    (Sample IDs) to snaptron_ids (exon-exon junctions). Designed to be used
    as a pd.DataFrame().apply() function.

    Arguments:
    row - a row in the junction dataframe

    Output: a list of sample IDs with the specific snaptron ID
    """

    row = row['samples'].split(',')

    all_sample_ids = []
    for sample_id in row:
        if sample_id != '':
            sample_id = sample_id.split(':')[0]
            all_sample_ids.append(sample_id)
    return(all_sample_ids)

First, load several files required for processing



In [3]:

    
# Load Sample File
sample_file = 'samples.tsv.gz'

dictionary_df = (
    pd.read_table(sample_file, low_memory=False)
    .loc[:, ['rail_id', 'gdc_cases.samples.submitter_id']]
)
dictionary_df.head(2)









    Out[3]:







  
    
      
      rail_id
      gdc_cases.samples.submitter_id
    
  
  
    
      0
      0
      TCGA-CD-8534-01A
    
    
      1
      1
      TCGA-ER-A19A-06A



In [4]:

    
# Load junctions file
junction_file = 'tp53_junctions.txt.gz'

junction_df = pd.read_table(junction_file)
junction_df.head(2)









    Out[4]:







  
    
      
      DataSource:Type
      snaptron_id
      chromosome
      start
      end
      length
      strand
      annotated
      left_motif
      right_motif
      left_annotated
      right_annotated
      samples
      samples_count
      coverage_sum
      coverage_avg
      coverage_median
      source_dataset_id
    
  
  
    
      0
      TCGA:I
      13874804
      chr17
      7170387
      7668636
      498250
      -
      0
      GC
      AG
      0
      0
      ,1978:1
      1
      1
      1.0
      1.0
      4
    
    
      1
      TCGA:I
      13874806
      chr17
      7170396
      7668645
      498250
      +
      0
      AT
      AC
      0
      0
      ,11160:1
      1
      1
      1.0
      1.0
      4



In [5]:

    
# Load mutation classification scores file
file = os.path.join('..', '..', 'classifiers', 'TP53',
                    'tables', 'mutation_classification_scores.tsv')

mut_scores_df = pd.read_table(file, index_col=0)
mut_scores_df.head(2)









    Out[5]:







  
    
      
      log10_mut
      total_status
      weight
      TP53
      TP53_loss
      PATIENT_BARCODE
      DISEASE
      SUBTYPE
      hypermutated
      include
      ID.1
      Tumor_Sample_Barcode
      Hugo_Symbol
      HGVSc
      HGVSp
      Variant_Classification
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-02-0047-01
      1.812913
      0.0
      0.723395
      0.0
      0.0
      TCGA-02-0047
      GBM
      IDHwt
      0.0
      1.0
      TCGA-02-0047-01
      NaN
      NaN
      NaN
      NaN
      Wild-Type
    
    
      TCGA-02-0055-01
      1.707570
      1.0
      0.698910
      1.0
      0.0
      TCGA-02-0055
      GBM
      IDHwt
      0.0
      1.0
      TCGA-02-0055-01
      TCGA-02-0055-01A-01D-1490-08
      TP53
      c.646G>A
      p.Val216Met
      Missense_Mutation



In [6]:

    
# Load raw mutation file
file = os.path.join('..', '..', 'data', 'raw', 'mc3.v0.2.8.PUBLIC.maf.gz')

raw_mut_df = pd.read_table(file)
raw_mut_df.head()









    



/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[6]:







  
    
      
      Hugo_Symbol
      Entrez_Gene_Id
      Center
      NCBI_Build
      Chromosome
      Start_Position
      End_Position
      Strand
      Variant_Classification
      Variant_Type
      ...
      ExAC_AF_NFE
      ExAC_AF_OTH
      ExAC_AF_SAS
      GENE_PHENO
      FILTER
      COSMIC
      CENTERS
      CONTEXT
      DBVS
      NCALLERS
    
  
  
    
      0
      TACC2
      0
      .
      GRCh37
      10
      123810032
      123810032
      +
      Missense_Mutation
      SNP
      ...
      .
      .
      .
      .
      PASS
      SITE|p.T38M|c.113C>T|3
      MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
      GGACACGCCCG
      by1000G
      5
    
    
      1
      JAKMIP3
      0
      .
      GRCh37
      10
      133967449
      133967449
      +
      Silent
      SNP
      ...
      .
      .
      .
      .
      PASS
      NONE
      MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
      CTGGACGAGGA
      byFrequency
      5
    
    
      2
      PANX3
      0
      .
      GRCh37
      11
      124489539
      124489539
      +
      Missense_Mutation
      SNP
      ...
      .
      .
      .
      .
      PASS
      SITE|p.R296Q|c.887G>A|3
      MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
      ATGTCGGTGGG
      .
      5
    
    
      3
      SPI1
      0
      .
      GRCh37
      11
      47380512
      47380512
      +
      Missense_Mutation
      SNP
      ...
      .
      .
      .
      .
      PASS
      NONE
      RADIA|MUSE
      GGCTGGGGACA
      .
      2
    
    
      4
      NAALAD2
      0
      .
      GRCh37
      11
      89868837
      89868837
      +
      Missense_Mutation
      SNP
      ...
      .
      .
      .
      .
      PASS
      SITE|p.R65C|c.193C>T|4
      MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
      TTCTTCGGTAA
      .
      5
    
  

5 rows × 114 columns



In [7]:

    
# Load binary mutation file
file = os.path.join('..', '..', 'data', 'pancan_mutation_freeze.tsv')

mut_df = pd.read_table(file, index_col=0)
mut_df.head(2)









    Out[7]:







  
    
      
      5S_rRNA
      A1BG
      A1CF
      A2M
      A2ML1
      A3GALT2
      A4GALT
      A4GNT
      AAAS
      AACS
      ...
      ZYX
      ZZEF1
      ZZZ3
      hsa-mir-1199
      hsa-mir-150
      hsa-mir-3171
      hsa-mir-466
      hsa-mir-5195
      hsa-mir-6080
      hsa-mir-7162
    
    
      SAMPLE_BARCODE
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-02-0047-01
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      TCGA-02-0055-01
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

2 rows × 20938 columns

Next, select the samples with the specific mutation of interest

Also make sure that we only select samples in which this silent mutation is the only TP53 mutation present.



In [8]:

    
# Subset mutation file to samples with c375GT TP53 mutations
silent_mut_df = (
    raw_mut_df.query('Hugo_Symbol == "TP53"')
    .query('HGVSc == "c.375G>T"')
)

# Obtain the samples with the specific mutation
silent_mut_samples = silent_mut_df.Tumor_Sample_Barcode.str.slice(start=0, stop=15)
print(len(silent_mut_samples))

# From these, remove samples that also have a different mutation in TP53
only_silent_mut_samples = (
    mut_df.reindex(silent_mut_samples)
    .loc[:, 'TP53']
)

only_silent_mut_samples = (
    only_silent_mut_samples.loc[only_silent_mut_samples == 0]
    .index
    .tolist()
)

print(len(only_silent_mut_samples))

# Select those samples in which we have mutation classification scores
mut_silent_scores_df = (
    mut_scores_df
    .loc[mut_scores_df.index.isin(only_silent_mut_samples), :]
)

print(mut_silent_scores_df.shape)
mut_silent_scores_df.head(2)









    



31
20
(20, 16)






    Out[8]:







  
    
      
      log10_mut
      total_status
      weight
      TP53
      TP53_loss
      PATIENT_BARCODE
      DISEASE
      SUBTYPE
      hypermutated
      include
      ID.1
      Tumor_Sample_Barcode
      Hugo_Symbol
      HGVSc
      HGVSp
      Variant_Classification
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      TCGA-05-4415-01
      2.271842
      0.0
      0.642171
      0.0
      0.0
      TCGA-05-4415
      LUAD
      Not_Applicable
      0.0
      1.0
      TCGA-05-4415-01
      TCGA-05-4415-01A-22D-1855-08
      TP53
      c.375G>T
      p.%3D
      Silent
    
    
      TCGA-18-3417-01
      2.409933
      0.0
      0.719642
      0.0
      0.0
      TCGA-18-3417
      LUSC
      Not_Applicable
      0.0
      1.0
      TCGA-18-3417-01
      TCGA-18-3417-01A-01D-1441-08
      TP53
      c.375G>T
      p.%3D
      Silent

Finally, process the junctions file and output the results for plotting



In [9]:

    
# Process and output junction file
out_file = 'tp53_junctions_with_mutations.csv.gz'

junctions_process_df = (
    junction_df.assign(diff_start = abs(7675994 - junction_df['start']),
                       diff_end = abs(7675994 - junction_df['end']))
    .sort_values(by = "diff_start")
)

junctions_process_df = (
    junctions_process_df
    .assign(rail_id = junctions_process_df.apply(get_rail_id, axis=1))
    .set_index(['snaptron_id', 'start', 'end', 'length', 'strand',
                'left_motif', 'right_motif', 'samples_count',
                'coverage_sum', 'coverage_avg', 'coverage_median',
                'diff_start', 'diff_end'])['rail_id']
    .apply(pd.Series)
    .stack()
    .reset_index()
)

junctions_process_df[0] = junctions_process_df[0].astype(int)

junctions_process_df = (
    junctions_process_df
    .merge(dictionary_df, left_on=0, right_on='rail_id')
)

junctions_process_df = (
    junctions_process_df
    .assign(
        tcga_id = junctions_process_df['gdc_cases.samples.submitter_id']
        .str
        .slice(start=0, stop=15)
    )
    .merge(mut_scores_df, left_on='tcga_id', right_index=True)
)

junctions_process_df.to_csv(out_file, compression='gzip')
junctions_process_df.head(2)









    Out[9]:







  
    
      
      snaptron_id
      start
      end
      length
      strand
      left_motif
      right_motif
      samples_count
      coverage_sum
      coverage_avg
      ...
      DISEASE
      SUBTYPE
      hypermutated
      include
      ID.1
      Tumor_Sample_Barcode
      Hugo_Symbol
      HGVSc
      HGVSp
      Variant_Classification
    
  
  
    
      13
      13945731
      7676009
      7676046
      38
      -
      GT
      AG
      1
      1
      1.0
      ...
      OV
      Not_Applicable
      0.0
      0.0
      TCGA-25-2404-01
      TCGA-25-2404-01A-01W-0799-08
      TP53
      c.641A>G
      p.His214Arg
      Missense_Mutation
    
    
      14
      13945751
      7676064
      7676130
      67
      +
      GT
      AG
      6
      6
      1.0
      ...
      OV
      Not_Applicable
      0.0
      0.0
      TCGA-25-2404-01
      TCGA-25-2404-01A-01W-0799-08
      TP53
      c.641A>G
      p.His214Arg
      Missense_Mutation
    
  

2 rows × 34 columns

	DataSource:Type	snaptron_id	chromosome	start	end	length	strand	annotated	left_motif	right_motif	left_annotated	right_annotated	samples	samples_count	coverage_sum	coverage_avg	coverage_median	source_dataset_id
0	TCGA:I	13874804	chr17	7170387	7668636	498250	-	0	GC	AG	0	0	,1978:1	1	1	1.0	1.0	4
1	TCGA:I	13874806	chr17	7170396	7668645	498250	+	0	AT	AC	0	0	,11160:1	1	1	1.0	1.0	4

	log10_mut	total_status	weight	TP53	TP53_loss	PATIENT_BARCODE	DISEASE	SUBTYPE	hypermutated	include	ID.1	Tumor_Sample_Barcode	Hugo_Symbol	HGVSc	HGVSp	Variant_Classification
ID
TCGA-02-0047-01	1.812913	0.0	0.723395	0.0	0.0	TCGA-02-0047	GBM	IDHwt	0.0	1.0	TCGA-02-0047-01	NaN	NaN	NaN	NaN	Wild-Type
TCGA-02-0055-01	1.707570	1.0	0.698910	1.0	0.0	TCGA-02-0055	GBM	IDHwt	0.0	1.0	TCGA-02-0055-01	TCGA-02-0055-01A-01D-1490-08	TP53	c.646G>A	p.Val216Met	Missense_Mutation

	Hugo_Symbol	Center	NCBI_Build	Chromosome	Start_Position	End_Position	Strand	Variant_Classification	Variant_Type	...	ExAC_AF_NFE	ExAC_AF_OTH	ExAC_AF_SAS	GENE_PHENO	FILTER	COSMIC	CENTERS	CONTEXT	DBVS	NCALLERS
0	TACC2	.	GRCh37	10	123810032	123810032	+	Missense_Mutation	SNP	...	.	.	.	.	PASS	SITE\|p.T38M\|c.113C>T\|3	MUTECT\|RADIA\|SOMATICSNIPER\|MUSE\|VARSCANS	GGACACGCCCG	by1000G	5
1	JAKMIP3	.	GRCh37	10	133967449	133967449	+	Silent	SNP	...	.	.	.	.	PASS	NONE	MUTECT\|RADIA\|SOMATICSNIPER\|MUSE\|VARSCANS	CTGGACGAGGA	byFrequency	5
2	PANX3	.	GRCh37	11	124489539	124489539	+	Missense_Mutation	SNP	...	.	.	.	.	PASS	SITE\|p.R296Q\|c.887G>A\|3	MUTECT\|RADIA\|SOMATICSNIPER\|MUSE\|VARSCANS	ATGTCGGTGGG	.	5
3	SPI1	.	GRCh37	11	47380512	47380512	+	Missense_Mutation	SNP	...	.	.	.	.	PASS	NONE	RADIA\|MUSE	GGCTGGGGACA	.	2
4	NAALAD2	.	GRCh37	11	89868837	89868837	+	Missense_Mutation	SNP	...	.	.	.	.	PASS	SITE\|p.R65C\|c.193C>T\|4	MUTECT\|RADIA\|SOMATICSNIPER\|MUSE\|VARSCANS	TTCTTCGGTAA	.	5

	5S_rRNA	A1BG	A1CF	A2M	A2ML1	A3GALT2	A4GALT	A4GNT	AAAS	AACS	...	ZYX	ZZEF1	ZZZ3	hsa-mir-1199	hsa-mir-150	hsa-mir-3171	hsa-mir-466	hsa-mir-5195	hsa-mir-6080	hsa-mir-7162
SAMPLE_BARCODE
TCGA-02-0047-01	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
TCGA-02-0055-01	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	log10_mut	total_status	weight	TP53	TP53_loss	PATIENT_BARCODE	DISEASE	SUBTYPE	hypermutated	include	ID.1	Tumor_Sample_Barcode	Hugo_Symbol	HGVSc	HGVSp	Variant_Classification
ID
TCGA-05-4415-01	2.271842	0.0	0.642171	0.0	0.0	TCGA-05-4415	LUAD	Not_Applicable	0.0	1.0	TCGA-05-4415-01	TCGA-05-4415-01A-22D-1855-08	TP53	c.375G>T	p.%3D	Silent
TCGA-18-3417-01	2.409933	0.0	0.719642	0.0	0.0	TCGA-18-3417	LUSC	Not_Applicable	0.0	1.0	TCGA-18-3417-01	TCGA-18-3417-01A-01D-1441-08	TP53	c.375G>T	p.%3D	Silent

	snaptron_id	start	end	length	strand	left_motif	right_motif	samples_count	coverage_sum	coverage_avg	...	DISEASE	SUBTYPE	hypermutated	include	ID.1	Tumor_Sample_Barcode	Hugo_Symbol	HGVSc	HGVSp	Variant_Classification
13	13945731	7676009	7676046	38	-	GT	AG	1	1	1.0	...	OV	Not_Applicable	0.0	0.0	TCGA-25-2404-01	TCGA-25-2404-01A-01W-0799-08	TP53	c.641A>G	p.His214Arg	Missense_Mutation
14	13945751	7676064	7676130	67	+	GT	AG	6	6	1.0	...	OV	Not_Applicable	0.0	0.0	TCGA-25-2404-01	TCGA-25-2404-01A-01W-0799-08	TP53	c.641A>G	p.His214Arg	Missense_Mutation