In [1]:
import os
import pandas as pd
In [2]:
def get_rail_id(row):
    """
    Extract the rail_ids (sample IDs) attached to one junction row.

    The `samples` field is split on commas into entries; each non-empty entry
    is split on ':' and only the leading piece (the rail_id) is kept.
    Designed to be used as a pd.DataFrame().apply(..., axis=1) function.

    Arguments:
    row - a row in the junction dataframe; anything supporting row['samples']

    Output: a list of rail_id strings, in the order they appear in `samples`.
    An empty list is returned when `samples` is not a string (e.g. NaN or
    None for a missing field) instead of raising AttributeError.
    """
    samples = row['samples']

    # pandas reads an empty field as NaN (a float), which has no .split()
    if not isinstance(samples, str):
        return []

    # Empty entries come from leading/trailing/doubled commas; skip them
    return [entry.split(':')[0] for entry in samples.split(',') if entry != '']
In [3]:
# Load Sample File
# Only the two columns that pair a rail_id with its
# gdc_cases.samples.submitter_id are kept as the lookup table.
sample_file = 'samples.tsv.gz'
dictionary_df = pd.read_csv(sample_file, sep='\t', low_memory=False)
dictionary_df = dictionary_df[['rail_id', 'gdc_cases.samples.submitter_id']]
dictionary_df.head(2)
Out[3]:
In [4]:
# Load junctions file (tab-separated, gzip-compressed)
junction_file = 'tp53_junctions.txt.gz'
junction_df = pd.read_csv(junction_file, sep='\t')
junction_df.head(2)
Out[4]:
In [5]:
# Load mutation classification scores file
# The first column of the table is used as the row index.
file = os.path.join(
    '..', '..', 'classifiers', 'TP53', 'tables',
    'mutation_classification_scores.tsv'
)
mut_scores_df = pd.read_csv(file, sep='\t', index_col=0)
mut_scores_df.head(2)
Out[5]:
In [6]:
# Load raw mutation file (tab-separated MAF, gzip-compressed)
file = os.path.join(
    '..', '..', 'data', 'raw', 'mc3.v0.2.8.PUBLIC.maf.gz'
)
raw_mut_df = pd.read_csv(file, sep='\t')
raw_mut_df.head()
Out[6]:
In [7]:
# Load binary mutation file
# The first column of the table is used as the row index.
file = os.path.join(
    '..', '..', 'data', 'pancan_mutation_freeze.tsv'
)
mut_df = pd.read_csv(file, sep='\t', index_col=0)
mut_df.head(2)
Out[7]:
In [8]:
# Subset mutation file to samples with c375GT TP53 mutations:
# rows of the raw mutation table where the gene is TP53 and HGVSc is c.375G>T
silent_mut_df = raw_mut_df.query('Hugo_Symbol == "TP53"').query('HGVSc == "c.375G>T"')

# Obtain the samples with the specific mutation; barcodes are truncated to
# their first 15 characters before being looked up in mut_df
silent_mut_samples = silent_mut_df['Tumor_Sample_Barcode'].str[:15]
print(len(silent_mut_samples))

# From these, keep only the samples whose TP53 entry in the binary mutation
# table equals 0 (i.e. drop samples that also have a different TP53 mutation).
# Barcodes absent from mut_df become NaN after reindex and are dropped too.
only_silent_mut_samples = (
    mut_df.reindex(silent_mut_samples)['TP53']
    .loc[lambda tp53_status: tp53_status == 0]
    .index
    .tolist()
)
print(len(only_silent_mut_samples))

# Select those samples in which we have mutation classification scores
has_only_silent_mutation = mut_scores_df.index.isin(only_silent_mut_samples)
mut_silent_scores_df = mut_scores_df[has_only_silent_mutation]
print(mut_silent_scores_df.shape)
mut_silent_scores_df.head(2)
Out[8]:
In [9]:
# Process and output junction file
out_file = 'tp53_junctions_with_mutations.csv.gz'

# Stage 1: reshape to one row per (junction, rail_id) pair.
#   * diff_start / diff_end hold the absolute distance of each junction
#     boundary from position 7675994 (presumably the genomic coordinate of the
#     c.375G>T variant -- TODO confirm); rows are ordered by diff_start.
#   * The per-junction rail_id lists are spread into columns and stacked, so
#     after reset_index() every rail_id lives in the column labelled 0, which
#     is then cast from string to int.
junctions_process_df = (
    junction_df
    .assign(
        diff_start=lambda df: (7675994 - df['start']).abs(),
        diff_end=lambda df: (7675994 - df['end']).abs()
    )
    .sort_values(by='diff_start')
    .assign(rail_id=lambda df: df.apply(get_rail_id, axis=1))
    .set_index([
        'snaptron_id', 'start', 'end', 'length', 'strand',
        'left_motif', 'right_motif', 'samples_count',
        'coverage_sum', 'coverage_avg', 'coverage_median',
        'diff_start', 'diff_end'
    ])['rail_id']
    .apply(pd.Series)
    .stack()
    .reset_index()
    .astype({0: int})
)

# Stage 2: annotate each pair.
#   * Join on rail_id to pick up gdc_cases.samples.submitter_id.
#   * tcga_id is the first 15 characters of that submitter id.
#   * Join to the classifier scores, whose index is matched against tcga_id.
# Both merges use the default (inner) join, so unmatched rows are dropped.
junctions_process_df = (
    junctions_process_df
    .merge(dictionary_df, left_on=0, right_on='rail_id')
    .assign(
        tcga_id=lambda df: df['gdc_cases.samples.submitter_id'].str[:15]
    )
    .merge(mut_scores_df, left_on='tcga_id', right_index=True)
)

junctions_process_df.to_csv(out_file, compression='gzip')
junctions_process_df.head(2)
Out[9]: