In [1]:
    
import os
import pandas as pd
    
In [2]:
    
def get_rail_id(row):
    """
    Collect the rail_ids (sample IDs) recorded for one snaptron_id
    (exon-exon junction). Meant to be applied row-wise, e.g. through
    pd.DataFrame().apply(get_rail_id, axis=1).

    Arguments:
    row - a row of the junction dataframe; row['samples'] must be a comma
          separated string. Empty entries (such as the one produced by a
          leading or trailing comma) are skipped, and for every other entry
          the text before its first ':' is taken as the sample ID.

    Output: a list of sample ID strings, in their order of appearance
    """
    entries = row['samples'].split(',')
    return [entry.split(':')[0] for entry in entries if entry != '']
    
In [3]:
    
# Read the sample table and keep only the two columns needed to translate
# a rail_id into its sample submitter id
sample_file = 'samples.tsv.gz'
dictionary_df = pd.read_table(sample_file, low_memory=False)[
    ['rail_id', 'gdc_cases.samples.submitter_id']
]
dictionary_df.head(n=2)
    
    Out[3]:
In [4]:
    
# Read the TP53 junction table (tab-delimited)
junction_file = 'tp53_junctions.txt.gz'
junction_df = pd.read_table(filepath_or_buffer=junction_file)
junction_df.head(n=2)
    
    Out[4]:
In [5]:
    
# Read the TP53 mutation classification scores; the first column of the
# file becomes the index
file = os.path.join(
    os.pardir, os.pardir, 'classifiers', 'TP53', 'tables',
    'mutation_classification_scores.tsv',
)
mut_scores_df = pd.read_table(file, index_col=0)
mut_scores_df.head(n=2)
    
    Out[5]:
In [6]:
    
# Read the raw (per-variant) mutation calls
file = os.path.join(
    os.pardir, os.pardir, 'data', 'raw', 'mc3.v0.2.8.PUBLIC.maf.gz',
)
raw_mut_df = pd.read_table(file)
raw_mut_df.head(n=5)
    
    
    Out[6]:
In [7]:
    
# Read the binary mutation matrix; the first column of the file becomes
# the index
file = os.path.join(
    os.pardir, os.pardir, 'data', 'pancan_mutation_freeze.tsv',
)
mut_df = pd.read_table(file, index_col=0)
mut_df.head(n=2)
    
    Out[7]:
In [8]:
    
# Keep only the raw mutation calls that are the TP53 c.375G>T variant
silent_mut_df = raw_mut_df.loc[
    (raw_mut_df['Hugo_Symbol'] == 'TP53')
    & (raw_mut_df['HGVSc'] == 'c.375G>T')
]

# Sample IDs carrying that variant: first 15 characters of the tumor barcode
silent_mut_samples = silent_mut_df['Tumor_Sample_Barcode'].str[:15]
print(len(silent_mut_samples))

# Drop samples that also have a different mutation in TP53: look each sample
# up in the binary mutation matrix and keep those whose TP53 entry is 0.
# Samples missing from mut_df come back as NaN from reindex, so they fail
# the == 0 check and are dropped as well.
only_silent_mut_samples = (
    mut_df.reindex(silent_mut_samples)['TP53']
    .loc[lambda tp53_status: tp53_status == 0]
    .index
    .tolist()
)
print(len(only_silent_mut_samples))

# Restrict the classification scores to the samples retained above
mut_silent_scores_df = mut_scores_df.loc[
    mut_scores_df.index.isin(only_silent_mut_samples)
]
print(mut_silent_scores_df.shape)
mut_silent_scores_df.head(n=2)
    
    
    Out[8]:
In [9]:
    
# Build the per-sample junction table and write it to disk
out_file = 'tp53_junctions_with_mutations.csv.gz'

# Absolute distance of every junction start/end from position 7675994
# (NOTE(review): presumably the genomic coordinate of the TP53 c.375G>T
# site -- confirm), with the junctions ordered by start distance
junctions_process_df = (
    junction_df
    .assign(
        diff_start=(7675994 - junction_df['start']).abs(),
        diff_end=(7675994 - junction_df['end']).abs(),
    )
    .sort_values(by='diff_start')
)

# Reshape to long format, one row per (junction, sample) pair:
#   1. attach the list of rail_ids parsed from each row's 'samples' field
#   2. move the junction annotation columns into the index so they are
#      carried along for every sample
#   3. spread each list over columns and stack them; the stacked sample IDs
#      end up in a column labelled 0
#   4. cast that column to int and use it to look up the submitter id of
#      each rail_id in dictionary_df
junctions_process_df = (
    junctions_process_df
    .assign(rail_id=lambda junctions: junctions.apply(get_rail_id, axis=1))
    .set_index([
        'snaptron_id', 'start', 'end', 'length', 'strand',
        'left_motif', 'right_motif', 'samples_count',
        'coverage_sum', 'coverage_avg', 'coverage_median',
        'diff_start', 'diff_end',
    ])['rail_id']
    .apply(pd.Series)
    .stack()
    .reset_index()
    .astype({0: int})
    .merge(dictionary_df, left_on=0, right_on='rail_id')
)

# tcga_id is the first 15 characters of the submitter id; it is the key used
# to join against the index of the mutation classification scores
junctions_process_df = (
    junctions_process_df
    .assign(
        tcga_id=lambda merged: (
            merged['gdc_cases.samples.submitter_id'].str[:15]
        )
    )
    .merge(mut_scores_df, left_on='tcga_id', right_index=True)
)

junctions_process_df.to_csv(out_file, compression='gzip')
junctions_process_df.head(n=2)
    
    Out[9]: