Parse A1


In [1]:
confidence=0.95

In [2]:
def get_id(line):
    return "_".join(map(str, [line['chromosome_name'], line['start'], line['stop']]))

In [3]:
import pandas as pd
table = pd.read_table("A1.tsv")
table['id']=table.apply(get_id, axis=1)
table = table.set_index('id')
table.columns


Out[3]:
Index([u'chromosome_name', u'start', u'stop', u'reference', u'variant',
       u'type', u'gene_name', u'transcript_name', u'transcript_species',
       u'transcript_source', u'transcript_version', u'strand',
       u'transcript_status', u'trv_type', u'c_position', u'amino_acid_change',
       u'ucsc_cons', u'domain', u'all_domains', u'deletion_substructures',
       u'transcript_error', u'adrenalmet.rcnt.llr3_ref',
       u'adrenalmet.rcnt.llr3_var', u'adrenalmet.rcnt.llr3_VAF',
       u'livermet.rcnt.llr3_ref', u'livermet.rcnt.llr3_var',
       u'livermet.rcnt.llr3_VAF', u'lungmet.rcnt.llr3_ref',
       u'lungmet.rcnt.llr3_var', u'lungmet.rcnt.llr3_VAF',
       u'spinalmet.rcnt.llr3_ref', u'spinalmet.rcnt.llr3_var',
       u'spinalmet.rcnt.llr3_VAF', u'tumor.rcnt.llr3_ref',
       u'tumor.rcnt.llr3_var', u'tumor.rcnt.llr3_VAF', u'cluster'],
      dtype='object')

In [4]:
table.head()


Out[4]:
chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source ... lungmet.rcnt.llr3_ref lungmet.rcnt.llr3_var lungmet.rcnt.llr3_VAF spinalmet.rcnt.llr3_ref spinalmet.rcnt.llr3_var spinalmet.rcnt.llr3_VAF tumor.rcnt.llr3_ref tumor.rcnt.llr3_var tumor.rcnt.llr3_VAF cluster
id
1_110359957_110359957 1 110359957 110359957 C T SNP AHCYL1 NM_006621 human genbank ... 151 0 0.00 142 0 0.00 150 4 2.58 7
6_48902663_48902663 6 48902663 48902663 T G SNP ENSG00000221175 ENST00000408248 human ensembl ... 147 1 0.68 178 0 0.00 189 0 0.00 6
6_49276467_49276467 6 49276467 49276467 G C SNP - - - - ... 85 0 0.00 63 17 21.25 117 1 0.85 9
6_50173697_50173697 6 50173697 50173697 C G SNP DEFB112 NM_001037498 human genbank ... 126 0 0.00 115 30 20.69 179 0 0.00 9
6_50179508_50179508 6 50179508 50179508 A T SNP - - - - ... 113 0 0.00 128 0 0.00 141 0 0.00 6

5 rows × 37 columns


In [21]:
n = len(table)
rows = ["5 #anatomical sites\n5 #samples\n%d #mutations\n#sample_index\tsample_label\tanatomical_site_index\tanatomical_site_label\tcharacter_index\tcharacter_label\tref\tvar\n" % n]

ref_cols = ['tumor.rcnt.llr3_ref','adrenalmet.rcnt.llr3_ref', 'livermet.rcnt.llr3_ref', 'lungmet.rcnt.llr3_ref', 'spinalmet.rcnt.llr3_ref']
var_cols = ['tumor.rcnt.llr3_var','adrenalmet.rcnt.llr3_var', 'livermet.rcnt.llr3_var', 'lungmet.rcnt.llr3_var', 'spinalmet.rcnt.llr3_var']

cols = ['breast', 'adrenal', 'liver', 'lung', 'spinal']
samples = {}
samples['breast'] = 'tumor.rcnt.llr3'
samples['adrenal'] = 'adrenalmet.rcnt.llr3'
samples['liver'] = 'livermet.rcnt.llr3'
samples['lung'] = 'lungmet.rcnt.llr3'
samples['spinal'] = 'spinalmet.rcnt.llr3'

charIndex = {}
def print_char(row, sam):
    global charIndex
    if str(row.name).replace("_", ":") not in charIndex:
        charIndex[str(row.name).replace("_", ":")] = len(charIndex)
    return "\t".join(map(str,[i, sam, i, sam, charIndex[str(row.name).replace("_", ":")], str(row.name).replace("_", ":"), row[samples[sam] + '_ref'], row[samples[sam] + '_var']])) 

for i, sam in enumerate(cols):
    rows += list(table.apply(print_char, args=[sam], axis=1))
    
with open("A1_SNV_MACHINA.tsv", "w") as f:
    for row in rows:
        f.write(row + "\n")

Write Treeomics input files


In [3]:
ref_cols = ['tumor.rcnt.llr3_ref','adrenalmet.rcnt.llr3_ref', 'livermet.rcnt.llr3_ref', 'lungmet.rcnt.llr3_ref', 'spinalmet.rcnt.llr3_ref']
var_cols = ['tumor.rcnt.llr3_var','adrenalmet.rcnt.llr3_var', 'livermet.rcnt.llr3_var', 'lungmet.rcnt.llr3_var', 'spinalmet.rcnt.llr3_var']

cols = ['breast', 'adrenal', 'liver', 'lung', 'spinal']
table2 = table[['cluster', 'gene_name', 'chromosome_name', 'start'] + ref_cols + var_cols]
table2.columns = ['cluster', 'gene_name', 'chromosome_name', 'start']+['ref-'+c for c in cols] + ['var-'+c for c in cols] 

with open("A1_treeomics_coverage.txt", "w") as f:
    f.write("\t".join(["Chromosome", "Position", "Change", "Gene", "breast_0", "adrenal_0", "liver_0", "lung_0", "spinal_0"]) + "\n")
    for i, row in table2.iterrows():
        f.write("\t".join(map(str, [row['chromosome_name'], row['start'], 'X>Y', row['gene_name'], 
                                    row['ref-breast'] + row['var-breast'], 
                                    row['ref-adrenal'] + row['var-adrenal'], 
                                    row['ref-liver'] + row['var-liver'], 
                                    row['ref-lung'] + row['var-lung'], 
                                    row['ref-spinal'] + row['var-spinal']])) + "\n")
    
with open("A1_treeomics_mut.txt", "w") as f:
    f.write("\t".join(["Chromosome", "Position", "Change", "Gene", "breast_0", "adrenal_0", "liver_0", "lung_0", "spinal_0"]) + "\n")
    for i, row in table2.iterrows():
        f.write("\t".join(map(str, [row['chromosome_name'], row['start'], 'X>Y', row['gene_name'], 
                                    row['var-breast'], row['var-adrenal'], row['var-liver'], 
                                    row['var-lung'], row['var-spinal']])) + "\n")

Generate MACHINA input file


In [4]:
ref_cols = ['tumor.rcnt.llr3_ref','adrenalmet.rcnt.llr3_ref', 'livermet.rcnt.llr3_ref', 'lungmet.rcnt.llr3_ref', 'spinalmet.rcnt.llr3_ref']
var_cols = ['tumor.rcnt.llr3_var','adrenalmet.rcnt.llr3_var', 'livermet.rcnt.llr3_var', 'lungmet.rcnt.llr3_var', 'spinalmet.rcnt.llr3_var']

#breast,adrenal,liver,lung,spinal
cols = ['breast', 'adrenal', 'liver', 'lung', 'spinal']
table = table[['cluster']+ref_cols+var_cols]
table.columns = ['cluster']+['ref-'+c for c in cols] + ['var-'+c for c in cols]

Get intervals


In [9]:
ctable = table.groupby('cluster').sum()

global corrected_confidence
nsamples = len([c for c in ctable.columns if c.startswith('ref')])
nclusters = len(ctable)
corrected_confidence = 1-((1.-confidence)/(nsamples*nclusters))
print corrected_confidence

assert(corrected_confidence > confidence)
assert(corrected_confidence < 1.0)


0.998888888889

In [10]:
import numpy
from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(n, N, pct, a=1, b=1, n_pbins=1e3):
    """
    Function computes the posterior mode along with the upper and lower bounds of the
    **Highest Posterior Density Region**.

    Parameters
    ----------
    n: number of successes 
    N: sample size 
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper)

    """
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = numpy.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability 
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    if min_p > 1:
        min_p = 1.
    # make the range of success probabilities
    p_range = numpy.linspace(min_p, max_p, n_pbins+1)
    # construct the probability mass function over the given range
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # find the upper and lower bounds of the interval 
    sorted_idxs = numpy.argsort( pmf )[::-1]
    cumsum = numpy.cumsum( numpy.sort(pmf)[::-1] )
    j = numpy.argmin( numpy.abs(cumsum - pct) )
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]    

    return (mode, lower, upper)

In [14]:
def get_ub(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    return v[2]
    

def get_lb(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[1]
    #if mval < 0.01: mval = 0
    return mval

def get_mean(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[0]
    return mval

ctable = table.groupby('cluster').sum()
for sam in cols:
    ctable['ub-'+sam]= ctable.apply(get_ub, args=[sam], axis=1)
    ctable['lb-'+sam]= ctable.apply(get_lb, args=[sam], axis=1)
    ctable[sam]= ctable.apply(get_mean, args=[sam], axis=1)

In [15]:
def get_ub(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    return v[2]
    

def get_lb(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[1]
    if mval < 0.01: mval = 0
    return mval

def get_mean(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[0]
    return mval

ctable_cutoff = table.groupby('cluster').sum()
for sam in cols:
    ctable_cutoff['ub-'+sam]= ctable.apply(get_ub, args=[sam], axis=1)
    ctable_cutoff['lb-'+sam]= ctable.apply(get_lb, args=[sam], axis=1)
    ctable_cutoff[sam]= ctable.apply(get_mean, args=[sam], axis=1)

In [16]:
def get_vaf(row, sam):
    return float(row['var-'+sam])/float(row['var-'+sam]+row['ref-'+sam])

#ctable_cutoff = table.groupby('cluster').mean()
vafs = pd.DataFrame()
for sam in cols:
    vafs[sam] = table.apply(get_vaf, args=[sam], axis=1)
vafs['cluster'] = table['cluster']

In [17]:
rows = ["5 #anatomical sites\n5 #samples\n9 #mutations\n#sample_index\tsample_label\tanatomical_site_index\tanatomical_site_label\tcharacter_index\tcharacter_label\tf_lb\tf_ub\n",]

def print_char(row, sam):
    return "\t".join(map(str,[i, sam, i, sam, row.name-1, str(row.name), max(row['lb-'+sam] * 2, 0), min(1, 2 * row['ub-'+sam])]))+"\n"

for i, sam in enumerate(cols):
    rows += list(ctable_cutoff.apply(print_char, args=[sam], axis=1))

with open("../A1_MACHINA_"+str(confidence)+".tsv", 'w') as f:
    for line in rows:
        f.write(line)