In [2]:
import numpy as np
import pandas as pd
import glob
import scipy.stats as stats

Summarize eQTL instances


In [121]:
pats_12_ncm = set()
pats_15_ncm = set()
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().splitlines():
    row = line.split('\t')
    if row[-1] != '0':
        pats_12_ncm.add(row[0][:12])
        pats_15_ncm.add(row[0][:15])

In [142]:
p2q = {}
pcut = 0
eGenes = set()
gene2p = {}
for line in open('./gene2ncmut_lm_p_q.txt').read().splitlines():
    row = line.split('\t')
    p = float(row[-3])
    q = float(row[-2])
    p2q[p] = q
    if q <= 0.2:
        pcut = max(p+1e-5, pcut)
        eGenes.add(row[0])
    gene2p[row[0]] = p

In [149]:
loci2gene2info = {}
loci2p = {}
gene_loci_sig = set()
loci_sig = set()
affected_genes = set()
for line in open('./gene2ncmut_lm_p_allPairs.txt').read().splitlines():
    row = line.split('\t')
    if row[1] not in loci2gene2info:
        loci2gene2info[row[1]] = {}
        loci2p[row[1]] = 1
    p = float(row[3])
    loci2gene2info[row[1]][row[0]] = [row[2], row[3]] 
    loci2p[row[1]] = min(loci2p[row[1]], p)
    if row[0] in eGenes and p <= pcut:
        gene_loci_sig.add('{}\t{}'.format(row[0], row[1]))
        loci_sig.add(row[1])
    affected_genes.add(row[0])

In [125]:
print len(loci2gene2info)
print len(affected_genes)


7821
9445

In [126]:
loci2motif = {}
for line in open('../motifAnalysis/summarize_instances_50_compare_loci.txt').read().splitlines():
    row = line.split('\t')
    loci = row[1]
    if int(row[4]) < 4:
        continue
    motif_gain_loss_nPats = '_'.join(row[2:5])
    if loci not in loci2motif:
        loci2motif[loci] = set()
    loci2motif[loci].add(motif_gain_loss_nPats)

In [23]:
pat2disease = {}
disease2nPats = {}
disease2nPats['PanCan'] = 0.0
for line in open('../pat2clin4surv_gender.txt').read().splitlines():
    row = line.split('\t')
    if row[0] in pats_12_ncm:
        pat2disease[row[0]] = row[1]
        if row[1] not in disease2nPats:
            disease2nPats[row[1]] = 0.0
        disease2nPats[row[1]] += 1
        disease2nPats['PanCan'] += 1

In [24]:
print disease2nPats


{'DLBC': 7.0, 'GBM': 52.0, 'STAD': 40.0, 'UCEC': 51.0, 'THCA': 50.0, 'CESC': 20.0, 'LIHC': 54.0, 'HNSC': 50.0, 'SKCM': 38.0, 'BLCA': 23.0, 'LUSC': 50.0, 'OV': 50.0, 'SARC': 34.0, 'KIRP': 36.0, 'LGG': 19.0, 'LAML': 32.0, 'COADREAD': 65.0, 'PRAD': 20.0, 'LUAD': 50.0, 'BRCA': 99.0, 'KIRC': 41.0, 'KICH': 49.0, 'PanCan': 930.0}

In [25]:
gene2TSS = {}
df = pd.read_table('/cellar/users/wzhang1984/soft/homer/data/promoters/human.pos', header=None, index_col=0)
df['TSS'] = list(zip(df[1], df[2]+2000, -df[4]*2+1))
refseq2TSS = df['TSS'].to_dict()
df = pd.read_table('/cellar/users/wzhang1984/soft/homer/data/accession/human2gene.tsv', header=None, index_col=0)
id2gene = df[6].to_dict()

for ID in id2gene:
    if ID in refseq2TSS:
        gene = id2gene[ID]
        if gene not in gene2TSS:
            gene2TSS[gene] = set()
        gene2TSS[gene].add(refseq2TSS[ID])

In [26]:
line_out = set()
header = True
all_affected_pats = set()
loci_wMotifs = set()
line_out2 = set()
line_out3 = set()
line_out4 = set()
gene2line_out5_fig2a = {}
line_out4_header = 'loci\tgene\tdistance:gene\tp\tq\tc_score\t'+'\t'.join([disease for disease in sorted(disease2nPats)])+'\n'
loci2concentrate = {}
for line in open('../autoAnno_GeneHancer/TCGA_snv_mnv_merged_50_anno_promoter_genehancer.txt').read().splitlines():
    row = line.split('\t')
    if header:
        header = False
        continue
    row = line.split('\n')[0].split('\t')
    loci = '{}:{}-{}'.format(row[0], row[1], row[2])
    if loci not in loci2gene2info:
        continue
    Chr = row[0]
    Start = int(row[1]) + 1
    End = int(row[2]) - 1
    mid = (Start+End) / 2.

    loci_motifs = ''
    if loci in loci2motif:
        loci_motifs = ','.join(loci2motif[loci])
        if loci2p[loci] <= pcut and loci in loci_sig:
            loci_wMotifs.add(loci)
    pats = set()
    locs = set()
    disease2pats = {}
    mut2pats = {}
    for mut in row[4].split(","):
        [pat, Chr_StartEnd, source, alt] = mut.split("__")
        pat = pat[:12]
        pats.add(pat)
        locs.add(Chr_StartEnd)
        disease = pat2disease[pat]
        if disease not in disease2pats:
            disease2pats[disease] = set()
        disease2pats[disease].add(pat)
        if Chr_StartEnd not in mut2pats:
            mut2pats[Chr_StartEnd] = set()
        mut2pats[Chr_StartEnd].add(pat)
    loci2concentrate[loci] = 0
    for mut in mut2pats:
        loci2concentrate[loci] = max(loci2concentrate[loci], len(mut2pats[mut])/float(len(pats)))

    for g in row[-1].split(','):
        [gene, tp] = g.split('|')
        if gene in loci2gene2info[loci]:
            distance = ''
            distance_start_end = []
            if gene in gene2TSS:
                for chr_tss_direction in gene2TSS[gene]:
                    [chr_gene, tss, direction] = chr_tss_direction
                    if Chr != chr_gene:
                        continue
                    tmp = (mid-tss) * direction
                    if distance == '' or np.abs(distance) > np.abs(tmp):
                        distance = tmp
                        distance_start_end = sorted([(Start-tss)*direction, (End-tss)*direction])
            for i in range(len(distance_start_end)):
                if distance_start_end[i] > 0:
                    distance_start_end[i] = '+' + str(distance_start_end[i])
            p = float(loci2gene2info[loci][gene][1])
            q = ''
            if p in p2q:
                q = p2q[p]
            line_out_tmp = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, loci, (End-Start+1), len(pats), len(locs), ','.join(sorted(pats)), ','.join(sorted(locs)), '\t'.join(loci2gene2info[loci][gene]), q, loci_motifs, len(disease2pats), ','.join(['{}_{}'.format(disease_,len(disease2pats[disease_])) for disease_ in sorted(disease2pats)]), distance, loci2concentrate[loci])
            line_out.add(line_out_tmp)
            
            if gene not in gene2line_out5_fig2a:
                gene2line_out5_fig2a[gene] = [1, '']
            if p < gene2line_out5_fig2a[gene][0]:
                gene2line_out5_fig2a[gene][0] = p
                gene2line_out5_fig2a[gene][1] = '{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, loci, loci2gene2info[loci][gene][0], gene2p[gene], p, distance)

            gene_loci = '{}\t{}'.format(gene,loci)
            if gene_loci not in gene_loci_sig:
                continue
            diff = 1
            for p2 in p2q:
                if np.abs(p-p2) < diff:
                    diff = np.abs(p-p2)
                    q = p2q[p2]
            all_affected_pats = all_affected_pats|pats
            line_out3.add('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, Chr, Start, End, len(pats), p, q, loci2gene2info[loci][gene][0], ', '.join(['{}'.format(disease_i) for disease_i in sorted(disease2pats)]), distance, loci2concentrate[loci]))
            loci_gene2percentages = {}
            for disease in disease2nPats:
                if disease in disease2pats:
                    loci_gene2percentages[disease] = len(disease2pats[disease]) / disease2nPats[disease]
                else:
                    loci_gene2percentages[disease] = 0
            loci_gene2percentages['PanCan'] = len(pats) / disease2nPats['PanCan']
            line_out4.add('{}\t{}\t{}:{}\t{}\t{}\t{}\t{}\n'.format(loci, gene, '-'.join([str(i) for i in distance_start_end]), gene.split('|')[0], p, q, loci2concentrate[loci], '\t'.join([str(loci_gene2percentages[disease]) for disease in sorted(disease2nPats)])))

            if gene_loci not in gene_loci_sig:
                continue
            if loci not in loci2motif:
                continue
            for motif in loci2motif[loci]:
                gain_loss = ''
                if len(motif.split('_gain_')) > 1:
                    motif = motif.split('_gain_')[0]
                    gain_loss = 'Gain'
                elif len(motif.split('_lost_')) > 1:
                    motif = motif.split('_lost_')[0]
                    gain_loss = 'Loss'
                line_out2.add('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, Chr, Start, End, len(pats), loci2gene2info[loci][gene][0], motif+'\t'+gain_loss, ', '.join(['{}'.format(disease_i) for disease_i in sorted(disease2pats)]), distance))


print len(all_affected_pats)
print len(loci_wMotifs)


820
83

In [27]:
open('gene2ncmut_lm_min_loci_instances.txt','w').write(''.join(sorted(line_out)))
open('gene2ncmut_lm_min_loci_instances_TableS2.txt','w').write('Gene\teQTL chromosome\teQTL start\teQTL end\tNumber of patients\tCoefficient\tMotif\tGain/Loss of motif\tCancer types\tDistance to TSS\n'+''.join(sorted(line_out2)))
open('gene2ncmut_lm_min_loci_instances_TableS1.txt','w').write('Gene\teQTL chromosome\teQTL start\teQTL end\tNumber of patients\tp-value\tq-value\tCoefficient\tCancer types\tDistance to TSS\tConcentration score\n'+''.join(sorted(line_out3)))

open('gene2ncmut_lm_min_loci_instances_percentage.txt','w').write(line_out4_header+''.join(sorted(line_out4)))

open('gene2ncmut_lm_min_loci_instances_fig2a.txt','w').write(''.join([i[1] for i in gene2line_out5_fig2a.values()]))

In [ ]:

Check overlap between eGenes and cancer genes


In [29]:
DIR4NBS = "/cellar/data/users/wzhang1984/forNBS/"

CGs = set()
CGC = set()
og_tsg = set()

for line in open(DIR4NBS+'CGs.txt').read().splitlines():
    CGs.add(line)

for line in open(DIR4NBS+'cancer_gene_census.csv').read().splitlines()[1:]:
    row = line.split(',')
    CGC.add(row[0])

for line in open(DIR4NBS+'oncogene_tsg.txt').read().splitlines():
    row = line.split('\t')
    og_tsg.add(row[0])

all_CGs = CGs|CGC|og_tsg

white_list=set()

for line in open('./whitelist.txt').read().splitlines():
    white_list.add(line.split('|')[0])

print len(all_CGs&white_list)
print len(white_list)

line_out=''
eGenes = set()
for line in open('./gene2ncmut_lm_min_loci_instances_TableS1.txt').read().splitlines():
    row = line.split('\t')
    loci2CGs = ''
    loci2CGC = ''
    loci2og_tsg = ''
    g = row[0]
    eGenes.add(g)
    if g in CGs:
        loci2CGs = 'CGs'
    if g in CGC:
        loci2CGC = 'CGC'
    if g in og_tsg:
        loci2og_tsg ='og_tsg'
    if loci2CGs or loci2CGC:
        line_out += '{}\t{}\t{}\t{}\n'.format(row[0], loci2CGs, loci2CGC, loci2og_tsg)

open('loci2CGs.txt','wb').write(line_out)


821
16413

In [30]:
# Fisher's exct test for the enrichment of overlap between eGenes and cancer genes
a = len(eGenes&all_CGs&white_list)
b = len(eGenes&white_list) - a
c = len(all_CGs&white_list) - a
d = len(white_list) - b - c + a
stats.fisher_exact([[a, b], [c, d]])


Out[30]:
(1.3570240220743386, 0.31833581787939375)

In [31]:
print(a, b, c, d)


(13, 183, 808, 15435)

In [32]:
eGenes&all_CGs


Out[32]:
{'AFF4',
 'ARID1A',
 'ARNT',
 'CCND2',
 'ERBB4',
 'ETV5',
 'EXT1',
 'IRF6',
 'KCNJ5',
 'NCK1',
 'SETDB1',
 'SPECC1',
 'TERT'}

Create Cytoscape files for the eQTL-gene network


In [33]:
line_out = 'Gene1\tGene2\tAnnotation\tDirection\tScore\n'
g2t = {}

for line in open('gene2ncmut_lm_min_loci_instances.txt').read().splitlines():
    row = line.split('\t')
    if row[0] not in eGenes or float(row[8]) > pcut:
        continue
    gene = row[0]
    g2t[gene] = 'gene'
    g2t[row[1]] = 'nc'
    line_out += '{}\t{}\t{}\t{}\t{}\n'.format(row[1], gene, 'nc-gene', '-.', '1')

network_fn = '/cellar/data/users/wzhang1984/forNBS/FIsInGene_031516_with_annotations.txt'

for line in open(network_fn).read().splitlines()[1:]:
    row = line.split('\t')
    [g1, g2] = row[:2]
    if g1 in g2t and g2 in g2t:
        line_out += line+'\n'

open('nc_network.txt','wb').write(line_out)

line_out = 'name\ttype\n'
for g in g2t:
    line_out += '{}\t{}\n'.format(g,g2t[g])

open('nc_network.noa','wb').write(line_out)

Summarize mutations for network analysis


In [36]:
og_tsg = {}
for line in open(DIR4NBS+'oncogene_tsg.txt').read().splitlines():
    row = line.split('\t')
    og_tsg[row[0]] = row[1]

genes_cancer = set(og_tsg.keys())

In [37]:
print 'Reading pat2mut'
gene2cdsmut = {}
gene2cdsmut_orig = {}
pats_cdsmut = set()
for line in open('../coding_muts/pat2mut.txt').read().rstrip().split('\n'):
    row = line.split('\t')
    pat = row[0][:12]
    pats_cdsmut.add(pat)
    gene=row[1]
    if gene not in gene2cdsmut:
        gene2cdsmut[gene] = {}
        gene2cdsmut_orig[gene] = {}
    mut_f = int(row[2])
    if gene not in og_tsg or og_tsg[gene] != 'Oncogene':
        mut_f = -1
    gene2cdsmut[gene][pat] = mut_f
    gene2cdsmut_orig[gene][pat] = int(row[2])


Reading pat2mut

In [38]:
print 'Reading pat2CNA'
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_CNA.txt')
dfs = []
for fn in fns:
    disease = fn.split('/')[-3]
    df = pd.read_table(fn, index_col=0)
    df.columns = df.columns.str[:15]
    df = df.loc[:,df.columns.isin(pats_15_ncm)]
    npats = len(df.columns)
    if npats > 0:
        dfs.append(df)
df_cna = pd.concat(dfs, axis=1)
df_cna.columns = df_cna.columns.str[:12]
df_cna = (df_cna/2).round(0)
pats_CNA = set(df_cna.columns)
gene2CNA = df_cna.replace(0, np.nan).stack()
gene2CNA_index = gene2CNA.index


Reading pat2CNA

In [39]:
print 'Reading pat2ncmut'
gene2ncmut = {}
gene2dd = {}
for line in open('./gene2ncmut_lm_min_loci_instances.txt').read().rstrip().split('\n'):
    row = line.split('\t')
    p = float(row[8])
    if p > pcut:
        continue
    if row[0] not in eGenes:
        continue
    gene = row[0]

    coef = float(row[7])
    if gene not in gene2ncmut:
        gene2ncmut[gene] = {}
        gene2dd[gene] = [0, 1] # dominant direction, p-value
    if p < gene2dd[gene][1]:
        gene2dd[gene][0] = np.sign(coef)
        gene2dd[gene][1] = p
    for pat in row[5].split(','):
        if pat not in gene2ncmut[gene]:
            gene2ncmut[gene][pat] = 0
        gene2ncmut[gene][pat] += coef
    genes_cancer.add(gene)


Reading pat2ncmut

In [40]:
pats_ncmut=set()
print 'Reading all pats'
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().rstrip().split('\n'):
    a=line.split('\t')
    pat=a[0][:12]
    if a[-1]!='0':
        pats_ncmut.add(pat)

active_pats=pats_ncmut&pats_cdsmut&pats_CNA
print len(active_pats)


Reading all pats
846

In [41]:
line_info = 'gene\tog_tsg\tmut_oncogene\tmut\tamp\tdel\tncmut_up\tncmut_down\tdirection\tup\tdown\tscore\n'
line_oncoprinter = 'Sample\tGene\tAlteration\tType\n'
for gene in genes_cancer:
    og_tsg_term = 'ncmut'
    if gene in og_tsg:
        og_tsg_term = og_tsg[gene]
    Mut_oncogene = Mut = Amp = Del = up = down = score = 0
    ncmut_up = 0
    ncmut_down = 0
    line_up = line_down = gene
    tmp_oncoprinter = ''
    for pat in sorted(active_pats):
        alt_up = alt_down = 0
        if gene in gene2cdsmut and pat in gene2cdsmut[gene]:
            cdsmut = gene2cdsmut[gene][pat]
            if cdsmut == 1:
                Mut_oncogene += 1
                alt_up = 1
            elif cdsmut == -1:
                Mut += 1
                alt_down = -1
            if gene2cdsmut_orig[gene][pat] == 1:
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'*****','MISSENSE')
            elif gene2cdsmut_orig[gene][pat] == -1:
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'*****','TRUNC')
        if (gene, pat) in gene2CNA_index:
            CNA = gene2CNA[gene][pat]
            if CNA == 1:
                Amp += 1
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'AMP','CNA')
                alt_up = 1
            elif CNA == -1:
                Del += 1
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'HOMDEL','CNA')
                alt_down = -1
        if gene in gene2ncmut and pat in gene2ncmut[gene]:
            ncmut = gene2ncmut[gene][pat]
            if ncmut > 0:
                ncmut_up += 1
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'UP','EXP')
                alt_up = 1
            elif ncmut < 0:
                ncmut_down += 1
                tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'DOWN','EXP')
                alt_down = -1
        if alt_up == 1:
            up += 1.
        if alt_down == -1:
            down += 1.
        line_up += '\t{}'.format(alt_up)
        line_down += '\t{}'.format(alt_down)
    line_up += '\n'
    line_down += '\n'
    if og_tsg_term in ['Oncogene','Amplification_Oncogene']:
        score = up
    elif og_tsg_term in ['TSG','Homozygous_deletion_TSG']:
        score = -down
    elif up > down:
        score = up
    elif up < down:
        score = -down
    if og_tsg_term == 'ncmut' and gene2dd[gene][0] < 0:
        if ncmut_down < 5:
            continue
        score = -down
    if og_tsg_term == 'ncmut' and gene2dd[gene][0] > 0:
        if ncmut_up < 5:
            continue
        score = up
    if og_tsg_term == 'Amplification_Oncogene' and Amp < 5:
        continue
    if og_tsg_term == 'Homozygous_deletion_TSG' and Del < 5:
        continue
    if og_tsg_term == 'Oncogene' and Mut_oncogene < 5:
        continue
    if og_tsg_term == 'TSG' and Mut < 5:
        continue
    line_oncoprinter += tmp_oncoprinter
    dd = ''
    if og_tsg_term == 'ncmut':
        dd = gene2dd[gene]
    line_info += '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, og_tsg_term, Mut_oncogene, Mut, Amp, Del, ncmut_up, ncmut_down, dd, up/len(active_pats), down/len(active_pats), score/len(active_pats))

In [42]:
open('summarize_muts2info_min.txt','wb').write(line_info)
open('summarize_muts4oncoprinter.txt','wb').write(line_oncoprinter)

In [43]:
print 'Reading pat2disease'
pat2disease = {}
for line in open('../pat2clin4surv_gender.txt').read().splitlines():
    row = line.split('\t')
    if row[0] not in pat2disease:
        pat2disease[row[0]] = set()
    pat2disease[row[0]].add(row[1])


Reading pat2disease

In [44]:
line_out = ''
for pat in sorted(active_pats):
    line_out += pat+'\t'+', '.join(pat2disease[pat[:12]])+'\n'
open('acitve_pats.txt','w').write(line_out)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: