In [2]:
import numpy as np
import pandas as pd
import glob
import scipy.stats as stats
In [121]:
pats_12_ncm = set()
pats_15_ncm = set()
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().splitlines():
row = line.split('\t')
if row[-1] != '0':
pats_12_ncm.add(row[0][:12])
pats_15_ncm.add(row[0][:15])
In [142]:
p2q = {}
pcut = 0
eGenes = set()
gene2p = {}
for line in open('./gene2ncmut_lm_p_q.txt').read().splitlines():
row = line.split('\t')
p = float(row[-3])
q = float(row[-2])
p2q[p] = q
if q <= 0.2:
pcut = max(p+1e-5, pcut)
eGenes.add(row[0])
gene2p[row[0]] = p
In [149]:
loci2gene2info = {}
loci2p = {}
gene_loci_sig = set()
loci_sig = set()
affected_genes = set()
for line in open('./gene2ncmut_lm_p_allPairs.txt').read().splitlines():
row = line.split('\t')
if row[1] not in loci2gene2info:
loci2gene2info[row[1]] = {}
loci2p[row[1]] = 1
p = float(row[3])
loci2gene2info[row[1]][row[0]] = [row[2], row[3]]
loci2p[row[1]] = min(loci2p[row[1]], p)
if row[0] in eGenes and p <= pcut:
gene_loci_sig.add('{}\t{}'.format(row[0], row[1]))
loci_sig.add(row[1])
affected_genes.add(row[0])
In [125]:
print len(loci2gene2info)
print len(affected_genes)
In [126]:
loci2motif = {}
for line in open('../motifAnalysis/summarize_instances_50_compare_loci.txt').read().splitlines():
row = line.split('\t')
loci = row[1]
if int(row[4]) < 4:
continue
motif_gain_loss_nPats = '_'.join(row[2:5])
if loci not in loci2motif:
loci2motif[loci] = set()
loci2motif[loci].add(motif_gain_loss_nPats)
In [23]:
pat2disease = {}
disease2nPats = {}
disease2nPats['PanCan'] = 0.0
for line in open('../pat2clin4surv_gender.txt').read().splitlines():
row = line.split('\t')
if row[0] in pats_12_ncm:
pat2disease[row[0]] = row[1]
if row[1] not in disease2nPats:
disease2nPats[row[1]] = 0.0
disease2nPats[row[1]] += 1
disease2nPats['PanCan'] += 1
In [24]:
print disease2nPats
In [25]:
gene2TSS = {}
df = pd.read_table('/cellar/users/wzhang1984/soft/homer/data/promoters/human.pos', header=None, index_col=0)
df['TSS'] = list(zip(df[1], df[2]+2000, -df[4]*2+1))
refseq2TSS = df['TSS'].to_dict()
df = pd.read_table('/cellar/users/wzhang1984/soft/homer/data/accession/human2gene.tsv', header=None, index_col=0)
id2gene = df[6].to_dict()
for ID in id2gene:
if ID in refseq2TSS:
gene = id2gene[ID]
if gene not in gene2TSS:
gene2TSS[gene] = set()
gene2TSS[gene].add(refseq2TSS[ID])
In [26]:
line_out = set()
header = True
all_affected_pats = set()
loci_wMotifs = set()
line_out2 = set()
line_out3 = set()
line_out4 = set()
gene2line_out5_fig2a = {}
line_out4_header = 'loci\tgene\tdistance:gene\tp\tq\tc_score\t'+'\t'.join([disease for disease in sorted(disease2nPats)])+'\n'
loci2concentrate = {}
for line in open('../autoAnno_GeneHancer/TCGA_snv_mnv_merged_50_anno_promoter_genehancer.txt').read().splitlines():
row = line.split('\t')
if header:
header = False
continue
row = line.split('\n')[0].split('\t')
loci = '{}:{}-{}'.format(row[0], row[1], row[2])
if loci not in loci2gene2info:
continue
Chr = row[0]
Start = int(row[1]) + 1
End = int(row[2]) - 1
mid = (Start+End) / 2.
loci_motifs = ''
if loci in loci2motif:
loci_motifs = ','.join(loci2motif[loci])
if loci2p[loci] <= pcut and loci in loci_sig:
loci_wMotifs.add(loci)
pats = set()
locs = set()
disease2pats = {}
mut2pats = {}
for mut in row[4].split(","):
[pat, Chr_StartEnd, source, alt] = mut.split("__")
pat = pat[:12]
pats.add(pat)
locs.add(Chr_StartEnd)
disease = pat2disease[pat]
if disease not in disease2pats:
disease2pats[disease] = set()
disease2pats[disease].add(pat)
if Chr_StartEnd not in mut2pats:
mut2pats[Chr_StartEnd] = set()
mut2pats[Chr_StartEnd].add(pat)
loci2concentrate[loci] = 0
for mut in mut2pats:
loci2concentrate[loci] = max(loci2concentrate[loci], len(mut2pats[mut])/float(len(pats)))
for g in row[-1].split(','):
[gene, tp] = g.split('|')
if gene in loci2gene2info[loci]:
distance = ''
distance_start_end = []
if gene in gene2TSS:
for chr_tss_direction in gene2TSS[gene]:
[chr_gene, tss, direction] = chr_tss_direction
if Chr != chr_gene:
continue
tmp = (mid-tss) * direction
if distance == '' or np.abs(distance) > np.abs(tmp):
distance = tmp
distance_start_end = sorted([(Start-tss)*direction, (End-tss)*direction])
for i in range(len(distance_start_end)):
if distance_start_end[i] > 0:
distance_start_end[i] = '+' + str(distance_start_end[i])
p = float(loci2gene2info[loci][gene][1])
q = ''
if p in p2q:
q = p2q[p]
line_out_tmp = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, loci, (End-Start+1), len(pats), len(locs), ','.join(sorted(pats)), ','.join(sorted(locs)), '\t'.join(loci2gene2info[loci][gene]), q, loci_motifs, len(disease2pats), ','.join(['{}_{}'.format(disease_,len(disease2pats[disease_])) for disease_ in sorted(disease2pats)]), distance, loci2concentrate[loci])
line_out.add(line_out_tmp)
if gene not in gene2line_out5_fig2a:
gene2line_out5_fig2a[gene] = [1, '']
if p < gene2line_out5_fig2a[gene][0]:
gene2line_out5_fig2a[gene][0] = p
gene2line_out5_fig2a[gene][1] = '{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, loci, loci2gene2info[loci][gene][0], gene2p[gene], p, distance)
gene_loci = '{}\t{}'.format(gene,loci)
if gene_loci not in gene_loci_sig:
continue
diff = 1
for p2 in p2q:
if np.abs(p-p2) < diff:
diff = np.abs(p-p2)
q = p2q[p2]
all_affected_pats = all_affected_pats|pats
line_out3.add('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, Chr, Start, End, len(pats), p, q, loci2gene2info[loci][gene][0], ', '.join(['{}'.format(disease_i) for disease_i in sorted(disease2pats)]), distance, loci2concentrate[loci]))
loci_gene2percentages = {}
for disease in disease2nPats:
if disease in disease2pats:
loci_gene2percentages[disease] = len(disease2pats[disease]) / disease2nPats[disease]
else:
loci_gene2percentages[disease] = 0
loci_gene2percentages['PanCan'] = len(pats) / disease2nPats['PanCan']
line_out4.add('{}\t{}\t{}:{}\t{}\t{}\t{}\t{}\n'.format(loci, gene, '-'.join([str(i) for i in distance_start_end]), gene.split('|')[0], p, q, loci2concentrate[loci], '\t'.join([str(loci_gene2percentages[disease]) for disease in sorted(disease2nPats)])))
if gene_loci not in gene_loci_sig:
continue
if loci not in loci2motif:
continue
for motif in loci2motif[loci]:
gain_loss = ''
if len(motif.split('_gain_')) > 1:
motif = motif.split('_gain_')[0]
gain_loss = 'Gain'
elif len(motif.split('_lost_')) > 1:
motif = motif.split('_lost_')[0]
gain_loss = 'Loss'
line_out2.add('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, Chr, Start, End, len(pats), loci2gene2info[loci][gene][0], motif+'\t'+gain_loss, ', '.join(['{}'.format(disease_i) for disease_i in sorted(disease2pats)]), distance))
print len(all_affected_pats)
print len(loci_wMotifs)
In [27]:
open('gene2ncmut_lm_min_loci_instances.txt','w').write(''.join(sorted(line_out)))
open('gene2ncmut_lm_min_loci_instances_TableS2.txt','w').write('Gene\teQTL chromosome\teQTL start\teQTL end\tNumber of patients\tCoefficient\tMotif\tGain/Loss of motif\tCancer types\tDistance to TSS\n'+''.join(sorted(line_out2)))
open('gene2ncmut_lm_min_loci_instances_TableS1.txt','w').write('Gene\teQTL chromosome\teQTL start\teQTL end\tNumber of patients\tp-value\tq-value\tCoefficient\tCancer types\tDistance to TSS\tConcentration score\n'+''.join(sorted(line_out3)))
open('gene2ncmut_lm_min_loci_instances_percentage.txt','w').write(line_out4_header+''.join(sorted(line_out4)))
open('gene2ncmut_lm_min_loci_instances_fig2a.txt','w').write(''.join([i[1] for i in gene2line_out5_fig2a.values()]))
In [ ]:
In [29]:
DIR4NBS = "/cellar/data/users/wzhang1984/forNBS/"
CGs = set()
CGC = set()
og_tsg = set()
for line in open(DIR4NBS+'CGs.txt').read().splitlines():
CGs.add(line)
for line in open(DIR4NBS+'cancer_gene_census.csv').read().splitlines()[1:]:
row = line.split(',')
CGC.add(row[0])
for line in open(DIR4NBS+'oncogene_tsg.txt').read().splitlines():
row = line.split('\t')
og_tsg.add(row[0])
all_CGs = CGs|CGC|og_tsg
white_list=set()
for line in open('./whitelist.txt').read().splitlines():
white_list.add(line.split('|')[0])
print len(all_CGs&white_list)
print len(white_list)
line_out=''
eGenes = set()
for line in open('./gene2ncmut_lm_min_loci_instances_TableS1.txt').read().splitlines():
row = line.split('\t')
loci2CGs = ''
loci2CGC = ''
loci2og_tsg = ''
g = row[0]
eGenes.add(g)
if g in CGs:
loci2CGs = 'CGs'
if g in CGC:
loci2CGC = 'CGC'
if g in og_tsg:
loci2og_tsg ='og_tsg'
if loci2CGs or loci2CGC:
line_out += '{}\t{}\t{}\t{}\n'.format(row[0], loci2CGs, loci2CGC, loci2og_tsg)
open('loci2CGs.txt','wb').write(line_out)
In [30]:
# Fisher's exct test for the enrichment of overlap between eGenes and cancer genes
a = len(eGenes&all_CGs&white_list)
b = len(eGenes&white_list) - a
c = len(all_CGs&white_list) - a
d = len(white_list) - b - c + a
stats.fisher_exact([[a, b], [c, d]])
Out[30]:
In [31]:
print(a, b, c, d)
In [32]:
eGenes&all_CGs
Out[32]:
In [33]:
line_out = 'Gene1\tGene2\tAnnotation\tDirection\tScore\n'
g2t = {}
for line in open('gene2ncmut_lm_min_loci_instances.txt').read().splitlines():
row = line.split('\t')
if row[0] not in eGenes or float(row[8]) > pcut:
continue
gene = row[0]
g2t[gene] = 'gene'
g2t[row[1]] = 'nc'
line_out += '{}\t{}\t{}\t{}\t{}\n'.format(row[1], gene, 'nc-gene', '-.', '1')
network_fn = '/cellar/data/users/wzhang1984/forNBS/FIsInGene_031516_with_annotations.txt'
for line in open(network_fn).read().splitlines()[1:]:
row = line.split('\t')
[g1, g2] = row[:2]
if g1 in g2t and g2 in g2t:
line_out += line+'\n'
open('nc_network.txt','wb').write(line_out)
line_out = 'name\ttype\n'
for g in g2t:
line_out += '{}\t{}\n'.format(g,g2t[g])
open('nc_network.noa','wb').write(line_out)
In [36]:
og_tsg = {}
for line in open(DIR4NBS+'oncogene_tsg.txt').read().splitlines():
row = line.split('\t')
og_tsg[row[0]] = row[1]
genes_cancer = set(og_tsg.keys())
In [37]:
print 'Reading pat2mut'
gene2cdsmut = {}
gene2cdsmut_orig = {}
pats_cdsmut = set()
for line in open('../coding_muts/pat2mut.txt').read().rstrip().split('\n'):
row = line.split('\t')
pat = row[0][:12]
pats_cdsmut.add(pat)
gene=row[1]
if gene not in gene2cdsmut:
gene2cdsmut[gene] = {}
gene2cdsmut_orig[gene] = {}
mut_f = int(row[2])
if gene not in og_tsg or og_tsg[gene] != 'Oncogene':
mut_f = -1
gene2cdsmut[gene][pat] = mut_f
gene2cdsmut_orig[gene][pat] = int(row[2])
In [38]:
print 'Reading pat2CNA'
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_CNA.txt')
dfs = []
for fn in fns:
disease = fn.split('/')[-3]
df = pd.read_table(fn, index_col=0)
df.columns = df.columns.str[:15]
df = df.loc[:,df.columns.isin(pats_15_ncm)]
npats = len(df.columns)
if npats > 0:
dfs.append(df)
df_cna = pd.concat(dfs, axis=1)
df_cna.columns = df_cna.columns.str[:12]
df_cna = (df_cna/2).round(0)
pats_CNA = set(df_cna.columns)
gene2CNA = df_cna.replace(0, np.nan).stack()
gene2CNA_index = gene2CNA.index
In [39]:
print 'Reading pat2ncmut'
gene2ncmut = {}
gene2dd = {}
for line in open('./gene2ncmut_lm_min_loci_instances.txt').read().rstrip().split('\n'):
row = line.split('\t')
p = float(row[8])
if p > pcut:
continue
if row[0] not in eGenes:
continue
gene = row[0]
coef = float(row[7])
if gene not in gene2ncmut:
gene2ncmut[gene] = {}
gene2dd[gene] = [0, 1] # dominant direction, p-value
if p < gene2dd[gene][1]:
gene2dd[gene][0] = np.sign(coef)
gene2dd[gene][1] = p
for pat in row[5].split(','):
if pat not in gene2ncmut[gene]:
gene2ncmut[gene][pat] = 0
gene2ncmut[gene][pat] += coef
genes_cancer.add(gene)
In [40]:
pats_ncmut=set()
print 'Reading all pats'
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().rstrip().split('\n'):
a=line.split('\t')
pat=a[0][:12]
if a[-1]!='0':
pats_ncmut.add(pat)
active_pats=pats_ncmut&pats_cdsmut&pats_CNA
print len(active_pats)
In [41]:
line_info = 'gene\tog_tsg\tmut_oncogene\tmut\tamp\tdel\tncmut_up\tncmut_down\tdirection\tup\tdown\tscore\n'
line_oncoprinter = 'Sample\tGene\tAlteration\tType\n'
for gene in genes_cancer:
og_tsg_term = 'ncmut'
if gene in og_tsg:
og_tsg_term = og_tsg[gene]
Mut_oncogene = Mut = Amp = Del = up = down = score = 0
ncmut_up = 0
ncmut_down = 0
line_up = line_down = gene
tmp_oncoprinter = ''
for pat in sorted(active_pats):
alt_up = alt_down = 0
if gene in gene2cdsmut and pat in gene2cdsmut[gene]:
cdsmut = gene2cdsmut[gene][pat]
if cdsmut == 1:
Mut_oncogene += 1
alt_up = 1
elif cdsmut == -1:
Mut += 1
alt_down = -1
if gene2cdsmut_orig[gene][pat] == 1:
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'*****','MISSENSE')
elif gene2cdsmut_orig[gene][pat] == -1:
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'*****','TRUNC')
if (gene, pat) in gene2CNA_index:
CNA = gene2CNA[gene][pat]
if CNA == 1:
Amp += 1
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'AMP','CNA')
alt_up = 1
elif CNA == -1:
Del += 1
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'HOMDEL','CNA')
alt_down = -1
if gene in gene2ncmut and pat in gene2ncmut[gene]:
ncmut = gene2ncmut[gene][pat]
if ncmut > 0:
ncmut_up += 1
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'UP','EXP')
alt_up = 1
elif ncmut < 0:
ncmut_down += 1
tmp_oncoprinter += '{}\t{}\t{}\t{}\n'.format(pat,gene,'DOWN','EXP')
alt_down = -1
if alt_up == 1:
up += 1.
if alt_down == -1:
down += 1.
line_up += '\t{}'.format(alt_up)
line_down += '\t{}'.format(alt_down)
line_up += '\n'
line_down += '\n'
if og_tsg_term in ['Oncogene','Amplification_Oncogene']:
score = up
elif og_tsg_term in ['TSG','Homozygous_deletion_TSG']:
score = -down
elif up > down:
score = up
elif up < down:
score = -down
if og_tsg_term == 'ncmut' and gene2dd[gene][0] < 0:
if ncmut_down < 5:
continue
score = -down
if og_tsg_term == 'ncmut' and gene2dd[gene][0] > 0:
if ncmut_up < 5:
continue
score = up
if og_tsg_term == 'Amplification_Oncogene' and Amp < 5:
continue
if og_tsg_term == 'Homozygous_deletion_TSG' and Del < 5:
continue
if og_tsg_term == 'Oncogene' and Mut_oncogene < 5:
continue
if og_tsg_term == 'TSG' and Mut < 5:
continue
line_oncoprinter += tmp_oncoprinter
dd = ''
if og_tsg_term == 'ncmut':
dd = gene2dd[gene]
line_info += '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(gene, og_tsg_term, Mut_oncogene, Mut, Amp, Del, ncmut_up, ncmut_down, dd, up/len(active_pats), down/len(active_pats), score/len(active_pats))
In [42]:
open('summarize_muts2info_min.txt','wb').write(line_info)
open('summarize_muts4oncoprinter.txt','wb').write(line_oncoprinter)
In [43]:
print 'Reading pat2disease'
pat2disease = {}
for line in open('../pat2clin4surv_gender.txt').read().splitlines():
row = line.split('\t')
if row[0] not in pat2disease:
pat2disease[row[0]] = set()
pat2disease[row[0]].add(row[1])
In [44]:
line_out = ''
for pat in sorted(active_pats):
line_out += pat+'\t'+', '.join(pat2disease[pat[:12]])+'\n'
open('acitve_pats.txt','w').write(line_out)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: