In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import random
import numpy as np
import os
from multiprocessing import Pool, cpu_count
import glob
import pandas as pd
import gc
In [3]:
binSize = 1000000
n_perm = 20000
In [4]:
# build the library of annotations
bin2ann = {}
chr_list = ['chr{}'.format(i) for i in range(1, 23)]
chr_list.extend(['chrX', 'chrY'])
chr2ranges = {}
for Chr in chr_list:
chr2ranges[Chr] = []
chr_prev = ''
for line in open('ann/gene_promoter_enhancer.ann.final.txt').read().splitlines():
row = line.split('\t')
Chr = row[1]
if Chr not in chr_list:
continue
if Chr != chr_prev:
print Chr
chr_prev = Chr
Start = int(row[2])
End = int(row[3])
ann = row[5]
startBin = (Start-200)/binSize
endBin = (End+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(Chr, Bin)
if chr_bin not in bin2ann:
bin2ann[chr_bin] = {}
if ann not in bin2ann[chr_bin]:
bin2ann[chr_bin][ann] = []
bin2ann[chr_bin][ann].append([Start, End])
if ann in ['P', 'Enhancer']:
chr2ranges[Chr].append([Start, End])
In [5]:
chr2p_list = []
chr2region2p = {}
for Chr in chr_list:
chr2p_list.append(0)
chr2region2p[Chr] = []
for region in chr2ranges[Chr]:
Len = region[1] - region[0] + 1.
chr2p_list[-1] += Len
chr2region2p[Chr].append(Len)
chr2region2p[Chr] = np.array(chr2region2p[Chr])
chr2region2p[Chr] /= np.sum(chr2region2p[Chr])
chr2p_list = np.array(chr2p_list)
chr2p_list /= np.sum(chr2p_list)
In [6]:
eQTL2info = {}
for line in open('regions_loci5_glmnet.txt').read().splitlines():
row = line.split('\t')
Chr = 'chr'+row[0]
Start = int(row[1])
End = int(row[2])
eQTL = row[-1]
mid = int(round((Start+End)/2.0))
half_len = int(round((End-Start)/2.0))
startBin = (mid-200)/binSize
endBin = (mid+200)/binSize
eQTL_ann = ''
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(Chr, Bin)
for ann in bin2ann[chr_bin]:
for start_end in bin2ann[chr_bin][ann]:
[ann_start, ann_end]=start_end
if mid>=ann_start and mid<=ann_end:
eQTL_ann = ann
eQTL2info[eQTL] = [Chr, Start, End, mid, half_len, ann]
break
if eQTL_ann!='':
break
if eQTL_ann!='':
break
In [7]:
def sample_bg(eQTL):
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
print eQTL, ann
# exclude any overlap with the eQTL
eQTL_range = range(Start-half_len, End+half_len+1)
chr_choices = np.random.choice(chr_list, n_perm, p=chr2p_list)
i = 0
bgs = []
while i < n_perm:
bgChr = chr_choices[i]
rangeIndex = np.random.choice(len(chr2ranges[bgChr]), 1, p=chr2region2p[bgChr])[0]
[bgStart, bgEnd] = chr2ranges[bgChr][rangeIndex]
Range = range(bgStart, bgEnd+1)
if bgChr == Chr:
Range = list(set(Range) - set(eQTL_range))
if Range != []:
bgs.append([bgChr, np.random.choice(Range, 1)[0]])
i+=1
return bgs
In [8]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 16
pool = Pool(processes=n_processes)
bgs = pool.map(sample_bg, eQTLs)
pool.close()
pool.join()
In [9]:
eQTL2bg = {}
for i in range(len(eQTLs)):
eQTL2bg[eQTLs[i]] = bgs[i]
In [10]:
os.system('mkdir tmp/')
os.system('rm tmp/*')
Out[10]:
In [11]:
# Create fasta files for the background
def bg2fa(eQTL):
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
Len = End - Start
line_out = '{}:{}-{}\n'.format(Chr, Start-1, End+2)
for bg in eQTL2bg[eQTL]:
[bgChr, bgMid] = bg
bgMid = int(bgMid)
bgStart = bgMid - half_len
bgEnd = bgStart + Len
line_out += '{}:{}-{}\n'.format(bgChr, bgStart-1, bgEnd+2)
fn = 'tmp/{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
with open(fn, 'w') as f:
f.write(line_out)
os.system('twoBitToFa ../motifAnalysis/hg19.2bit {}.fa -seqList={} -noMask'.format(fn, fn))
os.system('rm {}'.format(fn))
In [12]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 16
pool = Pool(processes=n_processes)
pool.map(bg2fa, eQTLs)
pool.close()
pool.join()
In [13]:
# Create a dictionary of 3-nucleotide context
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
context2category = {}
nucleotides = ['A', 'C', 'G', 'T']
for nuc1 in nucleotides:
revcom = complement[nuc1]
context2category[nuc1] = min(nuc1, revcom)
for nuc2 in nucleotides:
dinuc = nuc1 + nuc2
revcom = complement[nuc2] + complement[nuc1]
context2category[dinuc] = min(dinuc, revcom)
for nuc3 in nucleotides:
trinuc = nuc1 + nuc2 + nuc3
revcom = complement[nuc3] + complement[nuc2] + complement[nuc1]
context2category[trinuc] = min(trinuc, revcom)
context_list = sorted(list(set(context2category.values())))
In [14]:
# Calculate the correlation of the 3-nucleotide context between eQTL and background
def bg2contextcorr(eQTL):
def generate_context_vector(seq):
context_vector = [0.] * len(context_list)
for i in range(len(seq)-2):
trinuc = seq[i:i+3]
if 'N' in trinuc:
continue
context_vector[context_list.index(context2category[trinuc])] += 1
dinuc = seq[i:i+2]
context_vector[context_list.index(context2category[dinuc])] += 1
mononuc = seq[i+1:i+2]
context_vector[context_list.index(context2category[mononuc])] += 1
dinuc = seq[-2:] # The last di-nucleotide
if 'N' not in dinuc:
context_vector[context_list.index(context2category[dinuc])] += 1
return context_vector
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
fn = 'tmp/{}_{}_{}_{}.fa'.format(eQTL.split(':')[1], Chr, Start, End)
context_vectors = []
with open(fn) as f:
seq = ''
for line in f.read().rstrip().splitlines():
if line[0] == '>':
if seq != '':
context_vectors.append(generate_context_vector(seq))
seq = ''
else:
seq += line
context_vectors.append(generate_context_vector(seq))
context_vectors = np.array(context_vectors)
pcc = np.corrcoef(context_vectors)
return pcc[0]
In [15]:
# eQTLs = sorted(eQTL2info.keys())
# n_processes = 4
# pool = Pool(processes=n_processes)
# pccs = pool.map(bg2contextcorr, eQTLs)
# pool.close()
# pool.join()
In [ ]:
# eQTL2bgpcc = {}
# for i in range(len(eQTLs)):
# print eQTLs[i], np.count_nonzero(~np.isnan(pccs[i]))
# eQTL2bgpcc[eQTLs[i]] = np.nan_to_num(pccs[i])
In [ ]:
eQTL2bgpcc = {}
for eQTL in eQTLs:
pcc = bg2contextcorr(eQTL)
print eQTL, np.count_nonzero(~np.isnan(pcc))
eQTL2bgpcc[eQTL] = np.nan_to_num(pcc)
In [ ]:
# Build the library of replication timing
bin2reptime = {}
fns = glob.glob('/cellar/users/wzhang1984/Data/replication_timing/*')
for fn in fns:
print fn.split('/')[-1]
with open(fn) as f:
for line in f.read().rstrip().splitlines()[1:]:
row = line.split('\t')
Chr = row[0]
Start = int(row[1])
End = int(row[2])
Ws = float(row[3])
start_end = '\t'.join(row[1:3])
startBin = (Start-200)/binSize
endBin = (End+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(Chr, Bin)
if chr_bin not in bin2reptime:
bin2reptime[chr_bin] = {}
if start_end not in bin2reptime[chr_bin]:
bin2reptime[chr_bin][start_end] = []
bin2reptime[chr_bin][start_end].append(Ws)
In [ ]:
def bg2reptime(eQTL):
def pos2reptime(Chr, mid):
reptimes = []
startBin = (mid-200)/binSize
endBin = (mid+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(Chr, Bin)
if chr_bin not in bin2reptime:
continue
for start_end in bin2reptime[chr_bin]:
[Start, End] = [int(i) for i in start_end.split('\t')]
if mid>=Start and mid<=End:
reptimes.extend(bin2reptime[chr_bin][start_end])
if reptimes == []:
return 0
else:
return np.mean(reptimes)
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
eQTL_reptime = pos2reptime(Chr, mid)
reptime = [eQTL_reptime]
for bg in eQTL2bg[eQTL]:
[bgChr, bgMid] = bg
bg_reptime = pos2reptime(bgChr, bgMid)
reptime.append(bg_reptime)
return reptime
In [ ]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 16
pool = Pool(processes=n_processes)
reptimes = pool.map(bg2reptime, eQTLs)
pool.close()
pool.join()
In [ ]:
eQTL2bgreptime = {}
for i in range(len(eQTLs)):
eQTL2bgreptime[eQTLs[i]] = reptimes[i]
In [ ]:
# Write bed files and use Homer to find the nearest gene
def bg2ann(eQTL):
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
Len = End - Start
line_out = '{}\t{}\t{}\t{}\t.\t.\n'.format(Chr, Start, End, eQTL)
for bg in eQTL2bg[eQTL]:
[bgChr, bgMid] = bg
bgStart = bgMid - half_len
bgEnd = bgStart + Len
line_out += '{}\t{}\t{}\t{}:{}\t.\t.\n'.format(bgChr, bgStart, bgEnd, bgChr, bgMid)
fn = 'tmp/{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
with open(fn, 'w') as f:
f.write(line_out)
os.system('annotatePeaks.pl {} hg19 >{}.ann.txt'.format(fn, fn))
os.system('rm {}'.format(fn))
In [ ]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 8
pool = Pool(processes=n_processes)
pool.map(bg2ann, eQTLs)
pool.close()
pool.join()
In [ ]:
# Read gene expression
df = pd.read_table('../eQTL_rev/pat2exp.txt', index_col=0)
gene2exp = np.log2(df.clip(1/8.)).median(axis=1).to_dict()
In [ ]:
eQTL2bg_str = {}
for eQTL in eQTL2bg:
eQTL2bg_str[eQTL] = []
for bg in eQTL2bg[eQTL]:
eQTL2bg_str[eQTL].append('{}:{}'.format(bg[0], bg[1]))
eQTL2bg_str[eQTL] = np.array(eQTL2bg_str[eQTL])
In [ ]:
def bg2exp(eQTL):
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
fn = 'tmp/{}_{}_{}_{}.ann.txt'.format(eQTL.split(':')[1], Chr, Start, End)
exp_list = [-3.] * (n_perm+1)
with open(fn) as f:
for line in f.read().rstrip().splitlines()[1:]:
row = line.split('\t')
gene = row[-4]
exp = -3.
if gene in gene2exp:
exp = gene2exp[gene]
else:
continue
if row[0] == eQTL:
exp_list[0] = exp
else:
bg = row[0].split('-')[0]
indices = np.where(eQTL2bg_str[eQTL] == bg)[0]
for i in indices:
exp_list[i+1] = exp
return exp_list
In [ ]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 8
pool = Pool(processes=n_processes)
exp_lists = pool.map(bg2exp, eQTLs)
pool.close()
pool.join()
In [ ]:
eQTL2bgexp = {}
for i in range(len(eQTLs)):
eQTL2bgexp[eQTLs[i]] = exp_lists[i]
In [ ]:
os.system('mkdir bg/')
os.system('rm bg/*')
In [46]:
def select_bg(eQTL):
bg_list = [eQTL]
bg_list.extend(eQTL2bg_str[eQTL])
df = pd.DataFrame({'context': eQTL2bgpcc[eQTL], 'reptime': eQTL2bgreptime[eQTL], 'exp': eQTL2bgexp[eQTL]}, index=bg_list)
df_norm = ((df-df.mean())/df.std())
df['dist'] = ((df_norm-df_norm.loc[eQTL])**2).T.sum()
df = df.sort_values('dist').iloc[1:1001,:]
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
fn = '{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
df.to_csv('bg/{}.txt'.format(fn), sep='\t')
return df
In [47]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 8
pool = Pool(processes=n_processes)
dfs = pool.map(select_bg, eQTLs)
pool.close()
pool.join()
In [48]:
eQTL2bgdf = {}
for i in range(len(eQTLs)):
eQTL2bgdf[eQTLs[i]] = dfs[i]
In [ ]:
del bin2ann, bin2reptime
gc.collect()
In [ ]:
bin2mut = {}
pats = set()
print 'Reading mutations'
cline = 0
for line in open('input_muts_ICGC.txt'):
cline+=1
if cline%1000000 == 0:
print cline
row = line.rstrip().split('\t')
Chr = 'chr' + row[0]
Start = int(row[1])
pat = row[-1]
pats.add(pat)
startBin = (Start-200)/binSize
endBin = (Start+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin='{}:{}'.format(Chr, Bin)
if chr_bin not in bin2mut:
bin2mut[chr_bin]=[]
bin2mut[chr_bin].append([Start, pat])
In [49]:
def eQTL2p(eQTL):
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
Len = End - Start
eQTL2pats = set()
startBin = (Start-200)/binSize
endBin = (End+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(Chr, Bin)
for mut_pat in bin2mut[chr_bin]:
[mut, pat] = mut_pat
if mut>=Start and mut<=End:
eQTL2pats.add(pat)
n_pats = len(eQTL2pats)
pat2bgp = {} # per-sample probability of mutation
for pat in pats:
pat2bgp[pat] = set()
nbg = len(eQTL2bgdf[eQTL].index)
for bg in eQTL2bgdf[eQTL].index:
[bgChr, bgMid] = bg.split(':')
bgMid = int(bgMid)
bgStart = bgMid - half_len
bgEnd = bgStart + Len
startBin = (bgStart-200)/binSize
endBin = (bgEnd+200)/binSize
for Bin in range(startBin, endBin+1):
chr_bin = '{}:{}'.format(bgChr, Bin)
if chr_bin not in bin2mut:
continue
for mut_pat in bin2mut[chr_bin]:
[mut, pat] = mut_pat
if mut>=bgStart and mut<=bgEnd:
pat2bgp[pat].add(bg)
for pat in pat2bgp:
pat2bgp[pat] = len(pat2bgp[pat])/float(nbg)
df = pd.DataFrame.from_dict(pat2bgp, orient='index')
df.columns = ['pr']
return (n_pats, df)
In [50]:
eQTLs = sorted(eQTL2info.keys())
n_processes = 4
pool = Pool(processes=n_processes)
npats_df_list = pool.map(eQTL2p, eQTLs)
pool.close()
pool.join()
In [51]:
eQTL2npats = {}
eQTL2pat2pr_df = {}
for i in range(len(eQTLs)):
eQTL2npats[eQTLs[i]] = npats_df_list[i][0]
eQTL2pat2pr_df[eQTLs[i]] = npats_df_list[i][1]
In [52]:
from poibin import PoiBin
In [53]:
line_out = ''
for eQTL in eQTLs:
pb = PoiBin(list(eQTL2pat2pr_df[eQTL]['pr']))
p = pb.pval(eQTL2npats[eQTL])
print eQTL, eQTL2npats[eQTL], p
[Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
line_out += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(eQTL, Chr, Start, End, eQTL2npats[eQTL], p)
with open('eQTL2recurrent_p_ICGC.txt', 'w') as f:
f.write(line_out)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: