In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import numpy as np
import os
from multiprocessing import Pool, cpu_count
import glob
import pandas as pd
import gc

Sample local background


In [3]:
binSize = 1000000
n_perm = 20000

In [4]:
# build the library of annotations
bin2ann = {}
chr_list = ['chr{}'.format(i) for i in range(1, 23)]
chr_list.extend(['chrX', 'chrY'])
chr2ranges = {}
for Chr in chr_list:
    chr2ranges[Chr] = []
chr_prev = ''
for line in open('ann/gene_promoter_enhancer.ann.final.txt').read().splitlines():
    row = line.split('\t')
    Chr = row[1]
    if Chr not in chr_list:
        continue
    if Chr != chr_prev:
        print Chr
        chr_prev = Chr
    Start = int(row[2])
    End = int(row[3])
    ann = row[5]
    startBin = (Start-200)/binSize
    endBin = (End+200)/binSize
    for Bin in range(startBin, endBin+1):
        chr_bin = '{}:{}'.format(Chr, Bin)
        if chr_bin not in bin2ann:
            bin2ann[chr_bin] = {}
        if ann not in bin2ann[chr_bin]:
            bin2ann[chr_bin][ann] = []
        bin2ann[chr_bin][ann].append([Start, End])
    if ann in ['P', 'Enhancer']:
        chr2ranges[Chr].append([Start, End])


chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY

In [5]:
chr2p_list = []
chr2region2p = {}
for Chr in chr_list:
    chr2p_list.append(0)
    chr2region2p[Chr] = []
    for region in chr2ranges[Chr]:
        Len = region[1] - region[0] + 1.
        chr2p_list[-1] += Len
        chr2region2p[Chr].append(Len)
    chr2region2p[Chr] = np.array(chr2region2p[Chr])
    chr2region2p[Chr] /= np.sum(chr2region2p[Chr])
chr2p_list = np.array(chr2p_list)
chr2p_list /= np.sum(chr2p_list)

In [6]:
eQTL2info = {}
for line in open('regions_loci5_glmnet.txt').read().splitlines():
    row = line.split('\t')
    Chr = 'chr'+row[0]
    Start = int(row[1])
    End = int(row[2])
    eQTL = row[-1]
    mid = int(round((Start+End)/2.0))
    half_len = int(round((End-Start)/2.0))
    startBin = (mid-200)/binSize
    endBin = (mid+200)/binSize
    eQTL_ann = ''
    for Bin in range(startBin, endBin+1):
        chr_bin = '{}:{}'.format(Chr, Bin)
        for ann in bin2ann[chr_bin]:
            for start_end in bin2ann[chr_bin][ann]:
                [ann_start, ann_end]=start_end
                if mid>=ann_start and mid<=ann_end:
                    eQTL_ann = ann
                    eQTL2info[eQTL] = [Chr, Start, End, mid, half_len, ann]
                    break
            if eQTL_ann!='':
                break
        if eQTL_ann!='':
            break

In [7]:
def sample_bg(eQTL):
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    print eQTL, ann
    # exclude any overlap with the eQTL
    eQTL_range = range(Start-half_len, End+half_len+1)
    
    chr_choices = np.random.choice(chr_list, n_perm, p=chr2p_list)
    i = 0
    bgs = []
    while i < n_perm:
        bgChr = chr_choices[i]
        rangeIndex = np.random.choice(len(chr2ranges[bgChr]), 1, p=chr2region2p[bgChr])[0]
        [bgStart, bgEnd] = chr2ranges[bgChr][rangeIndex]
        Range = range(bgStart, bgEnd+1)
        if bgChr == Chr:
            Range = list(set(Range) - set(eQTL_range))
        if Range != []:
            bgs.append([bgChr, np.random.choice(Range, 1)[0]])
            i+=1
    return bgs

In [8]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 16
pool = Pool(processes=n_processes)

bgs = pool.map(sample_bg, eQTLs)

pool.close()
pool.join()


+10021-+10079:P2RY13 Enhancer
+128894-+129002:ZNF551 Enhancer
+111835-+111871:EXT1 Enhancer
+117473-+117514:MECP2 P
+168344-+168385:ZDHHC11 Enhancer
+19-+27:KCNJ5 P
+2152-+2246:MST1 Enhancer
+148505-+148529:PARD3 Enhancer
+310640-+310663:S100B Enhancer
+35402-+35530:IRF6 Enhancer
+252498-+252565:HMG20A Enhancer
+38208-+38210:ERAP2 Enhancer
+23295-+23305:TMEM140 P
+464676-+464681:AFF4 Enhancer
+328033-+328135:ADGRE1 Enhancer
+441029-+441033:RTP4 Enhancer
+34444-+34451:ZNF665 P
+311675-+311778:CA11 Enhancer
+170021-+170064:PDXDC1 Enhancer
+26306-+26397:SDHAP1 Enhancer
+24327-+24369:ZNF561 Enhancer
+101126-+101274:NFX1 Enhancer
+11842-+11895:UNC93B1 Enhancer
+149272-+149297:ETV5 P
+471747-+471747:CTNNBL1 P
+20183-+20191:USP25 Enhancer
+2159-+2159:VIPR1 Enhancer
+11212-+11313:FMO2 Enhancer
+1349-+1357:TRIM54 Enhancer
+446794-+446838:IGF2 Enhancer
+38337-+38366:MUL1 Enhancer
+361799-+361849:EDC3 Enhancer
+321860-+321920:TRIM31 P
+350566-+350585:SMIM8 Enhancer
+175-+216:GLYCTK P
+103408-+103582:NFX1 Enhancer
+205467-+205579:DISC1 Enhancer
+47265-+47272:SLC9A7 Enhancer
+224223-+224388:SULT1A2 Enhancer
+1261-+1318:APOBEC3A Enhancer
+1557-+1627:RELL1 Enhancer
+30620-+30625:PDCD6 Enhancer
+24540-+24554:SDHAP1 Enhancer
+377602-+377603:SPECC1 Enhancer
+451550-+451667:TMEM150B P
+112150-+112150:UBE2C Enhancer
+135430-+135462:PAWR Enhancer
+38990-+39000:SLC25A52 Enhancer
+32678-+32744:GYPC Enhancer
+353-+394:TINAGL1 P
+17844-+17872:NCALD Enhancer
+227129-+227149:ACOT1 Enhancer
+104494-+104535:PCDH1 Enhancer
+12614-+12656:BCAR3 Enhancer
+4728-+4755:GSTT2 Enhancer
+168335-+168337:ARID1A Enhancer
+30970-+30985:NCK1 Enhancer
+377921-+377932:TMEM189 Enhancer
+206255-+206255:CACTIN Enhancer
+2466-+2492:SIRPB1 Enhancer
+46129-+46129:FCGR2C Enhancer
+112266-+112334:ARNT Enhancer
+426951-+426974:THADA Enhancer
+138656-+138740:WARS2 Enhancer
+477801-+477817:CXorf40A N
+5053-+5105:PLEKHM1 Enhancer
+544685-+544744:C2orf27A Enhancer
+605718-+605719:TMEM138 P
+7496-+7531:FAM228B P
+820713-+820767:PGM5 Enhancer
+84-+84:IQUB P
+90161-+90223:C6orf136 Enhancer
−106708-−106695:DUSP12 Enhancer
+48828-+48974:DHX34 Enhancer
−122715-−122671:GPER1 Enhancer
+51149-+51223:IL1A Enhancer
−13625-−13598:TC2N Enhancer
+547294-+547337:C2orf27A Enhancer
−1415-−1358:INAFM1 Enhancer
+66408-+66492:ZNF790 Enhancer
−151976-−151976:NBPF1 Enhancer
+753-+784:NOMO2 P
−181-−116:ESR2 P
+82104-+82146:SPESP1 Enhancer
−200112-−200107:MPC2 P
+857517-+857586:STX5 Enhancer
−213437-−213400:ACSM1 Enhancer
+929278-+929278:NR2F6 Enhancer
+496405-+496464:ISG20 Enhancer
−109572-−109520:UCKL1 Enhancer
−12857-−12716:ZNF251 Enhancer
+516264-+516272:ZNF155 P
−13647-−13581:GTF3C6 Enhancer
+579966-+579974:TBC1D2B Enhancer
−146211-−146048:NBPF1 Enhancer
+67914-+68002:TIGD6 P
−154145-−154111:JADE2 P
+754705-+754733:LRRC4B P
−18111-−18108:PLEKHG6 Enhancer
+826924-+826988:METTL16 P
−2013-−2010:LRRC40 Enhancer
+87164-+87266:PLCG2 Enhancer
−23010-−22972:FCGR2C Enhancer
+95097-+95132:HYI P
+5048-+5228:APOBEC2 Enhancer
−114732-−114704:RARS2 Enhancer
−133161-−133158:RTCA P
+529473-+529558:CCL23 P
−137-−25:MASP1 P
+59237-+59289:ZNF550 P
−1475-−1465:HCG4B Enhancer
+69877-+69973:NSFL1C Enhancer
+781185-+781238:C3orf18 P
−157548-−157492:HMBS Enhancer
−18999-−18953:EXOC3 Enhancer
+8316-+8320:OPA3 Enhancer
+891753-+891799:ZNF764 Enhancer
−202-−191:DAAM1 P
−2434-−2332:TUBBP5 Enhancer
+97957-+97958:ADIPOR2 Enhancer
−115798-−115779:ZNF622 Enhancer
−251737-−251712:CDC23 P
−134187-−134176:NAT1 Enhancer
−140262-−140218:RAB40A Enhancer
−267580-−267538:RUNX3 P
−148244-−148202:CBLN2 Enhancer
−286286-−286179:TXNDC15 Enhancer
−19584-−19526:AKR1C1 Enhancer
−305539-−305530:TSPAN32 P
−171556-−171538:RCSD1 Enhancer
−35456-−35289:UPK3B P
−21155-−21051:LGALS8 Enhancer
−24986-−24957:FUK Enhancer
−365839-−365797:CA4 Enhancer
−254171-−254163:SLC25A44 Enhancer
−39110-−39082:C1QTNF6 Enhancer
−45127-−45070:RAB3D P
−27257-−27141:IQCE Enhancer
−510970-−510887:CA11 P
−296801-−296686:CCND2 Enhancer
−542407-−542270:USP32P2 Enhancer
−319-−252:PDPR P
−67178-−67164:FCGR2C Enhancer
−356-−333:SDHAP2 P
−70607-−70402:HLTF Enhancer
−366671-−366628:SOCS5 Enhancer
−775368-−775352:LYSMD1 Enhancer
−402890-−402874:RDH13 Enhancer
−26028-−26028:GFOD1 Enhancer
−81119-−81108:PEX26 Enhancer
−46444-−46444:CDC42 Enhancer
−277187-−277103:BRD2 Enhancer
−876032-−876024:HERC3 Enhancer
−51710-−51608:AHRR Enhancer
−299635-−299556:SFRP4 Enhancer
−963-−941:HSD17B6 P
−6338-−6326:MAPK13 Enhancer
−32181-−32054:FAM69C Enhancer
−67894-−67745:ENPP2 Enhancer
−356584-−356500:PDGFA P
−71678-−71678:IL12RB1 Enhancer
−369-−320:LIMS2 P
−413682-−413672:SYN2 Enhancer
−78-−18:HLA-DRB1 P
−81161-−81101:ERBB4 Enhancer
−264727-−264670:ZNF575 Enhancer
−46746-−46725:ITGB4 Enhancer
−28160-−28112:RNF126P1 Enhancer
−88481-−88457:CDC42 Enhancer
−30054-−30027:ATP9A Enhancer
−52060-−52060:TAS2R5 P
−963-−949:C3orf18 P
−346141-−346128:FAM66A Enhancer
−65798-−65733:SIRT5 Enhancer
−358045-−358013:SRGAP2 Enhancer
−681910-−681862:PUM3 P
−722743-−722615:ZNF44 Enhancer
−379072-−379013:FAU Enhancer
−424468-−424424:SF3A2 P
−81298-−81201:TBC1D20 P
−795-−761:RLN2 P
−492492-−492443:CLEC4E Enhancer
−990-−980:PSMD12 P
−887841-−887760:ITFG2 Enhancer
−53421-−53224:DPY19L2P1 Enhancer
−97279-−97226:NOV Enhancer
−6633-−6576:CD40 Enhancer
−69-−61:MAP1S P
−77027-−76992:DSTYK Enhancer
−869128-−869128:ZNF284 Enhancer
−803819-−803760:ZNF284 Enhancer
−91-+1:TERT P
−97983-−97929:RASA4 Enhancer

In [9]:
eQTL2bg = {}
for i in range(len(eQTLs)):
    eQTL2bg[eQTLs[i]] = bgs[i]

mono-, di- and tri-nucleotide context


In [10]:
os.system('mkdir tmp/')
os.system('rm tmp/*')


Out[10]:
0

In [11]:
# Create fasta files for the background
def bg2fa(eQTL):
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    Len = End - Start
    line_out = '{}:{}-{}\n'.format(Chr, Start-1, End+2)
    for bg in eQTL2bg[eQTL]:
        [bgChr, bgMid] = bg
        bgMid = int(bgMid)
        bgStart = bgMid - half_len
        bgEnd = bgStart + Len
        line_out += '{}:{}-{}\n'.format(bgChr, bgStart-1, bgEnd+2)
        fn = 'tmp/{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
    with open(fn, 'w') as f:
        f.write(line_out)
    os.system('twoBitToFa ../motifAnalysis/hg19.2bit {}.fa -seqList={} -noMask'.format(fn, fn))
    os.system('rm {}'.format(fn))

In [12]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 16
pool = Pool(processes=n_processes)

pool.map(bg2fa, eQTLs)

pool.close()
pool.join()

In [13]:
# Create a dictionary of 3-nucleotide context
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
context2category = {}
nucleotides = ['A', 'C', 'G', 'T']
for nuc1 in nucleotides:
    revcom = complement[nuc1]
    context2category[nuc1] = min(nuc1, revcom)
    for nuc2 in nucleotides:
        dinuc = nuc1 + nuc2
        revcom = complement[nuc2] + complement[nuc1]
        context2category[dinuc] = min(dinuc, revcom)
        for nuc3 in nucleotides:
            trinuc = nuc1 + nuc2 + nuc3
            revcom = complement[nuc3] + complement[nuc2] + complement[nuc1]
            context2category[trinuc] = min(trinuc, revcom)
            
context_list = sorted(list(set(context2category.values())))

In [14]:
# Calculate the correlation of the 3-nucleotide context between eQTL and background
def bg2contextcorr(eQTL):
    
    def generate_context_vector(seq):
        context_vector = [0.] * len(context_list)
        for i in range(len(seq)-2):
            trinuc = seq[i:i+3]
            if 'N' in trinuc:
                continue
            context_vector[context_list.index(context2category[trinuc])] += 1
            dinuc = seq[i:i+2]
            context_vector[context_list.index(context2category[dinuc])] += 1
            mononuc = seq[i+1:i+2]
            context_vector[context_list.index(context2category[mononuc])] += 1
        dinuc = seq[-2:] # The last di-nucleotide
        if 'N' not in dinuc:
            context_vector[context_list.index(context2category[dinuc])] += 1
        return context_vector
    
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    fn = 'tmp/{}_{}_{}_{}.fa'.format(eQTL.split(':')[1], Chr, Start, End)
    context_vectors = []
    
    with open(fn) as f:
        seq = ''
        for line in f.read().rstrip().splitlines():
            if line[0] == '>':
                if seq != '':
                    context_vectors.append(generate_context_vector(seq))
                seq = ''
            else:
                seq += line
    context_vectors.append(generate_context_vector(seq))
                
    context_vectors = np.array(context_vectors)
    pcc = np.corrcoef(context_vectors)
    return pcc[0]

In [15]:
# eQTLs = sorted(eQTL2info.keys())

# n_processes = 4
# pool = Pool(processes=n_processes)

# pccs = pool.map(bg2contextcorr, eQTLs)

# pool.close()
# pool.join()

In [ ]:
# eQTL2bgpcc = {}
# for i in range(len(eQTLs)):
#     print eQTLs[i], np.count_nonzero(~np.isnan(pccs[i]))
#     eQTL2bgpcc[eQTLs[i]] = np.nan_to_num(pccs[i])

In [ ]:
eQTL2bgpcc = {}
for eQTL in eQTLs:
    pcc = bg2contextcorr(eQTL)
    print eQTL, np.count_nonzero(~np.isnan(pcc))
    eQTL2bgpcc[eQTL] = np.nan_to_num(pcc)


+10021-+10079:P2RY13 20001
+101126-+101274:NFX1 20001
+103408-+103582:NFX1 20001
/cellar/users/wzhang1984/Data/bcbio/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.py:3162: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[:, None]
/cellar/users/wzhang1984/Data/bcbio/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.py:3163: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[None, :]
+104494-+104535:PCDH1 20000
+111835-+111871:EXT1 20001
+11212-+11313:FMO2 20000
+112150-+112150:UBE2C 20001
+112266-+112334:ARNT 20000
+117473-+117514:MECP2 20001
+11842-+11895:UNC93B1 20000
+1261-+1318:APOBEC3A 20001
+12614-+12656:BCAR3 20000
+128894-+129002:ZNF551 20001
+1349-+1357:TRIM54 20000
+135430-+135462:PAWR 20001
+138656-+138740:WARS2 20000
+148505-+148529:PARD3 20001
+149272-+149297:ETV5 20000
+1557-+1627:RELL1 20001
+168335-+168337:ARID1A 20000
+168344-+168385:ZDHHC11 20001
+170021-+170064:PDXDC1 20000
+175-+216:GLYCTK 20001
+17844-+17872:NCALD 20000
+19-+27:KCNJ5 20001
+20183-+20191:USP25 20000
+205467-+205579:DISC1 20001
+206255-+206255:CACTIN 20000
+2152-+2246:MST1 20001
+2159-+2159:VIPR1 20000
+224223-+224388:SULT1A2 20001
+227129-+227149:ACOT1 20000
+23295-+23305:TMEM140 20001
+24327-+24369:ZNF561 20000
+24540-+24554:SDHAP1 20001
+2466-+2492:SIRPB1 20000
+252498-+252565:HMG20A 20001
+26306-+26397:SDHAP1 20000
+30620-+30625:PDCD6 20001
+30970-+30985:NCK1 20000
+310640-+310663:S100B 20001
+311675-+311778:CA11 20000
+321860-+321920:TRIM31 20001
+32678-+32744:GYPC 20000
+328033-+328135:ADGRE1 20001
+34444-+34451:ZNF665 20000
+350566-+350585:SMIM8 20001
+353-+394:TINAGL1 20000
+35402-+35530:IRF6 20001
+361799-+361849:EDC3 20000
+377602-+377603:SPECC1 20001
+377921-+377932:TMEM189 20000
+38208-+38210:ERAP2 20001
+38337-+38366:MUL1 20000
+38990-+39000:SLC25A52 20001
+426951-+426974:THADA 20000
+441029-+441033:RTP4 20001
+446794-+446838:IGF2 20000
+451550-+451667:TMEM150B 20001
+46129-+46129:FCGR2C 20000
+464676-+464681:AFF4 20001
+471747-+471747:CTNNBL1 20000
+47265-+47272:SLC9A7 20001
+4728-+4755:GSTT2 20000
+477801-+477817:CXorf40A 20001
+48828-+48974:DHX34 20001
+496405-+496464:ISG20 20001
+5048-+5228:APOBEC2 20001
+5053-+5105:PLEKHM1 20001
+51149-+51223:IL1A 20001
+516264-+516272:ZNF155 20001
+529473-+529558:CCL23 20001
+544685-+544744:C2orf27A 20001
+547294-+547337:C2orf27A 20001
+579966-+579974:TBC1D2B 20001
+59237-+59289:ZNF550 20001
+605718-+605719:TMEM138 20001
+66408-+66492:ZNF790 20001
+67914-+68002:TIGD6 20001
+69877-+69973:NSFL1C 20001
+7496-+7531:FAM228B 20001
+753-+784:NOMO2 20001
+754705-+754733:LRRC4B 20001
+781185-+781238:C3orf18 20001
+820713-+820767:PGM5 20001
+82104-+82146:SPESP1 20001
+826924-+826988:METTL16 20001
+8316-+8320:OPA3 20001
+84-+84:IQUB 20001
+857517-+857586:STX5 20001
+87164-+87266:PLCG2 20001
+891753-+891799:ZNF764 20001
+90161-+90223:C6orf136 20001
+929278-+929278:NR2F6 20001
+95097-+95132:HYI 20001
+97957-+97958:ADIPOR2 20001
−106708-−106695:DUSP12 20001
−109572-−109520:UCKL1 20001
−114732-−114704:RARS2 20001
−115798-−115779:ZNF622 20001
−122715-−122671:GPER1 20001
−12857-−12716:ZNF251 20001
−133161-−133158:RTCA 20001
−134187-−134176:NAT1 20001
−13625-−13598:TC2N 20001
−13647-−13581:GTF3C6 20001
−137-−25:MASP1 20001
−140262-−140218:RAB40A 20001
−1415-−1358:INAFM1 20001
−146211-−146048:NBPF1 20001
−1475-−1465:HCG4B 20001
−148244-−148202:CBLN2 20001
−151976-−151976:NBPF1 20001
−154145-−154111:JADE2 20001
−157548-−157492:HMBS 20001
−171556-−171538:RCSD1 20001
−181-−116:ESR2 20001
−18111-−18108:PLEKHG6 20001
−18999-−18953:EXOC3 20001
−19584-−19526:AKR1C1 20001
−200112-−200107:MPC2 20001
−2013-−2010:LRRC40 20001
−202-−191:DAAM1 20001
−21155-−21051:LGALS8 20001
−213437-−213400:ACSM1 20001
−23010-−22972:FCGR2C 20001
−2434-−2332:TUBBP5 20001
−24986-−24957:FUK 20001
−251737-−251712:CDC23 20001
−254171-−254163:SLC25A44 20001
−26028-−26028:GFOD1 20001
−264727-−264670:ZNF575 20001
−267580-−267538:RUNX3 20001
−27257-−27141:IQCE 20001
−277187-−277103:BRD2 20001
−28160-−28112:RNF126P1 20001
−286286-−286179:TXNDC15 20001
−296801-−296686:CCND2 20001
−299635-−299556:SFRP4 20001

Replication timing


In [ ]:
# Build the library of replication timing
bin2reptime = {}
fns = glob.glob('/cellar/users/wzhang1984/Data/replication_timing/*')
for fn in fns:
    print fn.split('/')[-1]
    with open(fn) as f:
        for line in f.read().rstrip().splitlines()[1:]:
            row = line.split('\t')
            Chr = row[0]
            Start = int(row[1])
            End = int(row[2])
            Ws = float(row[3])
            start_end = '\t'.join(row[1:3])
            startBin = (Start-200)/binSize
            endBin = (End+200)/binSize
            for Bin in range(startBin, endBin+1):
                chr_bin = '{}:{}'.format(Chr, Bin)
                if chr_bin not in bin2reptime:
                    bin2reptime[chr_bin] = {}
                if start_end not in bin2reptime[chr_bin]:
                    bin2reptime[chr_bin][start_end] = []
                bin2reptime[chr_bin][start_end].append(Ws)

In [ ]:
def bg2reptime(eQTL):
    
    def pos2reptime(Chr, mid):
        reptimes = []
        startBin = (mid-200)/binSize
        endBin = (mid+200)/binSize
        for Bin in range(startBin, endBin+1):
            chr_bin = '{}:{}'.format(Chr, Bin)
            if chr_bin not in bin2reptime:
                continue
            for start_end in bin2reptime[chr_bin]:
                [Start, End] = [int(i) for i in start_end.split('\t')]
                if mid>=Start and mid<=End:
                    reptimes.extend(bin2reptime[chr_bin][start_end])
        if reptimes == []:
            return 0
        else:
            return np.mean(reptimes)
    
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    eQTL_reptime = pos2reptime(Chr, mid)
    reptime = [eQTL_reptime]
    for bg in eQTL2bg[eQTL]:
        [bgChr, bgMid] = bg
        bg_reptime = pos2reptime(bgChr, bgMid)
        reptime.append(bg_reptime)
    return reptime

In [ ]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 16
pool = Pool(processes=n_processes)

reptimes = pool.map(bg2reptime, eQTLs)

pool.close()
pool.join()

In [ ]:
eQTL2bgreptime = {}
for i in range(len(eQTLs)):
    eQTL2bgreptime[eQTLs[i]] = reptimes[i]

gene expression


In [ ]:
# Write bed files and use Homer to find the nearest gene
def bg2ann(eQTL):
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    Len = End - Start
    line_out = '{}\t{}\t{}\t{}\t.\t.\n'.format(Chr, Start, End, eQTL)
    for bg in eQTL2bg[eQTL]:
        [bgChr, bgMid] = bg
        bgStart = bgMid - half_len
        bgEnd = bgStart + Len
        line_out += '{}\t{}\t{}\t{}:{}\t.\t.\n'.format(bgChr, bgStart, bgEnd, bgChr, bgMid)
        fn = 'tmp/{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
    with open(fn, 'w') as f:
        f.write(line_out)
    os.system('annotatePeaks.pl {}  hg19 >{}.ann.txt'.format(fn, fn))
    os.system('rm {}'.format(fn))

In [ ]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 8
pool = Pool(processes=n_processes)

pool.map(bg2ann, eQTLs)

pool.close()
pool.join()

In [ ]:
# Read gene expression
df = pd.read_table('../eQTL_rev/pat2exp.txt', index_col=0)
gene2exp = np.log2(df.clip(1/8.)).median(axis=1).to_dict()

In [ ]:
eQTL2bg_str = {}
for eQTL in eQTL2bg:
    eQTL2bg_str[eQTL] = []
    for bg in eQTL2bg[eQTL]:
        eQTL2bg_str[eQTL].append('{}:{}'.format(bg[0], bg[1]))
    eQTL2bg_str[eQTL] = np.array(eQTL2bg_str[eQTL])

In [ ]:
def bg2exp(eQTL):
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    fn = 'tmp/{}_{}_{}_{}.ann.txt'.format(eQTL.split(':')[1], Chr, Start, End)
    exp_list = [-3.] * (n_perm+1)
    with open(fn) as f:
        for line in f.read().rstrip().splitlines()[1:]:
            row = line.split('\t')
            gene = row[-4]
            exp = -3.
            if gene in gene2exp:
                exp = gene2exp[gene]
            else:
                continue
            if row[0] == eQTL:
                exp_list[0] = exp
            else:
                bg = row[0].split('-')[0]
                indices = np.where(eQTL2bg_str[eQTL] == bg)[0]
                for i in indices:
                    exp_list[i+1] = exp
    return exp_list

In [ ]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 8
pool = Pool(processes=n_processes)

exp_lists = pool.map(bg2exp, eQTLs)

pool.close()
pool.join()

In [ ]:
eQTL2bgexp = {}
for i in range(len(eQTLs)):
    eQTL2bgexp[eQTLs[i]] = exp_lists[i]

Select nearest bg sites of the eQTL in the feature space


In [ ]:
os.system('mkdir bg/')
os.system('rm bg/*')

In [46]:
def select_bg(eQTL):
    bg_list = [eQTL]
    bg_list.extend(eQTL2bg_str[eQTL])
    df = pd.DataFrame({'context': eQTL2bgpcc[eQTL], 'reptime': eQTL2bgreptime[eQTL], 'exp': eQTL2bgexp[eQTL]}, index=bg_list)
    df_norm = ((df-df.mean())/df.std())
    df['dist'] = ((df_norm-df_norm.loc[eQTL])**2).T.sum()
    df = df.sort_values('dist').iloc[1:1001,:]
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    fn = '{}_{}_{}_{}'.format(eQTL.split(':')[1], Chr, Start, End)
    df.to_csv('bg/{}.txt'.format(fn), sep='\t')
    return df

In [47]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 8
pool = Pool(processes=n_processes)

dfs = pool.map(select_bg, eQTLs)

pool.close()
pool.join()

In [48]:
eQTL2bgdf = {}
for i in range(len(eQTLs)):
    eQTL2bgdf[eQTLs[i]] = dfs[i]

Load mutations


In [ ]:
del bin2ann, bin2reptime
gc.collect()

In [ ]:
bin2mut = {}
pats = set()
print 'Reading mutations'
cline = 0
for line in open('input_muts_ICGC.txt'):
    cline+=1
    if cline%1000000 == 0:
        print cline
    row = line.rstrip().split('\t')
    Chr = 'chr' + row[0]
    Start = int(row[1])
    pat = row[-1]
    pats.add(pat)
    startBin = (Start-200)/binSize
    endBin = (Start+200)/binSize
    for Bin in range(startBin, endBin+1):
        chr_bin='{}:{}'.format(Chr, Bin)
        if chr_bin not in bin2mut:
            bin2mut[chr_bin]=[]
        bin2mut[chr_bin].append([Start, pat])

Calculating mutation number and per-patient mutation probability for each eQTL


In [49]:
def eQTL2p(eQTL):
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    Len = End - Start
    eQTL2pats = set()
    startBin = (Start-200)/binSize
    endBin = (End+200)/binSize
    for Bin in range(startBin, endBin+1):
        chr_bin = '{}:{}'.format(Chr, Bin)
        for mut_pat in bin2mut[chr_bin]:
            [mut, pat] = mut_pat
            if mut>=Start and mut<=End:
                eQTL2pats.add(pat)
    n_pats = len(eQTL2pats)
    
    pat2bgp = {} # per-sample probability of mutation
    for pat in pats:
        pat2bgp[pat] = set()
    nbg = len(eQTL2bgdf[eQTL].index)
    for bg in eQTL2bgdf[eQTL].index:
        [bgChr, bgMid] = bg.split(':')
        bgMid = int(bgMid)
        bgStart = bgMid - half_len
        bgEnd = bgStart + Len
        startBin = (bgStart-200)/binSize
        endBin = (bgEnd+200)/binSize
        for Bin in range(startBin, endBin+1):
            chr_bin = '{}:{}'.format(bgChr, Bin)
            if chr_bin not in bin2mut:
                continue
            for mut_pat in bin2mut[chr_bin]:
                [mut, pat] = mut_pat
                if mut>=bgStart and mut<=bgEnd:
                    pat2bgp[pat].add(bg)
    for pat in pat2bgp:
        pat2bgp[pat] = len(pat2bgp[pat])/float(nbg)
    
    df = pd.DataFrame.from_dict(pat2bgp, orient='index')
    df.columns = ['pr']
    return (n_pats, df)

In [50]:
eQTLs = sorted(eQTL2info.keys())

n_processes = 4
pool = Pool(processes=n_processes)

npats_df_list = pool.map(eQTL2p, eQTLs)

pool.close()
pool.join()

In [51]:
eQTL2npats = {}
eQTL2pat2pr_df = {}
for i in range(len(eQTLs)):
    eQTL2npats[eQTLs[i]] = npats_df_list[i][0]
    eQTL2pat2pr_df[eQTLs[i]] = npats_df_list[i][1]

Calculating p-value using poibin

https://github.com/tsakim/poibin


In [52]:
from poibin import PoiBin

In [53]:
line_out = ''
for eQTL in eQTLs:
    pb = PoiBin(list(eQTL2pat2pr_df[eQTL]['pr']))
    p = pb.pval(eQTL2npats[eQTL])
    print eQTL, eQTL2npats[eQTL], p
    [Chr, Start, End, mid, half_len, ann] = eQTL2info[eQTL]
    line_out += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(eQTL, Chr, Start, End, eQTL2npats[eQTL], p)

with open('eQTL2recurrent_p_ICGC.txt', 'w') as f:
    f.write(line_out)


+10021-+10079:P2RY13 0 1
+101126-+101274:NFX1 3 0.139648123677
+103408-+103582:NFX1 5 0.0202102129968
+104494-+104535:PCDH1 3 0.00474270226885
+111835-+111871:EXT1 1 0.251906521413
+11212-+11313:FMO2 0 1
+112150-+112150:UBE2C 0 1
+112266-+112334:ARNT 0 1
+117473-+117514:MECP2 0 1
+11842-+11895:UNC93B1 4 0.00068145564505
+1261-+1318:APOBEC3A 2 0.0884800245432
+12614-+12656:BCAR3 4 0.000327864877159
+128894-+129002:ZNF551 1 0.710034205723
+1349-+1357:TRIM54 1 0.102432605537
+135430-+135462:PAWR 0 1
+138656-+138740:WARS2 3 0.131239645201
+148505-+148529:PARD3 4 4.48588627534e-05
+149272-+149297:ETV5 1 0.213539982021
+1557-+1627:RELL1 0 1
+168335-+168337:ARID1A 3 1.09983717757e-05
+168344-+168385:ZDHHC11 4 0.000723709579426
+170021-+170064:PDXDC1 1 0.355705116408
+175-+216:GLYCTK 8 1.48986156745e-09
+17844-+17872:NCALD 6 1.186497115e-07
+19-+27:KCNJ5 7 5.25135490648e-13
+20183-+20191:USP25 2 0.00158718396841
+205467-+205579:DISC1 0 1
+206255-+206255:CACTIN 1 0.003994003999
+2152-+2246:MST1 12 4.17386125662e-11
+2159-+2159:VIPR1 1 0.0129222852863
+224223-+224388:SULT1A2 4 0.0763124390476
+227129-+227149:ACOT1 0 1
+23295-+23305:TMEM140 6 7.32779170676e-10
+24327-+24369:ZNF561 2 0.0744007957478
+24540-+24554:SDHAP1 2 0.00490571873877
+2466-+2492:SIRPB1 1 0.196825105744
+252498-+252565:HMG20A 1 0.454357760588
+26306-+26397:SDHAP1 0 1
+30620-+30625:PDCD6 3 6.3584033303e-06
+30970-+30985:NCK1 4 6.04737843268e-06
+310640-+310663:S100B 2 0.0201067181211
+311675-+311778:CA11 6 0.000158040280068
+321860-+321920:TRIM31 0 1
+32678-+32744:GYPC 3 0.0581049138981
+328033-+328135:ADGRE1 1 0.581694283617
+34444-+34451:ZNF665 1 0.100656440221
+350566-+350585:SMIM8 3 0.00042749180597
+353-+394:TINAGL1 2 0.0252356518001
+35402-+35530:IRF6 0 1
+361799-+361849:EDC3 2 0.0568838515842
+377602-+377603:SPECC1 0 1
+377921-+377932:TMEM189 0 1
+38208-+38210:ERAP2 4 1.46891639963e-08
+38337-+38366:MUL1 4 0.000131089066219
+38990-+39000:SLC25A52 1 0.0897682607789
+426951-+426974:THADA 1 0.172329118233
+441029-+441033:RTP4 2 0.000918077871609
+446794-+446838:IGF2 9 4.38423541915e-09
+451550-+451667:TMEM150B 2 0.391832722466
+46129-+46129:FCGR2C 1 0.005985019985
+464676-+464681:AFF4 0 1
+471747-+471747:CTNNBL1 0 1
+47265-+47272:SLC9A7 6 2.75205414013e-11
+4728-+4755:GSTT2 0 1
+477801-+477817:CXorf40A 0 1
+48828-+48974:DHX34 1 0.727531240528
+496405-+496464:ISG20 0 1
+5048-+5228:APOBEC2 3 0.309463353319
+5053-+5105:PLEKHM1 0 1
+51149-+51223:IL1A 3 0.0189507712024
+516264-+516272:ZNF155 9 -1.55431223448e-15
+529473-+529558:CCL23 2 0.216084249253
+544685-+544744:C2orf27A 6 0.000457459214769
+547294-+547337:C2orf27A 1 0.489465596552
+579966-+579974:TBC1D2B 1 0.0713794493088
+59237-+59289:ZNF550 1 0.475490445425
+605718-+605719:TMEM138 13 3.88578058619e-15
+66408-+66492:ZNF790 3 0.179483794989
+67914-+68002:TIGD6 1 0.509391572891
+69877-+69973:NSFL1C 1 0.51367857105
+7496-+7531:FAM228B 2 0.0390062383453
+753-+784:NOMO2 1 0.242903454006
+754705-+754733:LRRC4B 7 1.61156549217e-08
+781185-+781238:C3orf18 3 0.0109447709382
+820713-+820767:PGM5 0 1
+82104-+82146:SPESP1 3 0.011143992207
+826924-+826988:METTL16 18 -3.77475828373e-15
+8316-+8320:OPA3 0 1
+84-+84:IQUB 1 0.014895453638
+857517-+857586:STX5 4 0.00201299786178
+87164-+87266:PLCG2 0 1
+891753-+891799:ZNF764 9 1.46453293937e-10
+90161-+90223:C6orf136 5 0.000339290902384
+929278-+929278:NR2F6 2 1.49600449724e-05
+95097-+95132:HYI 28 -4.88498130835e-15
+97957-+97958:ADIPOR2 0 1
−106708-−106695:DUSP12 0 1
−109572-−109520:UCKL1 5 7.753291881e-05
−114732-−114704:RARS2 2 0.025224319781
−115798-−115779:ZNF622 1 0.196020327776
−122715-−122671:GPER1 3 0.0080980538953
−12857-−12716:ZNF251 1 0.680702079499
−133161-−133158:RTCA 21 -4.66293670343e-15
−134187-−134176:NAT1 6 3.06142844408e-09
−13625-−13598:TC2N 5 1.51993595987e-06
−13647-−13581:GTF3C6 1 0.39563759251
−137-−25:MASP1 5 0.00660430232789
−140262-−140218:RAB40A 2 0.0778421205181
−1415-−1358:INAFM1 2 0.0772760055732
−146211-−146048:NBPF1 8 0.000279253890039
−1475-−1465:HCG4B 0 1
−148244-−148202:CBLN2 6 0.000194764937484
−151976-−151976:NBPF1 3 2.0865402639e-07
−154145-−154111:JADE2 0 1
−157548-−157492:HMBS 2 0.0606112327233
−171556-−171538:RCSD1 4 4.73682139268e-05
−181-−116:ESR2 0 1
−18111-−18108:PLEKHG6 0 1
−18999-−18953:EXOC3 9 3.9552301434e-09
−19584-−19526:AKR1C1 3 0.0208094134813
−200112-−200107:MPC2 0 1
−2013-−2010:LRRC40 0 1
−202-−191:DAAM1 8 4.24171808788e-12
−21155-−21051:LGALS8 2 0.28045576051
−213437-−213400:ACSM1 0 1
−23010-−22972:FCGR2C 5 6.76486000528e-06
−2434-−2332:TUBBP5 3 0.126680474641
−24986-−24957:FUK 1 0.182211650668
−251737-−251712:CDC23 12 -1.99840144433e-15
−254171-−254163:SLC25A44 0 1
−26028-−26028:GFOD1 0 1
−264727-−264670:ZNF575 1 0.47745548189
−267580-−267538:RUNX3 18 -3.99680288865e-15
−27257-−27141:IQCE 6 0.000757289707647
−277187-−277103:BRD2 1 0.521150349206
−28160-−28112:RNF126P1 0 1
−286286-−286179:TXNDC15 2 0.169327819522
−296801-−296686:CCND2 5 0.00940321212974
−299635-−299556:SFRP4 9 8.113970551e-08
−30054-−30027:ATP9A 3 0.0038012848947
−305539-−305530:TSPAN32 0 1
−319-−252:PDPR 6 1.29240928469e-05
−32181-−32054:FAM69C 3 0.154138979939
−346141-−346128:FAM66A 0 1
−35456-−35289:UPK3B 13 2.95387347915e-09
−356-−333:SDHAP2 0 1
−356584-−356500:PDGFA 3 0.0367339370872
−358045-−358013:SRGAP2 1 0.227571143059
−365839-−365797:CA4 0 1
−366671-−366628:SOCS5 0 1
−369-−320:LIMS2 2 0.0487427783124
−379072-−379013:FAU 5 9.49114902528e-05
−39110-−39082:C1QTNF6 1 0.268309342346
−402890-−402874:RDH13 1 0.126383425566
−413682-−413672:SYN2 7 1.34514621664e-12
−424468-−424424:SF3A2 6 3.12807513581e-06
−45127-−45070:RAB3D 0 1
−46444-−46444:CDC42 1 0.00797205593006
−46746-−46725:ITGB4 10 8.881784197e-16
−492492-−492443:CLEC4E 1 0.465806278439
−510970-−510887:CA11 0 1
−51710-−51608:AHRR 5 0.00476088788616
−52060-−52060:TAS2R5 1 0.0119342195058
−53421-−53224:DPY19L2P1 8 0.0113557485388
−542407-−542270:USP32P2 1 0.691781502204
−6338-−6326:MAPK13 3 0.000320608887024
−65798-−65733:SIRT5 0 1
−6633-−6576:CD40 3 0.0176991692173
−67178-−67164:FCGR2C 2 0.00490409033087
−67894-−67745:ENPP2 5 0.0221198584145
−681910-−681862:PUM3 4 0.00300204034925
−69-−61:MAP1S 9 -1.55431223448e-15
−70607-−70402:HLTF 0 1
−71678-−71678:IL12RB1 0 1
−722743-−722615:ZNF44 1 0.619291358346
−77027-−76992:DSTYK 3 0.00284275898859
−775368-−775352:LYSMD1 3 0.00020960277781
−78-−18:HLA-DRB1 6 2.36833034521e-05
−795-−761:RLN2 6 4.02011450573e-07
−803819-−803760:ZNF284 3 0.0235965179688
−81119-−81108:PEX26 0 1
−81161-−81101:ERBB4 5 0.00206960704967
−81298-−81201:TBC1D20 11 1.34610000924e-09
−869128-−869128:ZNF284 2 6.45813473176e-05
−876032-−876024:HERC3 0 1
−88481-−88457:CDC42 2 0.0265709027044
−887841-−887760:ITFG2 6 7.15816678497e-05
−91-+1:TERT 94 -2.26485497024e-14
−963-−941:HSD17B6 1 0.155592153613
−963-−949:C3orf18 0 1
−97279-−97226:NOV 4 0.00934322334456
−97983-−97929:RASA4 3 0.00889236723739
−990-−980:PSMD12 0 1

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: