In [1]:

    
from __future__ import print_function, division

from collections import defaultdict
import os
import pickle

import numpy as np

Data download



In [2]:

    
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2

!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt









    



--2015-06-26 14:52:43--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10590722 (10M) [application/x-bzip2]
Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’

hapmap3_r2_b36_fwd. 100%[=====================>]  10.10M  3.74MB/s   in 2.7s   

2015-06-26 14:52:46 (3.74 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ saved [10590722/10590722]

--2015-06-26 14:52:46--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 757962593 (723M) [application/x-bzip2]
Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’

hapmap3_r2_b36_fwd. 100%[=====================>] 722.85M  8.51MB/s   in 3m 58s 

2015-06-26 14:56:44 (3.03 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ saved [757962593/757962593]

--2015-06-26 14:56:45--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36765 (36K) [text/plain]
Saving to: ‘relationships_w_pops_121708.txt.4’

relationships_w_pop 100%[=====================>]  35.90K   174KB/s   in 0.2s   

2015-06-26 14:56:45 (174 KB/s) - ‘relationships_w_pops_121708.txt.4’ saved [36765/36765]



In [3]:

    
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2









    



bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.map already exists.
bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.ped already exists.

Data preparation



In [4]:

    
tsi = open('tsi.ind', 'w')
f = open('relationships_w_pops_121708.txt')

for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    mom = toks[2]
    dad = toks[3]
    pop = toks[-1]
    if pop != 'TSI' or mom != '0' or dad != '0':
        continue
    tsi.write('%s\t%s\n' % (fam_id, ind_id))
f.close()
tsi.close()



In [5]:

    
os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')









    Out[5]:





0



In [6]:

    
max_chro_pos = defaultdict(int)
f = open('tsi.bim')
for l in f:
    toks = l.rstrip().split('\t')
    chrom = int(toks[0])
    if chrom > 22:
        continue
    pos = int(toks[3])
    if pos > max_chro_pos[chrom]:
        max_chro_pos[chrom] = pos
f.close()
w = open('max_chro_pos.pickle', 'w')
pickle.dump(max_chro_pos, w)
w.close()

Sequencial run



In [7]:

    
#os.system('plink --bfile tsi --freq --out tsi')



In [8]:

    
window_size = 2000000



In [9]:

    
%time os.system('plink --bfile tsi --freq --out tsi')









    



CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.68 s






    Out[9]:





0



In [10]:

    
def traverse_genome(traverse_fun, state=None):
    if state is None:
        state = {}
    for chrom, max_pos in max_chro_pos.items():
        num_bin = (max_pos + 1) // window_size
        for my_bin in range(num_bin):
            start_pos = my_bin * window_size + 1  # 1-start
            end_pos = start_pos + window_size
            traverse_fun(state, chrom, start_pos, end_pos)



In [11]:

    
def compute_MAF(state, chrom, start_pos, end_pos):
    os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %
                  (chrom, start_pos, chrom, start_pos, end_pos))
    os.remove('tsi-%d-%d.log' % (chrom, start_pos))
    
%time traverse_genome(compute_MAF)









    



CPU times: user 192 ms, sys: 1.06 s, total: 1.25 s
Wall time: 4min 47s



In [12]:

    
def gather_statistics(state, chrom, start_pos, end_pos):
    try:
        f = open('tsi-%d-%d.frq' % (chrom, start_pos))
    except:
        # empty block
        state['block_mafs'][(chrom, start_pos)] = []
        return
    f.readline()
    for cnt, l in enumerate(f):
        toks = [tok for tok in l.rstrip().split(' ') if tok != '']
        maf = float(toks[-2])
        state['snp_cnt'] += 1
        state['sum_maf'] += maf
        state['block_mafs'][(chrom, start_pos)].append(maf)
    f.close()



In [13]:

    
stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}
traverse_genome(gather_statistics, state=stats)



In [15]:

    
print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])









    



1222126 0.23159641651



In [16]:

    
all_mafs = []
for mafs in stats['block_mafs'].values():
    all_mafs.extend(mafs)



In [17]:

    
%time np.median(all_mafs)









    



CPU times: user 84 ms, sys: 4 ms, total: 88 ms
Wall time: 132 ms






    Out[17]:





0.22159999999999999



In [18]:

    
all_mafs.sort()
middle = len(all_mafs) // 2
#array of even size
print((all_mafs[middle] + all_mafs[middle + 1]) / 2)



In [19]:

    
stats.keys()









    Out[19]:





['block_mafs', 'snp_cnt', 'sum_maf']



In [20]:

    
import functools

def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):
    state[chrom] += len(block_mafs[(chrom, start_pos)])

chrom_cnts = defaultdict(int)
traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),
                state=chrom_cnts)



In [21]:

    
for chrom in range(1, 23):
    print('%2d\t%6d' % (chrom, chrom_cnts[chrom]))
#block printing on the next chapter



In [ ]: