In [1]:
from __future__ import print_function, division

from collections import defaultdict
import os
import pickle

import numpy as np

Data download


In [2]:
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2

!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt


--2015-06-26 14:52:43--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10590722 (10M) [application/x-bzip2]
Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’

hapmap3_r2_b36_fwd. 100%[=====================>]  10.10M  3.74MB/s   in 2.7s   

2015-06-26 14:52:46 (3.74 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ saved [10590722/10590722]

--2015-06-26 14:52:46--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 757962593 (723M) [application/x-bzip2]
Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’

hapmap3_r2_b36_fwd. 100%[=====================>] 722.85M  8.51MB/s   in 3m 58s 

2015-06-26 14:56:44 (3.03 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ saved [757962593/757962593]

--2015-06-26 14:56:45--  http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt
Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113
Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36765 (36K) [text/plain]
Saving to: ‘relationships_w_pops_121708.txt.4’

relationships_w_pop 100%[=====================>]  35.90K   174KB/s   in 0.2s   

2015-06-26 14:56:45 (174 KB/s) - ‘relationships_w_pops_121708.txt.4’ saved [36765/36765]


In [3]:
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2


bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.map already exists.
bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.ped already exists.

Data preparation


In [4]:
tsi = open('tsi.ind', 'w')
f = open('relationships_w_pops_121708.txt')

for l in f:
    toks = l.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    mom = toks[2]
    dad = toks[3]
    pop = toks[-1]
    if pop != 'TSI' or mom != '0' or dad != '0':
        continue
    tsi.write('%s\t%s\n' % (fam_id, ind_id))
f.close()
tsi.close()

In [5]:
os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')


Out[5]:
0

In [6]:
max_chro_pos = defaultdict(int)
f = open('tsi.bim')
for l in f:
    toks = l.rstrip().split('\t')
    chrom = int(toks[0])
    if chrom > 22:
        continue
    pos = int(toks[3])
    if pos > max_chro_pos[chrom]:
        max_chro_pos[chrom] = pos
f.close()
w = open('max_chro_pos.pickle', 'w')
pickle.dump(max_chro_pos, w)
w.close()

Sequencial run


In [7]:
#os.system('plink --bfile tsi --freq --out tsi')

In [8]:
window_size = 2000000

In [9]:
%time os.system('plink --bfile tsi --freq --out tsi')


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.68 s
Out[9]:
0

In [10]:
def traverse_genome(traverse_fun, state=None):
    if state is None:
        state = {}
    for chrom, max_pos in max_chro_pos.items():
        num_bin = (max_pos + 1) // window_size
        for my_bin in range(num_bin):
            start_pos = my_bin * window_size + 1  # 1-start
            end_pos = start_pos + window_size
            traverse_fun(state, chrom, start_pos, end_pos)

In [11]:
def compute_MAF(state, chrom, start_pos, end_pos):
    os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %
                  (chrom, start_pos, chrom, start_pos, end_pos))
    os.remove('tsi-%d-%d.log' % (chrom, start_pos))
    
%time traverse_genome(compute_MAF)


CPU times: user 192 ms, sys: 1.06 s, total: 1.25 s
Wall time: 4min 47s

In [12]:
def gather_statistics(state, chrom, start_pos, end_pos):
    try:
        f = open('tsi-%d-%d.frq' % (chrom, start_pos))
    except:
        # empty block
        state['block_mafs'][(chrom, start_pos)] = []
        return
    f.readline()
    for cnt, l in enumerate(f):
        toks = [tok for tok in l.rstrip().split(' ') if tok != '']
        maf = float(toks[-2])
        state['snp_cnt'] += 1
        state['sum_maf'] += maf
        state['block_mafs'][(chrom, start_pos)].append(maf)
    f.close()

In [13]:
stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}
traverse_genome(gather_statistics, state=stats)

In [15]:
print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])


1222126 0.23159641651

In [16]:
all_mafs = []
for mafs in stats['block_mafs'].values():
    all_mafs.extend(mafs)

In [17]:
%time np.median(all_mafs)


CPU times: user 84 ms, sys: 4 ms, total: 88 ms
Wall time: 132 ms
Out[17]:
0.22159999999999999

In [18]:
all_mafs.sort()
middle = len(all_mafs) // 2
#array of even size
print((all_mafs[middle] + all_mafs[middle + 1]) / 2)


0.2216

In [19]:
stats.keys()


Out[19]:
['block_mafs', 'snp_cnt', 'sum_maf']

In [20]:
import functools

def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):
    state[chrom] += len(block_mafs[(chrom, start_pos)])

chrom_cnts = defaultdict(int)
traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),
                state=chrom_cnts)

In [21]:
for chrom in range(1, 23):
    print('%2d\t%6d' % (chrom, chrom_cnts[chrom]))
#block printing on the next chapter


 1	100780
 2	102377
 3	 84921
 4	 75819
 5	 78013
 6	 82169
 7	 67218
 8	 66803
 9	 56921
10	 65166
11	 62182
12	 60381
13	 46354
14	 40525
15	 37217
16	 38443
17	 33175
18	 36144
19	 21749
20	 31872
21	 16978
22	 16919

In [ ]: