In [1]:
from __future__ import print_function, division
from collections import defaultdict
import os
import pickle
import numpy as np
In [2]:
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt
In [3]:
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
In [4]:
tsi = open('tsi.ind', 'w')
f = open('relationships_w_pops_121708.txt')
for l in f:
toks = l.rstrip().split('\t')
fam_id = toks[0]
ind_id = toks[1]
mom = toks[2]
dad = toks[3]
pop = toks[-1]
if pop != 'TSI' or mom != '0' or dad != '0':
continue
tsi.write('%s\t%s\n' % (fam_id, ind_id))
f.close()
tsi.close()
In [5]:
os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')
Out[5]:
In [6]:
max_chro_pos = defaultdict(int)
f = open('tsi.bim')
for l in f:
toks = l.rstrip().split('\t')
chrom = int(toks[0])
if chrom > 22:
continue
pos = int(toks[3])
if pos > max_chro_pos[chrom]:
max_chro_pos[chrom] = pos
f.close()
w = open('max_chro_pos.pickle', 'w')
pickle.dump(max_chro_pos, w)
w.close()
In [7]:
#os.system('plink --bfile tsi --freq --out tsi')
In [8]:
window_size = 2000000
In [9]:
%time os.system('plink --bfile tsi --freq --out tsi')
Out[9]:
In [10]:
def traverse_genome(traverse_fun, state=None):
if state is None:
state = {}
for chrom, max_pos in max_chro_pos.items():
num_bin = (max_pos + 1) // window_size
for my_bin in range(num_bin):
start_pos = my_bin * window_size + 1 # 1-start
end_pos = start_pos + window_size
traverse_fun(state, chrom, start_pos, end_pos)
In [11]:
def compute_MAF(state, chrom, start_pos, end_pos):
os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %
(chrom, start_pos, chrom, start_pos, end_pos))
os.remove('tsi-%d-%d.log' % (chrom, start_pos))
%time traverse_genome(compute_MAF)
In [12]:
def gather_statistics(state, chrom, start_pos, end_pos):
try:
f = open('tsi-%d-%d.frq' % (chrom, start_pos))
except:
# empty block
state['block_mafs'][(chrom, start_pos)] = []
return
f.readline()
for cnt, l in enumerate(f):
toks = [tok for tok in l.rstrip().split(' ') if tok != '']
maf = float(toks[-2])
state['snp_cnt'] += 1
state['sum_maf'] += maf
state['block_mafs'][(chrom, start_pos)].append(maf)
f.close()
In [13]:
stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}
traverse_genome(gather_statistics, state=stats)
In [15]:
print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])
In [16]:
all_mafs = []
for mafs in stats['block_mafs'].values():
all_mafs.extend(mafs)
In [17]:
%time np.median(all_mafs)
Out[17]:
In [18]:
all_mafs.sort()
middle = len(all_mafs) // 2
#array of even size
print((all_mafs[middle] + all_mafs[middle + 1]) / 2)
In [19]:
stats.keys()
Out[19]:
In [20]:
import functools
def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):
state[chrom] += len(block_mafs[(chrom, start_pos)])
chrom_cnts = defaultdict(int)
traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),
state=chrom_cnts)
In [21]:
for chrom in range(1, 23):
print('%2d\t%6d' % (chrom, chrom_cnts[chrom]))
#block printing on the next chapter
In [ ]: