In [1]:
from collections import defaultdict

from Bio.PopGen.GenePop import read
from Bio.PopGen.GenePop.LargeFileParser import read as read_large

from genomics.popgen.plink.convert import to_genepop

In [2]:
f = open('relationships_w_pops_121708.txt')
pop_ind = defaultdict(list)
f.readline()  # header
for line in f:
    toks = line.rstrip().split('\t')
    fam_id = toks[0]
    ind_id = toks[1]
    pop = toks[-1]
    pop_ind[pop].append((fam_id, ind_id))

Check for consistency issues


In [3]:
all_inds = []
for inds in pop_ind.values():
    all_inds.extend(inds)
for line in open('hapmap1.ped'):
    toks = line.rstrip().replace(' ', '\t').split('\t')
    fam = toks[0]
    ind = toks[1]
    if (fam, ind) not in all_inds:
        print('Problems with %s/%s' % (fam, ind))


Problems with 2469/NA20281

In [4]:
to_genepop('hapmap1_auto', 'hapmap1_auto', pop_ind)
to_genepop('hapmap10', 'hapmap10', pop_ind)
to_genepop('hapmap10_auto', 'hapmap10_auto', pop_ind)
to_genepop('hapmap10_auto_noofs_ld', 'hapmap10_auto_noofs_ld', pop_ind)
to_genepop('hapmap10_auto_noofs_2', 'hapmap10_auto_noofs_2', pop_ind)
#slow
#talk about coding: ACGT - 1234
#talk about pop Pop names

Using the in-memory parser

Careful: this will severely increase your memory usage, consider it optional


In [ ]:
rec = read(open('hapmap1_auto.gp'))
print('Header: %s' % len(rec.comment_line))
print('Number of loci %d' % len(rec.loci_list))
print('Number of populations %d' % len(rec.pop_list))
print('Population names: %s' % ', '.join(rec.pop_list))
print('Individuals per population %s' % ', '.join([str(len(inds)) for inds in rec.populations]))
ind = rec.populations[1][0]
print('Individual %s, SNP %s, alleles: %d %d' % (ind[0], rec.loci_list[0], ind[1][0][0], ind[1][0][1]))
del rec
#talk about population names

Using the Large File Parser


In [5]:
def count_individuals(fname):
    rec = read_large(open(fname))
    pop_sizes = []
    for line in rec.data_generator():
        if line == ():
            pop_sizes.append(0)
        else:
            pop_sizes[-1] += 1
    return pop_sizes

In [6]:
print('Individuals per population %s' % ', '.join([str(ninds) for ninds in count_individuals('hapmap1_auto.gp')]))


Individuals per population 82, 165, 84, 85, 88, 86, 90, 77, 171, 88, 167

In [7]:
print(len(read_large(open('hapmap10.gp')).loci_list))
print(len(read_large(open('hapmap10_auto.gp')).loci_list))
print(len(read_large(open('hapmap10_auto_noofs_ld.gp')).loci_list))


143809
138694
55217

In [ ]: