In [1]:
from __future__ import print_function, division

PyPlink

PyPlink is a Python module to read and write binary Plink files. Here are small examples for PyPlink.


In [2]:
from pyplink import PyPlink

Reading binary pedfile

Getting the demo data

The Plink softwares provides a testing dataset on the resources page. It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18.


In [3]:
import zipfile
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Downloading the demo data from Plink webset
urlretrieve(
    "http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip",
    "hapmap_r23a.zip",
)

# Extracting the archive content
with zipfile.ZipFile("hapmap_r23a.zip", "r") as z:
    z.extractall(".")

Reading the binary file

To read a binary file, PyPlink only requires the prefix of the files.


In [4]:
pedfile = PyPlink("hapmap_r23a")

Getting dataset information


In [5]:
print("{:,d} samples and {:,d} markers".format(
    pedfile.get_nb_samples(),
    pedfile.get_nb_markers(),
))


270 samples and 4,098,136 markers

In [6]:
all_samples = pedfile.get_fam()
all_samples.head()


Out[6]:
fid iid father mother gender status
0 NA18524 NA18524 0 0 1 -9
1 NA18526 NA18526 0 0 2 -9
2 NA18529 NA18529 0 0 2 -9
3 NA18532 NA18532 0 0 2 -9
4 NA18537 NA18537 0 0 2 -9

In [7]:
all_markers = pedfile.get_bim()
all_markers.head()


Out[7]:
chrom pos cm a1 a2
snp
rs10399749 1 45162 0 0 C
rs2949420 1 45257 0 0 T
rs4030303 1 72434 0 A G
rs4030300 1 72515 0 0 C
rs3855952 1 77689 0 0 A

Iterating over all markers

Additive format

Cycling through genotypes as -1, 0, 1 and 2 values, where -1 is unknown, 0 is homozygous (major allele), 1 is heterozygous, and 2 is homozygous (minor allele).


In [8]:
for marker_id, genotypes in pedfile:
    print(marker_id)
    print(genotypes)
    break


rs10399749
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0
  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0 -1
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0  0  0
  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0 -1  0  0  0  0  0  0  0
 -1  0 -1  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0 -1  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

In [9]:
for marker_id, genotypes in pedfile.iter_geno():
    print(marker_id)
    print(genotypes)
    break


rs10399749
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0
  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0 -1
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0 -1  0  0  0
  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0 -1  0  0  0  0  0  0  0
 -1  0 -1  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0 -1  0 -1  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

Nucleotide format

Cycling through genotypes as A, C, G and T values (where 00 is unknown).


In [10]:
for marker_id, genotypes in pedfile.iter_acgt_geno():
    print(marker_id)
    print(genotypes)
    break


rs10399749
['CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'
 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'
 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC']

Iterating over selected markers

Additive format

Cycling through genotypes as -1, 0, 1 and 2 values, where -1 is unknown, 0 is homozygous (major allele), 1 is heterozygous, and 2 is homozygous (minor allele).


In [11]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
    print(marker_id)
    print(genotypes, end="\n\n")


rs7092431
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

rs9943770
[ 0  0  0  2  0  2  0  1  1  0  0  0  0  0  0  0  1  0  0  1  0  0  1  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  1  0  1  0  0
  1  0  1  0  0  1  1  1  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1
  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  0  1  1  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  1  0  0  0  0  0  0  0
  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  0  0
  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  1 -1  0  0  0  1  2  1  1  0  1  1  2  0  0  0  1  0  0  0  0  1  1  0  2
  1  1  1  0  1  1 -1  1  0  0  1  0  0  2  0  1  2  0  1  1  1  2  0  1  0
  0  0  0  0  0  0  2  1  2  1  1  2  1  1  1  1  0  1  1  2  1  0 -1  0  1
  1  1  0  1  0  1 -1  2  1  0  0  0  1  1  1  2  0  0  1  1]

rs1587483
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  1  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

Nucleotide format

Cycling through genotypes as A, C, G and T values (where 00 is unknown).


In [12]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):
    print(marker_id)
    print(genotypes, end="\n\n")


rs7092431
['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']

rs9943770
['AA' 'AA' 'AA' 'GG' 'AA' 'GG' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'GA' 'GA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'
 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' '00' 'AA' 'AA' 'AA'
 'GA' 'GG' 'GA' 'GA' 'AA' 'GA' 'GA' 'GG' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'
 'AA' 'GA' 'GA' 'AA' 'GG' 'GA' 'GA' 'GA' 'AA' 'GA' 'GA' '00' 'GA' 'AA' 'AA'
 'GA' 'AA' 'AA' 'GG' 'AA' 'GA' 'GG' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'GA' 'AA'
 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GG' 'GA' 'GG' 'GA' 'GA' 'GG' 'GA' 'GA' 'GA'
 'GA' 'AA' 'GA' 'GA' 'GG' 'GA' 'AA' '00' 'AA' 'GA' 'GA' 'GA' 'AA' 'GA' 'AA'
 'GA' '00' 'GG' 'GA' 'AA' 'AA' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'AA' 'GA' 'GA']

rs1587483
['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'CG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'
 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']

Extracting a single marker

Additive format

Cycling through genotypes as -1, 0, 1 and 2 values, where -1 is unknown, 0 is homozygous (major allele), 1 is heterozygous, and 2 is homozygous (minor allele).


In [13]:
pedfile.get_geno_marker("rs7619974")


Out[13]:
array([ 0,  0,  0,  0,  0, -1,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int8)

Nucleotide format

Cycling through genotypes as A, C, G and T values (where 00 is unknown).


In [14]:
pedfile.get_acgt_geno_marker("rs7619974")


Out[14]:
array(['CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', '00', '00', 'CC',
       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',
       'CC', 'CC', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',
       '00', '00', '00', '00', '00', '00'], 
      dtype='<U2')

Misc example

Extracting a subset of markers and samples

To get all markers on the Y chromosomes for the males.


In [15]:
# Getting the Y markers
y_markers = all_markers[all_markers.chrom == 24].index.values

# Getting the males
males = all_samples.gender == 1

# Cycling through the Y markers
for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):
    male_genotypes = genotypes[males.values]
    print("{:,d} total genotypes".format(len(genotypes)))
    print("{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})".format(
        len(male_genotypes),
        males.sum(),
        marker_id,
        all_markers.loc[marker_id, "chrom"],
        all_markers.loc[marker_id, "pos"],
    ))
    break


270 total genotypes
142 genotypes for 142 males (rs1140798 on chr24 and position 169,542)

Counting the allele frequency of markers

To count the minor allele frequency of a subset of markers (only for founders).


In [16]:
# Getting the founders
founders = (all_samples.father == "0") & (all_samples.mother == "0")

# Computing the MAF
markers = ["rs7619974", "rs2949048", "rs16941434"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
    valid_genotypes = genotypes[founders.values & (genotypes != -1)]
    maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)
    print(marker_id, round(maf, 6), sep="\t")


rs7619974	0.0
rs2949048	0.02381
rs16941434	0.357143

Writing binary pedfile

SNP-major format

The following examples shows how to write a binary file using the PyPlink module. The SNP-major format is the default. It means that the binary file is written one marker at a time.

Note that PyPlink only writes the BED file. The user is required to create the FAM and BIM files.


In [17]:
# The genotypes for 3 markers and 10 samples
all_genotypes = [
    [0, 0, 0, 1, 0, 0, -1, 2, 1, 0],
    [0, 0, 1, 1, 0, 0,  0, 1, 2, 0],
    [0, 0, 0, 0, 1, 1,  0, 0, 0, 1],
]

# Writing the BED file using PyPlink
with PyPlink("test_output", "w") as pedfile:
    for genotypes in all_genotypes:
        pedfile.write_genotypes(genotypes)

# Writing a dummy FAM file
with open("test_output.fam", "w") as fam_file:
    for i in range(10):
        print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
              sep=" ", file=fam_file)

# Writing a dummy BIM file
with open("test_output.bim", "w") as bim_file:
    for i in range(3):
        print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
              sep="\t", file=bim_file)

In [18]:
# Checking the content of the newly created binary files
pedfile = PyPlink("test_output")

In [19]:
pedfile.get_fam()


Out[19]:
fid iid father mother gender status
0 family_1 sample_1 0 0 0 -9
1 family_2 sample_2 0 0 0 -9
2 family_3 sample_3 0 0 0 -9
3 family_4 sample_4 0 0 0 -9
4 family_5 sample_5 0 0 0 -9
5 family_6 sample_6 0 0 0 -9
6 family_7 sample_7 0 0 0 -9
7 family_8 sample_8 0 0 0 -9
8 family_9 sample_9 0 0 0 -9
9 family_10 sample_10 0 0 0 -9

In [20]:
pedfile.get_bim()


Out[20]:
chrom pos cm a1 a2
snp
marker_1 1 1 0 A T
marker_2 1 2 0 A T
marker_3 1 3 0 A T

In [21]:
for marker, genotypes in pedfile:
    print(marker, genotypes)


marker_1 [ 0  0  0  1  0  0 -1  2  1  0]
marker_2 [0 0 1 1 0 0 0 1 2 0]
marker_3 [0 0 0 0 1 1 0 0 0 1]

The newly created binary files are compatible with Plink.


In [22]:
from subprocess import Popen, PIPE

# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output", "--freq"],
             stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ plink.log ]
Analysis started: Fri Jan 20 09:20:55 2017

Options in effect:
	--noweb
	--bfile test_output
	--freq

Reading map (extended format) from [ test_output.bim ] 
3 markers to be included from [ test_output.bim ]
Reading pedigree information from [ test_output.fam ] 
10 individuals read from [ test_output.fam ] 
0 individuals with nonmissing phenotypes
Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)
Missing phenotype value is also -9
0 cases, 0 controls and 10 missing
0 males, 0 females, and 10 of unspecified sex
Warning, found 10 individuals with ambiguous sex codes
These individuals will be set to missing ( or use --allow-no-sex )
Writing list of these individuals to [ plink.nosex ]
Reading genotype bitfile from [ test_output.bed ] 
Detected that binary PED file is v1.00 SNP-major mode
Before frequency and genotyping pruning, there are 3 SNPs
10 founders and 0 non-founders found
Writing allele frequencies (founders-only) to [ plink.frq ] 

Analysis finished: Fri Jan 20 09:20:55 2017


In [23]:
with open("plink.frq", "r") as i_file:
    print(i_file.read(), end="")


 CHR        SNP   A1   A2          MAF  NCHROBS
   1   marker_1    A    T       0.2222       18
   1   marker_2    A    T         0.25       20
   1   marker_3    A    T         0.15       20

INDIVIDUAL-major format

The following examples shows how to write a binary file using the PyPlink module. The INDIVIDUAL-major format means that the binary file is written one sample at a time.

Files in INDIVIDUAL-major format is not readable by PyPlink. You need to convert it using Plink.

Note that PyPlink only writes the BED file. The user is required to create the FAM and BIM files.


In [24]:
# The genotypes for 3 markers and 10 samples (INDIVIDUAL-major)
all_genotypes = [
    [ 0, 0, 0],
    [ 0, 0, 0],
    [ 0, 1, 0],
    [ 1, 1, 0],
    [ 0, 0, 1],
    [ 0, 0, 1],
    [-1, 0, 0],
    [ 2, 1, 0],
    [ 1, 2, 0],
    [ 0, 0, 1],
]

# Writing the BED file using PyPlink
with PyPlink("test_output_2", "w", bed_format="INDIVIDUAL-major") as pedfile:
    for genotypes in all_genotypes:
        pedfile.write_genotypes(genotypes)

# Writing a dummy FAM file
with open("test_output_2.fam", "w") as fam_file:
    for i in range(10):
        print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
              sep=" ", file=fam_file)

# Writing a dummy BIM file
with open("test_output_2.bim", "w") as bim_file:
    for i in range(3):
        print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
              sep="\t", file=bim_file)

In [25]:
from subprocess import Popen, PIPE

# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output_2", "--freq", "--out", "plink_2"],
             stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ plink_2.log ]
Analysis started: Fri Jan 20 09:21:10 2017

Options in effect:
	--noweb
	--bfile test_output_2
	--freq
	--out plink_2

Reading map (extended format) from [ test_output_2.bim ] 
3 markers to be included from [ test_output_2.bim ]
Reading pedigree information from [ test_output_2.fam ] 
10 individuals read from [ test_output_2.fam ] 
0 individuals with nonmissing phenotypes
Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)
Missing phenotype value is also -9
0 cases, 0 controls and 10 missing
0 males, 0 females, and 10 of unspecified sex
Warning, found 10 individuals with ambiguous sex codes
These individuals will be set to missing ( or use --allow-no-sex )
Writing list of these individuals to [ plink_2.nosex ]
Reading genotype bitfile from [ test_output_2.bed ] 
Detected that binary PED file is v1.00 individual-major mode
Before frequency and genotyping pruning, there are 3 SNPs
Converting data to SNP-major format
10 founders and 0 non-founders found
Writing allele frequencies (founders-only) to [ plink_2.frq ] 

Analysis finished: Fri Jan 20 09:21:10 2017


In [26]:
with open("plink_2.frq", "r") as i_file:
    print(i_file.read(), end="")


 CHR        SNP   A1   A2          MAF  NCHROBS
   1   marker_1    A    T       0.2222       18
   1   marker_2    A    T         0.25       20
   1   marker_3    A    T         0.15       20