In [1]:
from __future__ import print_function, division
In [2]:
from pyplink import PyPlink
The Plink
softwares provides a testing dataset on the resources page. It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18.
In [3]:
import zipfile
try:
from urllib.request import urlretrieve
except ImportError:
from urllib import urlretrieve
# Downloading the demo data from Plink webset
urlretrieve(
"http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip",
"hapmap_r23a.zip",
)
# Extracting the archive content
with zipfile.ZipFile("hapmap_r23a.zip", "r") as z:
z.extractall(".")
In [4]:
pedfile = PyPlink("hapmap_r23a")
In [5]:
print("{:,d} samples and {:,d} markers".format(
pedfile.get_nb_samples(),
pedfile.get_nb_markers(),
))
In [6]:
all_samples = pedfile.get_fam()
all_samples.head()
Out[6]:
In [7]:
all_markers = pedfile.get_bim()
all_markers.head()
Out[7]:
In [8]:
for marker_id, genotypes in pedfile:
print(marker_id)
print(genotypes)
break
In [9]:
for marker_id, genotypes in pedfile.iter_geno():
print(marker_id)
print(genotypes)
break
In [10]:
for marker_id, genotypes in pedfile.iter_acgt_geno():
print(marker_id)
print(genotypes)
break
In [11]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
In [12]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
In [13]:
pedfile.get_geno_marker("rs7619974")
Out[13]:
In [14]:
pedfile.get_acgt_geno_marker("rs7619974")
Out[14]:
In [15]:
# Getting the Y markers
y_markers = all_markers[all_markers.chrom == 24].index.values
# Getting the males
males = all_samples.gender == 1
# Cycling through the Y markers
for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):
male_genotypes = genotypes[males.values]
print("{:,d} total genotypes".format(len(genotypes)))
print("{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})".format(
len(male_genotypes),
males.sum(),
marker_id,
all_markers.loc[marker_id, "chrom"],
all_markers.loc[marker_id, "pos"],
))
break
In [16]:
# Getting the founders
founders = (all_samples.father == "0") & (all_samples.mother == "0")
# Computing the MAF
markers = ["rs7619974", "rs2949048", "rs16941434"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
valid_genotypes = genotypes[founders.values & (genotypes != -1)]
maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)
print(marker_id, round(maf, 6), sep="\t")
The following examples shows how to write a binary file using the PyPlink
module. The SNP-major format is the default. It means that the binary file is written one marker at a time.
Note that
PyPlink
only writes theBED
file. The user is required to create theFAM
andBIM
files.
In [17]:
# The genotypes for 3 markers and 10 samples
all_genotypes = [
[0, 0, 0, 1, 0, 0, -1, 2, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 1, 2, 0],
[0, 0, 0, 0, 1, 1, 0, 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output", "w") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
In [18]:
# Checking the content of the newly created binary files
pedfile = PyPlink("test_output")
In [19]:
pedfile.get_fam()
Out[19]:
In [20]:
pedfile.get_bim()
Out[20]:
In [21]:
for marker, genotypes in pedfile:
print(marker, genotypes)
The newly created binary files are compatible with Plink.
In [22]:
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output", "--freq"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
In [23]:
with open("plink.frq", "r") as i_file:
print(i_file.read(), end="")
The following examples shows how to write a binary file using the PyPlink
module. The INDIVIDUAL-major format means that the binary file is written one sample at a time.
Files in INDIVIDUAL-major format is not readable by PyPlink
. You need to convert it using Plink.
Note that
PyPlink
only writes theBED
file. The user is required to create theFAM
andBIM
files.
In [24]:
# The genotypes for 3 markers and 10 samples (INDIVIDUAL-major)
all_genotypes = [
[ 0, 0, 0],
[ 0, 0, 0],
[ 0, 1, 0],
[ 1, 1, 0],
[ 0, 0, 1],
[ 0, 0, 1],
[-1, 0, 0],
[ 2, 1, 0],
[ 1, 2, 0],
[ 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output_2", "w", bed_format="INDIVIDUAL-major") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output_2.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output_2.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
In [25]:
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output_2", "--freq", "--out", "plink_2"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
In [26]:
with open("plink_2.frq", "r") as i_file:
print(i_file.read(), end="")