In [1]:
from __future__ import print_function, division
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import vcfnp
vcfnp.__version__
Out[1]:
In [2]:
# VCF file name
filename = 'fixture/sample.vcf'
In [3]:
# load data from fixed fields (including INFO)
v = vcfnp.variants(filename, cache=True).view(np.recarray)
In [4]:
v.dtype
Out[4]:
In [5]:
v.CHROM
Out[5]:
In [6]:
v.POS
Out[6]:
In [7]:
v.DP
Out[7]:
In [8]:
# print some simple variant metrics
print('found %s variants (%s SNPs)' % (v.size, np.count_nonzero(v.is_snp)))
print('QUAL mean (std): %s (%s)' % (np.mean(v.QUAL), np.std(v.QUAL)))
In [9]:
# plot a histogram of variant depth
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.hist(v.DP)
ax.set_title('DP histogram')
ax.set_xlabel('DP');
In [10]:
# load data from sample columns
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)
In [11]:
c.dtype
Out[11]:
In [12]:
c.genotype
Out[12]:
In [13]:
# print some simple genotype metrics
count_phased = np.count_nonzero(c.is_phased)
count_variant = np.count_nonzero(np.any(c.genotype > 0, axis=2))
count_missing = np.count_nonzero(~c.is_called)
print('calls (phased, variant, missing): %s (%s, %s, %s)'
% (c.flatten().size, count_phased, count_variant, count_missing))
In [14]:
# plot a histogram of genotype quality
fig = plt.figure(2)
ax = fig.add_subplot(111)
ax.hist(c.GQ.flatten())
ax.set_title('GQ histogram')
ax.set_xlabel('GQ');