notebook.community

Edit and run



In [1]:

    
from __future__ import print_function, division
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import vcfnp
vcfnp.__version__









    Out[1]:





'2.0.0'



In [2]:

    
# VCF file name
filename = 'fixture/sample.vcf'



In [3]:

    
# load data from fixed fields (including INFO)
v = vcfnp.variants(filename, cache=True).view(np.recarray)









    



[vcfnp] 2015-01-23 11:22:59.232507 :: caching is enabled
[vcfnp] 2015-01-23 11:22:59.232909 :: cache file available
[vcfnp] 2015-01-23 11:22:59.233095 :: loading from cache file fixture/sample.vcf.vcfnp_cache/variants.npy



In [4]:

    
v.dtype









    Out[4]:





dtype([('CHROM', 'S12'), ('POS', '<i4'), ('ID', 'S12'), ('REF', 'S12'), ('ALT', 'S12'), ('QUAL', '<f4'), ('FILTER_q10', '?'), ('FILTER_s50', '?'), ('FILTER_PASS', '?'), ('num_alleles', 'u1'), ('is_snp', '?'), ('svlen', '<i4'), ('AA', 'S12'), ('AC', '<u2'), ('AF', '<f2'), ('AN', '<u2'), ('DB', '?'), ('DP', '<i4'), ('H2', '?'), ('NS', '<i4')])



In [5]:

    
v.CHROM









    Out[5]:





chararray([b'19', b'19', b'20', b'20', b'20', b'20', b'20', b'20', b'X'], 
      dtype='|S12')



In [6]:

    
v.POS









    Out[6]:





array([    111,     112,   14370,   17330, 1110696, 1230237, 1234567,
       1235237,      10], dtype=int32)



In [7]:

    
v.DP









    Out[7]:





array([ 0,  0, 14, 11, 10, 13,  9,  0,  0], dtype=int32)



In [8]:

    
# print some simple variant metrics
print('found %s variants (%s SNPs)' % (v.size, np.count_nonzero(v.is_snp)))
print('QUAL mean (std): %s (%s)' % (np.mean(v.QUAL), np.std(v.QUAL)))









    



found 9 variants (5 SNPs)
QUAL mean (std): 25.0667 (22.816)



In [9]:

    
# plot a histogram of variant depth
fig = plt.figure(1)
ax = fig.add_subplot(111)
ax.hist(v.DP)
ax.set_title('DP histogram')
ax.set_xlabel('DP');



In [10]:

    
# load data from sample columns
c = vcfnp.calldata_2d(filename, cache=True).view(np.recarray)









    



[vcfnp] 2015-01-23 11:22:59.494181 :: caching is enabled
[vcfnp] 2015-01-23 11:22:59.494571 :: cache file available
[vcfnp] 2015-01-23 11:22:59.494840 :: loading from cache file fixture/sample.vcf.vcfnp_cache/calldata_2d.npy



In [11]:

    
c.dtype









    Out[11]:





dtype([('is_called', '?'), ('is_phased', '?'), ('genotype', 'i1', (2,)), ('DP', '<u2'), ('GQ', 'u1'), ('GT', 'S3'), ('HQ', '<i4', (2,))])



In [12]:

    
c.genotype









    Out[12]:





array([[[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 1,  0],
        [ 1,  1]],

       [[ 0,  0],
        [ 0,  1],
        [ 0,  0]],

       [[ 1,  2],
        [ 2,  1],
        [ 2,  2]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  1],
        [ 0,  2],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [-1, -1]],

       [[ 0, -1],
        [ 0,  1],
        [ 0,  2]]], dtype=int8)



In [13]:

    
# print some simple genotype metrics
count_phased = np.count_nonzero(c.is_phased)
count_variant = np.count_nonzero(np.any(c.genotype > 0, axis=2))
count_missing = np.count_nonzero(~c.is_called)
print('calls (phased, variant, missing): %s (%s, %s, %s)'
    % (c.flatten().size, count_phased, count_variant, count_missing))









    



calls (phased, variant, missing): 27 (14, 12, 2)



In [14]:

    
# plot a histogram of genotype quality
fig = plt.figure(2)
ax = fig.add_subplot(111)
ax.hist(c.GQ.flatten())
ax.set_title('GQ histogram')
ax.set_xlabel('GQ');