`anhima.gt` - Genotype transformations



In [1]:

    
import sys
sys.version_info









    Out[1]:





sys.version_info(major=3, minor=4, micro=2, releaselevel='final', serial=0)



In [2]:

    
import numpy as np
np.random.seed(42)
import scipy.stats
import scipy.sparse
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt

Simulate some genotypes



In [3]:

    
# simulate non-uniform variant positions
n_variants = 1000
p = 0
pos = [p]
for i in range(n_variants-1):
    gap = int(np.abs(np.cos(i/100))*100)
    p += gap
    pos.append(p)
pos = np.array(pos)



In [4]:

    
# simulate genotypes 
n_samples = 100
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
genotypes









    Out[4]:





array([[[ 0,  1],
        [ 1,  0],
        [ 1,  0],
        ..., 
        [ 0,  1],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[-1, -1],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [-1, -1],
        [ 0,  0],
        [ 0,  0]],

       ..., 
       [[ 0,  1],
        [ 1,  0],
        [-1, -1],
        ..., 
        [ 0,  0],
        [-1, -1],
        [ 1,  0]],

       [[ 1,  1],
        [ 1,  1],
        [ 1,  1],
        ..., 
        [ 1,  1],
        [ 1,  1],
        [ 1,  1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [ 0,  0],
        [ 0,  0],
        [ 0,  1]]], dtype=int8)



In [5]:

    
genotypes.shape









    Out[5]:





(1000, 100, 2)

Count genotypes



In [6]:

    
n_called = anhima.gt.count_called(genotypes)
n_called









    Out[6]:





89945



In [7]:

    
n_missing = anhima.gt.count_missing(genotypes)
n_missing









    Out[7]:





10055



In [8]:

    
n_hom = anhima.gt.count_hom(genotypes)
n_hom









    Out[8]:





68488



In [9]:

    
n_het = anhima.gt.count_het(genotypes)
n_het









    Out[9]:





21457



In [10]:

    
n_hom_ref = anhima.gt.count_hom_ref(genotypes)
n_hom_ref









    Out[10]:





45085



In [11]:

    
n_hom_alt = anhima.gt.count_hom_alt(genotypes)
n_hom_alt









    Out[11]:





23403



In [12]:

    
n_hom == n_hom_ref + n_hom_alt









    Out[12]:





True



In [13]:

    
n_missing + n_hom_ref + n_het + n_hom_alt









    Out[13]:





100000

Transform genotypes



In [14]:

    
anhima.gt.as_haplotypes(genotypes)









    Out[14]:





array([[ 0,  1,  1, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  0,  0],
       [-1, -1,  0, ...,  0,  0,  0],
       ..., 
       [ 0,  1,  1, ..., -1,  1,  0],
       [ 1,  1,  1, ...,  1,  1,  1],
       [ 0,  0,  0, ...,  0,  0,  1]], dtype=int8)



In [15]:

    
anhima.gt.as_n_alt(genotypes)









    Out[15]:





array([[1, 1, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 1, 0, ..., 0, 0, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)



In [16]:

    
anhima.gt.as_012(genotypes)









    Out[16]:





array([[ 1,  1,  1, ...,  1,  0,  1],
       [ 0,  0,  0, ...,  0,  0,  0],
       [-1,  0,  0, ..., -1,  0,  0],
       ..., 
       [ 1,  1, -1, ...,  0, -1,  1],
       [ 2,  2,  2, ...,  2,  2,  2],
       [ 0,  0,  0, ...,  0,  0,  1]], dtype=int8)



In [17]:

    
anhima.gt.as_allele_counts(genotypes)









    Out[17]:





array([[[1, 1],
        [1, 1],
        [1, 1],
        ..., 
        [1, 1],
        [2, 0],
        [1, 1]],

       [[2, 0],
        [2, 0],
        [2, 0],
        ..., 
        [2, 0],
        [2, 0],
        [2, 0]],

       [[0, 0],
        [2, 0],
        [2, 0],
        ..., 
        [0, 0],
        [2, 0],
        [2, 0]],

       ..., 
       [[1, 1],
        [1, 1],
        [0, 0],
        ..., 
        [2, 0],
        [0, 0],
        [1, 1]],

       [[0, 2],
        [0, 2],
        [0, 2],
        ..., 
        [0, 2],
        [0, 2],
        [0, 2]],

       [[2, 0],
        [2, 0],
        [2, 0],
        ..., 
        [2, 0],
        [2, 0],
        [1, 1]]], dtype=uint8)



In [18]:

    
packed = anhima.gt.pack_diploid(genotypes)
packed.nbytes, genotypes.nbytes









    Out[18]:





(100000, 200000)



In [19]:

    
packed









    Out[19]:





array([[  1,  16,  16, ...,   1,   0,   1],
       [  0,   0,   0, ...,   0,   0,   0],
       [239,   0,   0, ..., 239,   0,   0],
       ..., 
       [  1,  16, 239, ...,   0, 239,  16],
       [ 17,  17,  17, ...,  17,  17,  17],
       [  0,   0,   0, ...,   0,   0,   1]], dtype=uint8)



In [20]:

    
unpacked = anhima.gt.unpack_diploid(packed)
assert np.array_equal(unpacked, genotypes)



In [21]:

    
packed_sparse = scipy.sparse.csr_matrix(packed)
packed_sparse









    Out[21]:





<1000x100 sparse matrix of type '<class 'numpy.uint8'>'
	with 54915 stored elements in Compressed Sparse Row format>



In [22]:

    
# in this case, the sparse matrix is actually bigger than the original
# however, for real data where sparsity is higher, the sparse matrix
# may reduce data size further
packed_sparse.data.nbytes + packed_sparse.indices.nbytes + packed_sparse.indptr.nbytes









    Out[22]:





278579

Plot genotypes for a single sample



In [23]:

    
gn = anhima.gt.as_012(genotypes)



In [24]:

    
# plot missingness counts for sample 0
anhima.gt.plot_windowed_genotype_counts(pos, gn[:, 0], -1, 1000);



In [25]:

    
# plot missingness rate for sample 0
anhima.gt.plot_windowed_genotype_rate(pos, gn[:, 0], -1, 3000);



In [26]:

    
# plot heterozygosity density for sample 0
anhima.gt.plot_windowed_genotype_density(pos, gn[:, 0], 1, 1000);



In [27]:

    
# plot hom alt density for sample 0
anhima.gt.plot_windowed_genotype_density(pos, gn[:, 0], 2, 1000);

Plot genotypes for multiple samples



In [28]:

    
fig = plt.figure(figsize=(12, 8))

# plot genotypes
ax = plt.subplot2grid((8, 1), (0, 0), rowspan=7)
anhima.gt.plot_diploid_genotypes(gn, ax=ax)

# plot variant locator
ax = plt.subplot2grid((8, 1), (7, 0), rowspan=1)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);



In [29]:

    
# simulate continuous call data
#dp = np.random.randint(low=0, high=50, size=(n_variants*n_samples)).reshape(n_variants, n_samples)
dp = scipy.stats.norm(loc=25, scale=10).rvs(n_variants*n_samples).reshape(n_variants, n_samples)
dp[dp < 0] = 0



In [30]:

    
fig = plt.figure(figsize=(12, 8))

# plot dp
ax = plt.subplot2grid((8, 1), (0, 0), rowspan=7)
anhima.gt.plot_continuous_calldata(dp, ax=ax, pcolormesh_kwargs=dict(cmap='Greys'))

# plot variant locator
ax = plt.subplot2grid((8, 1), (7, 0), rowspan=1)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);



In [31]:

    
ax = anhima.gt.plot_genotype_counts_by_sample(gn)
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));



In [32]:

    
ax = anhima.gt.plot_genotype_counts_by_sample(gn, orientation='horizontal')
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));



In [33]:

    
fig, ax = plt.subplots(figsize=(12, 6))
anhima.gt.plot_genotype_counts_by_variant(gn, ax=ax)
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));



In [34]:

    
fig, ax = plt.subplots(figsize=(6, 12))
anhima.gt.plot_genotype_counts_by_variant(gn, ax=ax, orientation='horizontal')
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));



In [35]:

    
fig = plt.figure(figsize=(16, 14))

layout = (16, 12)

ax = plt.subplot2grid(layout, (0, 0), rowspan=4, colspan=10)
anhima.gt.plot_genotype_counts_by_variant(gn, orientation='vertical', ax=ax)
ax.set_yticks([])

ax = plt.subplot2grid(layout, (4, 0), rowspan=8, colspan=10)
anhima.gt.plot_diploid_genotypes(gn, ax=ax)

ax = plt.subplot2grid(layout, (4, 10), rowspan=8, colspan=4)
anhima.gt.plot_genotype_counts_by_sample(gn, orientation='horizontal', ax=ax)
ax.set_xticks([])

ax = plt.subplot2grid(layout, (12, 0), rowspan=2, colspan=10)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);



In [36]:

    
fig, ax = plt.subplots(figsize=(16, 6))
anhima.gt.plot_continuous_calldata_by_sample(dp, ax=ax);



In [37]:

anhima.gt - Genotype transformations

Simulate some genotypes

Count genotypes

Transform genotypes

Plot genotypes for a single sample

Plot genotypes for multiple samples

`anhima.gt` - Genotype transformations