anhima.gt - Genotype transformations
In [1]:
    
import sys
sys.version_info
    
    Out[1]:
In [2]:
    
import numpy as np
np.random.seed(42)
import scipy.stats
import scipy.sparse
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
    
In [3]:
    
# simulate non-uniform variant positions
n_variants = 1000
p = 0
pos = [p]
for i in range(n_variants-1):
    gap = int(np.abs(np.cos(i/100))*100)
    p += gap
    pos.append(p)
pos = np.array(pos)
    
In [4]:
    
# simulate genotypes 
n_samples = 100
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
genotypes
    
    Out[4]:
In [5]:
    
genotypes.shape
    
    Out[5]:
In [6]:
    
n_called = anhima.gt.count_called(genotypes)
n_called
    
    Out[6]:
In [7]:
    
n_missing = anhima.gt.count_missing(genotypes)
n_missing
    
    Out[7]:
In [8]:
    
n_hom = anhima.gt.count_hom(genotypes)
n_hom
    
    Out[8]:
In [9]:
    
n_het = anhima.gt.count_het(genotypes)
n_het
    
    Out[9]:
In [10]:
    
n_hom_ref = anhima.gt.count_hom_ref(genotypes)
n_hom_ref
    
    Out[10]:
In [11]:
    
n_hom_alt = anhima.gt.count_hom_alt(genotypes)
n_hom_alt
    
    Out[11]:
In [12]:
    
n_hom == n_hom_ref + n_hom_alt
    
    Out[12]:
In [13]:
    
n_missing + n_hom_ref + n_het + n_hom_alt
    
    Out[13]:
In [14]:
    
anhima.gt.as_haplotypes(genotypes)
    
    Out[14]:
In [15]:
    
anhima.gt.as_n_alt(genotypes)
    
    Out[15]:
In [16]:
    
anhima.gt.as_012(genotypes)
    
    Out[16]:
In [17]:
    
anhima.gt.as_allele_counts(genotypes)
    
    Out[17]:
In [18]:
    
packed = anhima.gt.pack_diploid(genotypes)
packed.nbytes, genotypes.nbytes
    
    Out[18]:
In [19]:
    
packed
    
    Out[19]:
In [20]:
    
unpacked = anhima.gt.unpack_diploid(packed)
assert np.array_equal(unpacked, genotypes)
    
In [21]:
    
packed_sparse = scipy.sparse.csr_matrix(packed)
packed_sparse
    
    Out[21]:
In [22]:
    
# in this case, the sparse matrix is actually bigger than the original
# however, for real data where sparsity is higher, the sparse matrix
# may reduce data size further
packed_sparse.data.nbytes + packed_sparse.indices.nbytes + packed_sparse.indptr.nbytes
    
    Out[22]:
In [23]:
    
gn = anhima.gt.as_012(genotypes)
    
In [24]:
    
# plot missingness counts for sample 0
anhima.gt.plot_windowed_genotype_counts(pos, gn[:, 0], -1, 1000);
    
    
In [25]:
    
# plot missingness rate for sample 0
anhima.gt.plot_windowed_genotype_rate(pos, gn[:, 0], -1, 3000);
    
    
In [26]:
    
# plot heterozygosity density for sample 0
anhima.gt.plot_windowed_genotype_density(pos, gn[:, 0], 1, 1000);
    
    
In [27]:
    
# plot hom alt density for sample 0
anhima.gt.plot_windowed_genotype_density(pos, gn[:, 0], 2, 1000);
    
    
In [28]:
    
fig = plt.figure(figsize=(12, 8))
# plot genotypes
ax = plt.subplot2grid((8, 1), (0, 0), rowspan=7)
anhima.gt.plot_diploid_genotypes(gn, ax=ax)
# plot variant locator
ax = plt.subplot2grid((8, 1), (7, 0), rowspan=1)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);
    
    
In [29]:
    
# simulate continuous call data
#dp = np.random.randint(low=0, high=50, size=(n_variants*n_samples)).reshape(n_variants, n_samples)
dp = scipy.stats.norm(loc=25, scale=10).rvs(n_variants*n_samples).reshape(n_variants, n_samples)
dp[dp < 0] = 0
    
In [30]:
    
fig = plt.figure(figsize=(12, 8))
# plot dp
ax = plt.subplot2grid((8, 1), (0, 0), rowspan=7)
anhima.gt.plot_continuous_calldata(dp, ax=ax, pcolormesh_kwargs=dict(cmap='Greys'))
# plot variant locator
ax = plt.subplot2grid((8, 1), (7, 0), rowspan=1)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);
    
    
In [31]:
    
ax = anhima.gt.plot_genotype_counts_by_sample(gn)
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));
    
    
In [32]:
    
ax = anhima.gt.plot_genotype_counts_by_sample(gn, orientation='horizontal')
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));
    
    
In [33]:
    
fig, ax = plt.subplots(figsize=(12, 6))
anhima.gt.plot_genotype_counts_by_variant(gn, ax=ax)
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));
    
    
In [34]:
    
fig, ax = plt.subplots(figsize=(6, 12))
anhima.gt.plot_genotype_counts_by_variant(gn, ax=ax, orientation='horizontal')
ax.legend(loc='upper left', bbox_to_anchor=(1.03, 1));
    
    
In [35]:
    
fig = plt.figure(figsize=(16, 14))
layout = (16, 12)
ax = plt.subplot2grid(layout, (0, 0), rowspan=4, colspan=10)
anhima.gt.plot_genotype_counts_by_variant(gn, orientation='vertical', ax=ax)
ax.set_yticks([])
ax = plt.subplot2grid(layout, (4, 0), rowspan=8, colspan=10)
anhima.gt.plot_diploid_genotypes(gn, ax=ax)
ax = plt.subplot2grid(layout, (4, 10), rowspan=8, colspan=4)
anhima.gt.plot_genotype_counts_by_sample(gn, orientation='horizontal', ax=ax)
ax.set_xticks([])
ax = plt.subplot2grid(layout, (12, 0), rowspan=2, colspan=10)
anhima.loc.plot_variant_locator(pos, step=10, ax=ax);
    
    
In [36]:
    
fig, ax = plt.subplots(figsize=(16, 6))
anhima.gt.plot_continuous_calldata_by_sample(dp, ax=ax);
    
    
In [37]: