In [1]:
import numpy as np
np.random.seed(1)
import scipy.stats
import itertools
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.util
%aimport anhima.gt
%aimport anhima.ped
In [2]:
# simulate parent diplotype
n_variants = 1000
p_alt = .5
parent_diplotype_truth = scipy.stats.bernoulli.rvs(p_alt, size=1000*2).reshape(1000, 2)
# introduce some missingness
p_missing = .03
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
parent_diplotype = parent_diplotype_truth.copy()
parent_diplotype[loc_missing] = (-1, -1)
parent_diplotype
Out[2]:
In [3]:
anhima.gt.count_called(parent_diplotype)
Out[3]:
In [4]:
anhima.gt.count_hom_ref(parent_diplotype)
Out[4]:
In [5]:
anhima.gt.count_het(parent_diplotype)
Out[5]:
In [6]:
anhima.gt.count_hom_alt(parent_diplotype)
Out[6]:
In [7]:
# simulate gamete haplotypes
n_gametes = 20
gamete_haplotypes = np.empty((n_variants, n_gametes), dtype='i1')
n_crossovers = scipy.stats.poisson.rvs(.8, size=n_gametes)
p_mendel_error = .03
for i in range(n_gametes):
# randomly choose which parent to start with
parent = scipy.stats.bernoulli(.5).rvs()
h = parent_diplotype_truth[:, parent].copy()
# simulate crossovers
loc_switches = sorted(np.random.randint(0, n_variants, size=n_crossovers[i]))
for l in loc_switches:
parent = 0 if parent == 1 else 1
h[l:] = parent_diplotype_truth[l:, parent]
# simulate errors
n_me = scipy.stats.binom(p=p_mendel_error, n=n_variants).rvs()
loc_me = random.sample(range(n_variants), n_me)
h[loc_me] = scipy.stats.bernoulli.rvs(.5, size=n_me)
# simulate missingness
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
h[loc_missing] = -1
gamete_haplotypes[:, i] = h
gamete_haplotypes
Out[7]:
In [8]:
inh = anhima.ped.diploid_inheritance(parent_diplotype, gamete_haplotypes)
In [9]:
inh
Out[9]:
In [10]:
inheritance_colors = ['red', # parent 1
'blue', # parent 2
'green', # parents both ref
'orange', # parents both alt
'black', # non-parental
'yellow', # parents missing
'white'] # missing
In [11]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh,
colors=inheritance_colors,
states=range(1, 8),
ax=ax);
In [12]:
inh_seg = inh[anhima.gt.is_het(parent_diplotype), :]
inh_seg.shape
Out[12]:
In [13]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
In [14]:
# simulate random variant positions
pos = np.asarray(random.sample(range(100000), inh_seg.shape[0]))
pos.sort()
In [15]:
pos_impute = [0] + random.sample(range(100000), 100) + [200000]
pos_impute = np.array(pos_impute)
pos_impute.sort()
pos_impute
Out[15]:
In [16]:
inh_imputed = anhima.ped.impute_inheritance_nearest(inh_seg, pos, pos_impute)
In [17]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_imputed,
colors=inheritance_colors,
states=range(8),
ax=ax);
In [18]:
# consider first gamete only
x = inh_seg[:, 0]
fig, ax = plt.subplots(figsize=(14, 1))
anhima.gt.plot_discrete_calldata(x[:, None],
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, x.size, 50));
In [19]:
states = anhima.ped.INHERIT_PARENT1, anhima.ped.INHERIT_PARENT2
df_switches = anhima.util.tabulate_state_transitions(x, states, pos)
df_switches.head()
Out[19]:
In [20]:
df_switches.tail()
Out[20]:
In [21]:
df_blocks = anhima.util.tabulate_state_blocks(x, states, pos)
df_blocks.head()
Out[21]:
In [22]:
df_blocks.tail()
Out[22]:
In [23]:
df_all_switches = anhima.ped.tabulate_inheritance_switches(inh_seg, pos)
df_all_switches.head()
Out[23]:
In [24]:
df_switches.equals(df_all_switches.loc[0])
Out[24]:
In [25]:
df_all_blocks = anhima.ped.tabulate_inheritance_blocks(inh_seg, pos)
df_all_blocks.head()
Out[25]:
In [26]:
df_blocks.equals(df_all_blocks.loc[0])
Out[26]:
In [27]:
block_support, block_length_min = anhima.ped.inheritance_block_masks(inh_seg, pos)
In [28]:
# filter using block support
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_support < 2] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
In [29]:
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)
Out[29]:
In [30]:
# filter using minimal block length
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_length_min < 100] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
In [31]:
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)
Out[31]:
In [32]: