anhima.ped - Pedigrees


In [1]:
import numpy as np
np.random.seed(1)
import scipy.stats
import itertools
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.util
%aimport anhima.gt
%aimport anhima.ped

In [2]:
# simulate parent diplotype
n_variants = 1000
p_alt = .5
parent_diplotype_truth = scipy.stats.bernoulli.rvs(p_alt, size=1000*2).reshape(1000, 2)

# introduce some missingness
p_missing = .03
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
parent_diplotype = parent_diplotype_truth.copy()
parent_diplotype[loc_missing] = (-1, -1)

parent_diplotype


Out[2]:
array([[0, 1],
       [0, 0],
       [0, 0],
       ..., 
       [1, 0],
       [0, 1],
       [0, 1]])

In [3]:
anhima.gt.count_called(parent_diplotype)


Out[3]:
969

In [4]:
anhima.gt.count_hom_ref(parent_diplotype)


Out[4]:
214

In [5]:
anhima.gt.count_het(parent_diplotype)


Out[5]:
502

In [6]:
anhima.gt.count_hom_alt(parent_diplotype)


Out[6]:
253

In [7]:
# simulate gamete haplotypes
n_gametes = 20
gamete_haplotypes = np.empty((n_variants, n_gametes), dtype='i1')
n_crossovers = scipy.stats.poisson.rvs(.8, size=n_gametes)
p_mendel_error = .03

for i in range(n_gametes):
    
    # randomly choose which parent to start with
    parent = scipy.stats.bernoulli(.5).rvs()
    h = parent_diplotype_truth[:, parent].copy()
    
    # simulate crossovers
    loc_switches = sorted(np.random.randint(0, n_variants, size=n_crossovers[i]))
    for l in loc_switches:
        parent = 0 if parent == 1 else 1
        h[l:] = parent_diplotype_truth[l:, parent]
        
    # simulate errors
    n_me = scipy.stats.binom(p=p_mendel_error, n=n_variants).rvs()
    loc_me = random.sample(range(n_variants), n_me)
    h[loc_me] = scipy.stats.bernoulli.rvs(.5, size=n_me)
    
    # simulate missingness
    n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
    loc_missing = random.sample(range(n_variants), n_missing)
    h[loc_missing] = -1
    
    gamete_haplotypes[:, i] = h
    
gamete_haplotypes


Out[7]:
array([[ 0,  0,  0, ...,  1,  0,  1],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ..., 
       [ 1,  1,  1, ...,  1,  1, -1],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int8)

Diploid inheritance


In [8]:
inh = anhima.ped.diploid_inheritance(parent_diplotype, gamete_haplotypes)

In [9]:
inh


Out[9]:
array([[1, 1, 1, ..., 2, 1, 2],
       [3, 3, 3, ..., 3, 3, 3],
       [3, 3, 3, ..., 3, 3, 3],
       ..., 
       [1, 1, 1, ..., 1, 1, 7],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)

In [10]:
inheritance_colors = ['red',    # parent 1
                      'blue',   # parent 2
                      'green',  # parents both ref
                      'orange', # parents both alt
                      'black',  # non-parental
                      'yellow', # parents missing
                      'white']  # missing

In [11]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh, 
                                 colors=inheritance_colors,
                                 states=range(1, 8),
                                 ax=ax);



In [12]:
inh_seg = inh[anhima.gt.is_het(parent_diplotype), :]
inh_seg.shape


Out[12]:
(502, 20)

In [13]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg, 
                                 colors=inheritance_colors,
                                 states=range(8),
                                 ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));


Impute inheritance


In [14]:
# simulate random variant positions
pos = np.asarray(random.sample(range(100000), inh_seg.shape[0]))
pos.sort()

In [15]:
pos_impute = [0] + random.sample(range(100000), 100) + [200000]
pos_impute = np.array(pos_impute)
pos_impute.sort()
pos_impute


Out[15]:
array([     0,    371,   1947,   2068,   2799,   3379,   4082,   4288,
         5683,   6629,   6868,   7936,   8330,   8992,  12015,  12978,
        13022,  14293,  14408,  14896,  16011,  16505,  16614,  17038,
        17275,  18964,  19340,  20253,  22838,  22871,  24878,  26896,
        27152,  27801,  28305,  28477,  29038,  30539,  30947,  31186,
        31925,  32853,  34782,  36160,  36342,  36666,  39535,  42437,
        43064,  43713,  46955,  47012,  47453,  47537,  48147,  48566,
        52184,  53492,  54379,  54935,  55176,  58115,  58287,  58756,
        59245,  59600,  60036,  62422,  64139,  66039,  67271,  68347,
        68955,  70804,  70807,  71364,  72641,  74000,  76858,  76983,
        77182,  77812,  78116,  78269,  80623,  80694,  83819,  84109,
        84323,  84518,  85841,  86363,  86646,  86870,  89873,  90476,
        95236,  95321,  97218,  97290,  97735, 200000])

In [16]:
inh_imputed = anhima.ped.impute_inheritance_nearest(inh_seg, pos, pos_impute)

In [17]:
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_imputed, 
                                 colors=inheritance_colors,
                                 states=range(8),
                                 ax=ax);


Inheritance switches and blocks


In [18]:
# consider first gamete only
x = inh_seg[:, 0]
fig, ax = plt.subplots(figsize=(14, 1))
anhima.gt.plot_discrete_calldata(x[:, None], 
                                 colors=inheritance_colors,
                                 states=range(8),
                                 ax=ax)
ax.set_xticks(np.arange(0, x.size, 50));



In [19]:
states = anhima.ped.INHERIT_PARENT1, anhima.ped.INHERIT_PARENT2
df_switches = anhima.util.tabulate_state_transitions(x, states, pos)
df_switches.head()


Out[19]:
lidx ridx lpos rpos lval rval
0 2 3 350 1061 1 2
1 3 4 1061 1231 2 1
2 56 57 12735 12739 1 2
3 72 73 15292 15396 2 1
4 73 74 15396 15525 1 2

In [20]:
df_switches.tail()


Out[20]:
lidx ridx lpos rpos lval rval
19 409 410 82578 82727 2 1
20 429 430 86400 86499 1 2
21 430 431 86499 86563 2 1
22 474 475 94203 94493 1 2
23 475 476 94493 94692 2 1

In [21]:
df_blocks = anhima.util.tabulate_state_blocks(x, states, pos)
df_blocks.head()


Out[21]:
start_min_idx start_max_idx stop_min_idx stop_max_idx start_min_pos start_max_pos stop_min_pos stop_max_pos state support size_min size_max length_min length_max is_marginal
0 0 0 2 3 150 150 350 1061 1 3 2 2 200 911 True
1 2 3 3 4 350 1061 1061 1231 2 1 1 1 0 881 False
2 3 4 56 57 1061 1231 12735 12739 1 52 53 53 11504 11678 False
3 56 57 72 73 12735 12739 15292 15396 2 16 16 16 2553 2661 False
4 72 73 73 74 15292 15396 15396 15525 1 1 1 1 0 233 False

In [22]:
df_blocks.tail()


Out[22]:
start_min_idx start_max_idx stop_min_idx stop_max_idx start_min_pos start_max_pos stop_min_pos stop_max_pos state support size_min size_max length_min length_max is_marginal
20 409 410 429 430 82578 82727 86400 86499 1 19 20 20 3673 3921 False
21 429 430 430 431 86400 86499 86499 86563 2 1 1 1 0 163 False
22 430 431 474 475 86499 86563 94203 94493 1 42 44 44 7640 7994 False
23 474 475 475 476 94203 94493 94493 94692 2 1 1 1 0 489 False
24 475 476 501 501 94493 94692 99991 99991 1 26 26 26 5299 5498 True

In [23]:
df_all_switches = anhima.ped.tabulate_inheritance_switches(inh_seg, pos)
df_all_switches.head()


Out[23]:
lidx ridx lpos rpos lval rval
0 0 2 3 350 1061 1 2
1 3 4 1061 1231 2 1
2 56 57 12735 12739 1 2
3 72 73 15292 15396 2 1
4 73 74 15396 15525 1 2

In [24]:
df_switches.equals(df_all_switches.loc[0])


Out[24]:
True

In [25]:
df_all_blocks = anhima.ped.tabulate_inheritance_blocks(inh_seg, pos)
df_all_blocks.head()


Out[25]:
start_min_idx start_max_idx stop_min_idx stop_max_idx start_min_pos start_max_pos stop_min_pos stop_max_pos state support size_min size_max length_min length_max is_marginal
0 0 0 0 2 3 150 150 350 1061 1 3 2 2 200 911 True
1 2 3 3 4 350 1061 1061 1231 2 1 1 1 0 881 False
2 3 4 56 57 1061 1231 12735 12739 1 52 53 53 11504 11678 False
3 56 57 72 73 12735 12739 15292 15396 2 16 16 16 2553 2661 False
4 72 73 73 74 15292 15396 15396 15525 1 1 1 1 0 233 False

In [26]:
df_blocks.equals(df_all_blocks.loc[0])


Out[26]:
True

In [27]:
block_support, block_length_min = anhima.ped.inheritance_block_masks(inh_seg, pos)

In [28]:
# filter using block support
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_support < 2] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt, 
                                 colors=inheritance_colors,
                                 states=range(8),
                                 ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));



In [29]:
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)


Out[29]:
start_min_idx start_max_idx stop_min_idx stop_max_idx start_min_pos start_max_pos stop_min_pos stop_max_pos state support size_min size_max length_min length_max is_marginal
0 0 0 56 57 150 150 12735 12739 1 55 56 56 12585 12589 True
1 56 57 409 410 12735 12739 82578 82727 2 333 353 353 69839 69992 False
2 409 410 501 501 82578 82727 99991 99991 1 86 92 92 17264 17413 True

In [30]:
# filter using minimal block length
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_length_min < 100] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt, 
                                 colors=inheritance_colors,
                                 states=range(8),
                                 ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));



In [31]:
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)


Out[31]:
start_min_idx start_max_idx stop_min_idx stop_max_idx start_min_pos start_max_pos stop_min_pos stop_max_pos state support size_min size_max length_min length_max is_marginal
0 0 0 56 57 150 150 12735 12739 1 55 56 56 12585 12589 True
1 56 57 406 410 12735 12739 82242 82727 2 331 350 353 69503 69992 False
2 406 410 501 501 82242 82727 99991 99991 1 86 92 95 17264 17749 True

In [32]: