In [1]:
import sys
sys.path.insert(0, '../..')
import allel; print('allel', allel.__version__)
import h5py
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline


allel 1.0.0b6.dev0

In [2]:
callset_dir = '/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3/variation/crosses/ar3/hdf5/'

In [3]:
!ls -lh {callset_dir}


total 72G
-r--r--r-- 1 aliman aliman 7.5G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.2L.h5
-r--r--r-- 1 aliman aliman   81 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.2L.h5.md5
-r--r--r-- 1 aliman aliman 8.6G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.2R.h5
-r--r--r-- 1 aliman aliman   81 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.2R.h5.md5
-r--r--r-- 1 aliman aliman 6.3G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.3L.h5
-r--r--r-- 1 aliman aliman   81 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.3L.h5.md5
-r--r--r-- 1 aliman aliman 8.6G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.3R.h5
-r--r--r-- 1 aliman aliman   81 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.3R.h5.md5
-r--r--r-- 1 aliman aliman 4.0G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.2L.h5
-r--r--r-- 1 aliman aliman   86 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.2L.h5.md5
-r--r--r-- 1 aliman aliman 5.4G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.2R.h5
-r--r--r-- 1 aliman aliman   86 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.2R.h5.md5
-r--r--r-- 1 aliman aliman 3.7G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.3L.h5
-r--r--r-- 1 aliman aliman   86 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.3L.h5.md5
-r--r--r-- 1 aliman aliman 5.0G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.3R.h5
-r--r--r-- 1 aliman aliman   86 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.3R.h5.md5
-r--r--r-- 1 aliman aliman 3.2G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.2L.h5
-r--r--r-- 1 aliman aliman   96 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.2L.h5.md5
-r--r--r-- 1 aliman aliman 4.4G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.2R.h5
-r--r--r-- 1 aliman aliman   96 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.2R.h5.md5
-r--r--r-- 1 aliman aliman 2.9G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.3L.h5
-r--r--r-- 1 aliman aliman   96 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.3L.h5.md5
-r--r--r-- 1 aliman aliman 3.9G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.3R.h5
-r--r--r-- 1 aliman aliman   96 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.3R.h5.md5
-r--r--r-- 1 aliman aliman  33K Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.UNKN.h5
-r--r--r-- 1 aliman aliman   98 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.UNKN.h5.md5
-r--r--r-- 1 aliman aliman 1.6G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.X.h5
-r--r--r-- 1 aliman aliman   95 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.X.h5.md5
-r--r--r-- 1 aliman aliman  33K Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.Y_unplaced.h5
-r--r--r-- 1 aliman aliman  104 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.biallelic.Y_unplaced.h5.md5
-r--r--r-- 1 aliman aliman  33K Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.UNKN.h5
-r--r--r-- 1 aliman aliman   88 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.UNKN.h5.md5
-r--r--r-- 1 aliman aliman 2.0G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.X.h5
-r--r--r-- 1 aliman aliman   85 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.X.h5.md5
-r--r--r-- 1 aliman aliman  33K Apr 23  2015 ag1000g.crosses.phase1.ar3sites.pass.Y_unplaced.h5
-r--r--r-- 1 aliman aliman   94 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.pass.Y_unplaced.h5.md5
-r--r--r-- 1 aliman aliman 2.3G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.UNKN.h5
-r--r--r-- 1 aliman aliman   83 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.UNKN.h5.md5
-r--r--r-- 1 aliman aliman 3.2G Apr 23  2015 ag1000g.crosses.phase1.ar3sites.X.h5
-r--r--r-- 1 aliman aliman   80 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.X.h5.md5
-r--r--r-- 1 aliman aliman 8.4M Apr 23  2015 ag1000g.crosses.phase1.ar3sites.Y_unplaced.h5
-r--r--r-- 1 aliman aliman   89 Jul 15  2015 ag1000g.crosses.phase1.ar3sites.Y_unplaced.h5.md5

In [4]:
chrom = '2L'
callset = h5py.File(
    os.path.join(callset_dir, 'ag1000g.crosses.phase1.ar3sites.pass.%s.h5' % chrom),
    mode='r'
)
callset


Out[4]:
<HDF5 file "ag1000g.crosses.phase1.ar3sites.pass.2L.h5" (mode r)>

In [5]:
samples = list(callset[chrom]['samples'][:])
samples[:10]


Out[5]:
[b'AD0231-C',
 b'AD0232-C',
 b'AD0234-C',
 b'AD0235-C',
 b'AD0236-C',
 b'AD0237-C',
 b'AD0238-C',
 b'AD0239-C',
 b'AD0240-C',
 b'AD0241-C']

In [6]:
samples_29_2 = samples[:22]
samples_29_2


Out[6]:
[b'AD0231-C',
 b'AD0232-C',
 b'AD0234-C',
 b'AD0235-C',
 b'AD0236-C',
 b'AD0237-C',
 b'AD0238-C',
 b'AD0239-C',
 b'AD0240-C',
 b'AD0241-C',
 b'AD0242-C',
 b'AD0243-C',
 b'AD0244-C',
 b'AD0245-C',
 b'AD0246-C',
 b'AD0247-C',
 b'AD0248-C',
 b'AD0249-C',
 b'AD0250-C',
 b'AD0251-C',
 b'AD0252-C',
 b'AD0253-C']

In [7]:
genotype = allel.GenotypeChunkedArray(callset[chrom]['calldata']['genotype'])
genotype


Out[7]:
<GenotypeChunkedArray shape=(10377280, 80, 2) dtype=int8 chunks=(6553, 10, 2) nbytes=1.5G cbytes=40.7M cratio=38.9 compression=gzip compression_opts=1 values=h5py._hl.dataset.Dataset>
01234...7576777879
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
103772770/00/00/00/00/0...0/00/00/00/00/0
103772780/00/00/00/00/0...0/00/00/00/00/0
103772790/00/00/00/00/0...0/00/00/00/00/0

In [8]:
# take samples from a single cross
samples_cross = [samples.index(s) for s in samples_29_2]
genotype_cross = genotype.take(samples_cross, axis=1)
genotype_cross


Out[8]:
<GenotypeChunkedArray shape=(10377280, 22, 2) dtype=int8 chunks=(10135, 22, 2) nbytes=435.4M cbytes=13.1M cratio=33.1 compression=blosc compression_opts={'cname': 'lz4', 'shuffle': 1, 'clevel': 5} values=zarr.core.Array>
01234...1718192021
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
103772770/00/00/00/00/0...0/00/00/00/00/0
103772780/00/00/00/00/0...0/00/00/00/00/0
103772790/00/00/00/00/0...0/00/00/00/00/0

In [9]:
pos = callset[chrom]['variants/POS'][:]
pos


Out[9]:
array([   44688,    44691,    44732, ..., 49356426, 49356429, 49356435], dtype=int32)

New workflow


In [10]:
ac_cross = genotype_cross.count_alleles(max_allele=3)[:]
ac_cross


Out[10]:
<AlleleCountsArray shape=(10377280, 4) dtype=int32>
0123
044 0 0 0
144 0 0 0
244 0 0 0
......
1037727744 0 0 0
1037727844 0 0 0
1037727944 0 0 0

In [11]:
genotype_cross_seg = genotype_cross.compress(ac_cross.is_segregating(), axis=0)
genotype_cross_seg


Out[11]:
<GenotypeChunkedArray shape=(675172, 22, 2) dtype=int8 chunks=(5275, 22, 2) nbytes=28.3M cbytes=8.0M cratio=3.5 compression=blosc compression_opts={'cname': 'lz4', 'shuffle': 1, 'clevel': 5} values=zarr.core.Array>
01234...1718192021
01/10/00/10/10/1...0/10/10/10/10/1
11/10/00/10/10/1...0/10/10/10/10/1
20/10/00/00/10/1...0/10/00/10/10/0
......
6751690/10/00/10/10/0...0/00/00/00/10/0
6751700/10/00/10/10/0...0/00/00/00/10/0
6751710/10/00/10/10/0...0/00/00/00/10/0

In [12]:
pos_seg = pos.compress(ac_cross.is_segregating(), axis=0)

In [13]:
genotype_phased = allel.stats.phase_by_transmission(genotype_cross_seg, window_size=100)
genotype_phased


Out[13]:
<GenotypeArray shape=(675172, 22, 2) dtype=int8>
01234...1718192021
01|10|01|01|01|0...1|01|01|01|01|0
11|10|01|01|01|0...1|01|01|01|01|0
20|10|00|01|01|0...1|00|01|01|00|0
......
6751690|10|01|01|00|0...0|00|00|01|00|0
6751700|10|01|01|00|0...0|00|00|01|00|0
6751710|10|01|01|00|0...0|00|00|01|00|0

In [14]:
genotype_phased.is_phased


Out[14]:
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)

In [15]:
# check how many genotypes got phased
is_phased = genotype_phased.is_phased
np.count_nonzero(is_phased), is_phased.size


Out[15]:
(14106128, 14853784)

In [16]:
def plot_inheritance(g, parent, plot=True):

    # paint inheritance
    inh = allel.stats.paint_transmission(
        g[:, parent], 
        np.column_stack([
            g[:, parent],  # include parent
            g[:, 2:, parent]
        ])
    )

    # fix where phase is unknown
    inh[:, 0:2][~g.is_phased[:, parent]] = 0
    inh[:, 2:][~g.is_phased[:, 2:]] = 0

    # take a look...
    if plot:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.pcolormesh(inh.T, 
                      cmap=mpl.colors.ListedColormap(['gray', 'r', 'b', 'orange', 'g', 'k', 'w', 'w']),
                      vmin=0, vmax=7)
        ax.set_yticks(np.arange(inh.shape[1])+.5)
        ax.set_yticklabels(np.arange(inh.shape[1]))
        ax.autoscale(axis='both', tight=True)    
    
    return inh

In [17]:
parent_is_het = genotype_cross_seg[:, :2].is_het()
parent_is_het


Out[17]:
array([[False, False],
       [False, False],
       [ True, False],
       ..., 
       [ True, False],
       [ True, False],
       [ True, False]], dtype=bool)

In [18]:
plot_inheritance(genotype_phased[::100], parent=0);



In [19]:
flt = parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=0);



In [20]:
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=0);



In [21]:
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][100020:100070], parent=0)
plt.show()
genotype_phased[flt][100020:100070].displayall()


<GenotypeArray shape=(50, 22, 2) dtype=int8>
0123456789101112131415161718192021
02|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
10|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
22|00|00|00|00|00|00|02|00|02|00|02|00|00|00|00|02|00|02|00|00|02|0
31|00|00|00|00|00|00|00|00|01|00|01|00|00|01|00|01|00|01|00|00|01|0
40|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
50|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
61|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
70|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
81|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|0
91|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
101|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
111|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
120|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
131|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
141|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
151|01|10|10|11|10|10|11|10|11|11|11|10|10|11|10|11|10|11|10|10|11|1
160|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
170|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
180|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
191|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
200|30|03|03|00|03|03|00|03|00|00|00|03|03|00|03|00|03|00|03|03|00|0
211|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
220|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
230|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
240|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
250|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
260|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
270|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
281|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
290|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
300|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
312|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
320|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
331|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
341|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
350|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
362|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|0
371|00|00|00|01|00|00|01|00|00|01|01|00|00|00|00|01|00|01|00|00|01|0
381|00|00|00|01|00|00|01|00|00|01|01|00|00|00|00|01|00|01|00|00|01|0
391|00|00|00|01|00|00|01|00|00|01|01|00|00|00|00|01|00|01|00|00|01|0
401|00|00|00|01|00|00|01|00|00|01|01|00|00|00|00|01|00|01|00|00|01|0
411|00|00|00|01|00|00|01|00|01|01|01|00|00|00|00|01|00|01|00|00|01|0
421|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
430|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
441|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
450|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
460|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
470|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
481|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
491|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0

In [22]:
plot_inheritance(genotype_phased[::100], parent=1);



In [23]:
flt = parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=1);



In [24]:
flt = parent_is_het[:, 1] & ~parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=1);



In [25]:
inh = plot_inheritance(genotype_phased, parent=0, plot=False)
inh


Out[25]:
array([[4, 4, 4, ..., 4, 4, 4],
       [4, 4, 4, ..., 4, 4, 4],
       [1, 2, 1, ..., 2, 2, 1],
       ..., 
       [1, 2, 2, ..., 1, 2, 1],
       [1, 2, 2, ..., 1, 2, 1],
       [1, 2, 2, ..., 1, 2, 1]], dtype=uint8)

In [26]:
states = {allel.INHERIT_PARENT1, allel.INHERIT_PARENT2}

In [27]:
switch_points, transitions, observations = allel.opt.stats.state_transitions(inh[:, 0], states=states)

In [28]:
switch_points


Out[28]:
array([[    -1,      2],
       [675171,     -1]])

In [29]:
transitions


Out[29]:
array([[255,   1],
       [  1, 255]], dtype=uint8)

In [30]:
observations


Out[30]:
array([     0, 289168])

In [31]:
df_switches = allel.tabulate_state_transitions(inh[:, 0], states=states)
df_switches


Out[31]:
lstate rstate lidx ridx
0 255 1 -1 2
1 1 255 675171 -1

In [32]:
df_switches = allel.tabulate_state_transitions(inh[:, 1], states=states)
df_switches


Out[32]:
lstate rstate lidx ridx
0 255 2 -1 2
1 2 255 675171 -1

In [33]:
df_switches = allel.tabulate_state_transitions(inh[:, 2], states=states)
print(df_switches.shape)
df_switches.head()


(5365, 4)
Out[33]:
lstate rstate lidx ridx
0 255 1 -1 2
1 1 2 466 467
2 2 1 467 468
3 1 2 1575 1576
4 2 1 1577 1578

In [34]:
df_switches = allel.tabulate_state_transitions(inh[:, 3], states=states)
print(df_switches.shape)
df_switches.head()


(2568, 4)
Out[34]:
lstate rstate lidx ridx
0 255 2 -1 2
1 2 1 231 232
2 1 2 233 234
3 2 1 277 278
4 1 2 278 279

In [35]:
df_blocks = allel.tabulate_state_blocks(inh[:, 0], states=states, pos=pos_seg)
df_blocks


Out[35]:
state support start_lidx start_ridx stop_lidx stop_ridx size_min size_max is_marginal start_lpos start_rpos stop_lpos stop_rpos length_min length_max
0 1 289168 -1 2 675171 -1 675170 -1 True -1 44906 49356087 -1 49311182 -1

In [36]:
df_blocks = allel.tabulate_state_blocks(inh[:, 1], states=states, pos=pos_seg)
df_blocks


Out[36]:
state support start_lidx start_ridx stop_lidx stop_ridx size_min size_max is_marginal start_lpos start_rpos stop_lpos stop_rpos length_min length_max
0 2 289168 -1 2 675171 -1 675170 -1 True -1 44906 49356087 -1 49311182 -1

In [37]:
df_blocks = allel.tabulate_state_blocks(inh[:, 3], states=states, pos=pos_seg)
print(df_blocks.shape)
df_blocks.head()


(2567, 15)
Out[37]:
state support start_lidx start_ridx stop_lidx stop_ridx size_min size_max is_marginal start_lpos start_rpos stop_lpos stop_rpos length_min length_max
0 2 186 -1 2 231 232 230 -1 True -1 44906 357432 358268 312527 -1
1 1 2 231 232 233 234 2 2 False 357432 358268 358270 370879 3 13446
2 2 41 233 234 277 278 44 44 False 358270 370879 452221 452430 81343 94159
3 1 1 277 278 278 279 1 1 False 452221 452430 452430 452608 1 386
4 2 1386 278 279 1981 1982 1703 1703 False 452430 452608 1572585 1572815 1119978 1120384

In [38]:
pos_seg.shape


Out[38]:
(675172,)

In [39]:
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
plt.hist(df_blocks.length_min, bins=np.logspace(1, 8, 40))
plt.xscale('log');


Legacy workflow


In [41]:
# take only the parental genotypes
genotype_cross_parents = genotype_cross.take([0, 1], axis=1)[:]
genotype_cross_parents


Out[41]:
<GenotypeArray shape=(10377280, 2, 2) dtype=int8>
01
00/00/0
10/00/0
20/00/0
......
103772770/00/0
103772780/00/0
103772790/00/0

In [42]:
# find heterozygous genotypes in the parents
is_het_parents = genotype_cross_parents.is_het()
is_het_parents


Out[42]:
array([[False, False],
       [False, False],
       [False, False],
       ..., 
       [False, False],
       [False, False],
       [False, False]], dtype=bool)

In [43]:
# select variants where maternal genotype is het
g_mat_het = genotype_cross.compress(is_het_parents[:, 0] & ~is_het_parents[:, 1])[:]
g_mat_het


Out[43]:
<GenotypeArray shape=(220784, 22, 2) dtype=int8>
01234...1718192021
00/10/00/00/10/1...0/10/00/10/10/0
10/10/00/00/10/1...0/10/00/10/10/0
20/10/00/10/00/0...0/00/10/00/00/1
......
2207810/10/00/10/10/0...0/00/00/00/10/0
2207820/10/00/10/10/0...0/00/00/00/10/0
2207830/10/00/10/10/0...0/00/00/00/10/0

In [44]:
%time _ = allel.stats.phase_progeny_by_transmission(g_mat_het)


CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 30.1 ms

In [45]:
# step 1 - phase progeny by transmission
g_mat_het_phased = allel.stats.phase_progeny_by_transmission(g_mat_het)
g_mat_het_phased


Out[45]:
<GenotypeArray shape=(220784, 22, 2) dtype=int8>
01234...1718192021
00/10/00|01|01|0...1|00|01|01|00|0
10/10/00|01|01|0...1|00|01|01|00|0
20/10/01|00|00|0...0|01|00|00|01|0
......
2207810/10/01|01|00|0...0|00|00|01|00|0
2207820/10/01|01|00|0...0|00|00|01|00|0
2207830/10/01|01|00|0...0|00|00|01|00|0

In [46]:
%%time
_ = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)


CPU times: user 144 ms, sys: 0 ns, total: 144 ms
Wall time: 141 ms

In [47]:
# step 2 - phase mother by transmission
g_mat_het_phased_parent = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)

In [48]:
g_mat_het_phased_parent


Out[48]:
<GenotypeArray shape=(220784, 22, 2) dtype=int8>
01234...1718192021
00|10|00|01|01|0...1|00|01|01|00|0
10|10|00|01|01|0...1|00|01|01|00|0
21|00|01|00|00|0...0|01|00|00|01|0
......
2207810|10|01|01|00|0...0|00|00|01|00|0
2207820|10|01|01|00|0...0|00|00|01|00|0
2207830|10|01|01|00|0...0|00|00|01|00|0

In [49]:
inh_mat = allel.stats.paint_transmission(
    g_mat_het_phased_parent[:, 0], 
    np.column_stack([
        g_mat_het_phased_parent[:, 0],  # include parent
        g_mat_het_phased_parent[:, 2:, 0]
    ])
)
# fix where phase is unknown
inh_mat[~g_mat_het_phased_parent.is_phased[:, 0]] = 0
inh_mat[~g_mat_het_phased_parent.is_phased] = 0
inh_mat


Out[49]:
array([[1, 2, 1, ..., 2, 2, 1],
       [1, 2, 1, ..., 2, 2, 1],
       [1, 2, 1, ..., 2, 2, 1],
       ..., 
       [1, 2, 2, ..., 1, 2, 1],
       [1, 2, 2, ..., 1, 2, 1],
       [1, 2, 2, ..., 1, 2, 1]], dtype=uint8)

In [50]:
# check how many genotypes got phased
is_phased = g_mat_het_phased_parent.is_phased
np.count_nonzero(is_phased), is_phased.size


Out[50]:
(4849164, 4857248)

In [51]:
np.min(inh_mat)


Out[51]:
0

In [52]:
np.max(inh_mat)


Out[52]:
2

In [53]:
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[::100].T, cmap=mpl.colors.ListedColormap(['w', 'r', 'b']))
ax.autoscale(axis='both', tight=True)



In [54]:
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[103010:103100].T, 
              cmap=mpl.colors.ListedColormap(['w', 'r', 'b']),
              vmin=0, vmax=2)
ax.set_yticks(np.arange(inh_mat.shape[1])+.5)
ax.set_yticklabels(np.arange(inh_mat.shape[1]))
ax.autoscale(axis='both', tight=True)



In [55]:
g_mat_het_phased_parent[103010:103100].displayall()


<GenotypeArray shape=(90, 22, 2) dtype=int8>
0123456789101112131415161718192021
01|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
12|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
21|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
31|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|0
41|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
52|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
60|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
71|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
80|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
91|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
101|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
111|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
121|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
131|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
140|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
151|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
161|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
170|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
180|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
191|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
200|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
211|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
221|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
231|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
240|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
250|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
260|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
270|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
280|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
290|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
300|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
311|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|0
320|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
331|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
341|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
350|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
361|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
371|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
380|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
391|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
400|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
410|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
422|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
430|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
440|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
450|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
461|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
471|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
480|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
490|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
500|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
510|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
520|10|01|00|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|00|00|0
530|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
541|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
553|00|00|00|03|00|00|03|00|03|03|03|00|00|03|00|03|00|03|00|00|03|0
560|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
571|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
580|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
591|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
600|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
611|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
621|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
630|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
642|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|00|0
650|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
661|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
672|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
681|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
692|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
701|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
711|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
721|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
732|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
740|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
750|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
760|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
770|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
780|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
792|00|00|00|02|00|00|02|00|02|02|02|00|00|02|00|02|00|02|00|00|02|0
801|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
811|00|00|00|01|00|00|01|00|01|01|01|00|00|01|00|01|00|01|00|00|01|0
820|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
830|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
840|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
850|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
860|20|02|02|00|02|02|00|02|00|00|00|02|02|00|02|00|02|00|02|02|00|0
870|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
880|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0
890|10|01|01|00|01|01|00|01|00|00|00|01|01|00|01|00|01|00|01|01|00|0

In [ ]:


In [ ]: