In [1]:
    
import sys
sys.path.insert(0, '../..')
import allel; print('allel', allel.__version__)
import h5py
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
    
    
In [2]:
    
callset_dir = '/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3/variation/crosses/ar3/hdf5/'
    
In [3]:
    
!ls -lh {callset_dir}
    
    
In [4]:
    
chrom = '2L'
callset = h5py.File(
    os.path.join(callset_dir, 'ag1000g.crosses.phase1.ar3sites.pass.%s.h5' % chrom),
    mode='r'
)
callset
    
    Out[4]:
In [5]:
    
samples = list(callset[chrom]['samples'][:])
samples[:10]
    
    Out[5]:
In [6]:
    
samples_29_2 = samples[:22]
samples_29_2
    
    Out[6]:
In [7]:
    
genotype = allel.GenotypeChunkedArray(callset[chrom]['calldata']['genotype'])
genotype
    
    Out[7]:
In [8]:
    
# take samples from a single cross
samples_cross = [samples.index(s) for s in samples_29_2]
genotype_cross = genotype.take(samples_cross, axis=1)
genotype_cross
    
    Out[8]:
In [9]:
    
pos = callset[chrom]['variants/POS'][:]
pos
    
    Out[9]:
In [10]:
    
ac_cross = genotype_cross.count_alleles(max_allele=3)[:]
ac_cross
    
    Out[10]:
In [11]:
    
genotype_cross_seg = genotype_cross.compress(ac_cross.is_segregating(), axis=0)
genotype_cross_seg
    
    Out[11]:
In [12]:
    
pos_seg = pos.compress(ac_cross.is_segregating(), axis=0)
    
In [13]:
    
genotype_phased = allel.stats.phase_by_transmission(genotype_cross_seg, window_size=100)
genotype_phased
    
    Out[13]:
In [14]:
    
genotype_phased.is_phased
    
    Out[14]:
In [15]:
    
# check how many genotypes got phased
is_phased = genotype_phased.is_phased
np.count_nonzero(is_phased), is_phased.size
    
    Out[15]:
In [16]:
    
def plot_inheritance(g, parent, plot=True):
    # paint inheritance
    inh = allel.stats.paint_transmission(
        g[:, parent], 
        np.column_stack([
            g[:, parent],  # include parent
            g[:, 2:, parent]
        ])
    )
    # fix where phase is unknown
    inh[:, 0:2][~g.is_phased[:, parent]] = 0
    inh[:, 2:][~g.is_phased[:, 2:]] = 0
    # take a look...
    if plot:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.pcolormesh(inh.T, 
                      cmap=mpl.colors.ListedColormap(['gray', 'r', 'b', 'orange', 'g', 'k', 'w', 'w']),
                      vmin=0, vmax=7)
        ax.set_yticks(np.arange(inh.shape[1])+.5)
        ax.set_yticklabels(np.arange(inh.shape[1]))
        ax.autoscale(axis='both', tight=True)    
    
    return inh
    
In [17]:
    
parent_is_het = genotype_cross_seg[:, :2].is_het()
parent_is_het
    
    Out[17]:
In [18]:
    
plot_inheritance(genotype_phased[::100], parent=0);
    
    
In [19]:
    
flt = parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=0);
    
    
In [20]:
    
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=0);
    
    
In [21]:
    
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][100020:100070], parent=0)
plt.show()
genotype_phased[flt][100020:100070].displayall()
    
    
    
In [22]:
    
plot_inheritance(genotype_phased[::100], parent=1);
    
    
In [23]:
    
flt = parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=1);
    
    
In [24]:
    
flt = parent_is_het[:, 1] & ~parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=1);
    
    
In [25]:
    
inh = plot_inheritance(genotype_phased, parent=0, plot=False)
inh
    
    Out[25]:
In [26]:
    
states = {allel.INHERIT_PARENT1, allel.INHERIT_PARENT2}
    
In [27]:
    
switch_points, transitions, observations = allel.opt.stats.state_transitions(inh[:, 0], states=states)
    
In [28]:
    
switch_points
    
    Out[28]:
In [29]:
    
transitions
    
    Out[29]:
In [30]:
    
observations
    
    Out[30]:
In [31]:
    
df_switches = allel.tabulate_state_transitions(inh[:, 0], states=states)
df_switches
    
    Out[31]:
In [32]:
    
df_switches = allel.tabulate_state_transitions(inh[:, 1], states=states)
df_switches
    
    Out[32]:
In [33]:
    
df_switches = allel.tabulate_state_transitions(inh[:, 2], states=states)
print(df_switches.shape)
df_switches.head()
    
    
    Out[33]:
In [34]:
    
df_switches = allel.tabulate_state_transitions(inh[:, 3], states=states)
print(df_switches.shape)
df_switches.head()
    
    
    Out[34]:
In [35]:
    
df_blocks = allel.tabulate_state_blocks(inh[:, 0], states=states, pos=pos_seg)
df_blocks
    
    Out[35]:
In [36]:
    
df_blocks = allel.tabulate_state_blocks(inh[:, 1], states=states, pos=pos_seg)
df_blocks
    
    Out[36]:
In [37]:
    
df_blocks = allel.tabulate_state_blocks(inh[:, 3], states=states, pos=pos_seg)
print(df_blocks.shape)
df_blocks.head()
    
    
    Out[37]:
In [38]:
    
pos_seg.shape
    
    Out[38]:
In [39]:
    
import matplotlib.pyplot as plt
%matplotlib inline
    
In [40]:
    
plt.hist(df_blocks.length_min, bins=np.logspace(1, 8, 40))
plt.xscale('log');
    
    
In [41]:
    
# take only the parental genotypes
genotype_cross_parents = genotype_cross.take([0, 1], axis=1)[:]
genotype_cross_parents
    
    Out[41]:
In [42]:
    
# find heterozygous genotypes in the parents
is_het_parents = genotype_cross_parents.is_het()
is_het_parents
    
    Out[42]:
In [43]:
    
# select variants where maternal genotype is het
g_mat_het = genotype_cross.compress(is_het_parents[:, 0] & ~is_het_parents[:, 1])[:]
g_mat_het
    
    Out[43]:
In [44]:
    
%time _ = allel.stats.phase_progeny_by_transmission(g_mat_het)
    
    
In [45]:
    
# step 1 - phase progeny by transmission
g_mat_het_phased = allel.stats.phase_progeny_by_transmission(g_mat_het)
g_mat_het_phased
    
    Out[45]:
In [46]:
    
%%time
_ = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)
    
    
In [47]:
    
# step 2 - phase mother by transmission
g_mat_het_phased_parent = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)
    
In [48]:
    
g_mat_het_phased_parent
    
    Out[48]:
In [49]:
    
inh_mat = allel.stats.paint_transmission(
    g_mat_het_phased_parent[:, 0], 
    np.column_stack([
        g_mat_het_phased_parent[:, 0],  # include parent
        g_mat_het_phased_parent[:, 2:, 0]
    ])
)
# fix where phase is unknown
inh_mat[~g_mat_het_phased_parent.is_phased[:, 0]] = 0
inh_mat[~g_mat_het_phased_parent.is_phased] = 0
inh_mat
    
    Out[49]:
In [50]:
    
# check how many genotypes got phased
is_phased = g_mat_het_phased_parent.is_phased
np.count_nonzero(is_phased), is_phased.size
    
    Out[50]:
In [51]:
    
np.min(inh_mat)
    
    Out[51]:
In [52]:
    
np.max(inh_mat)
    
    Out[52]:
In [53]:
    
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[::100].T, cmap=mpl.colors.ListedColormap(['w', 'r', 'b']))
ax.autoscale(axis='both', tight=True)
    
    
In [54]:
    
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[103010:103100].T, 
              cmap=mpl.colors.ListedColormap(['w', 'r', 'b']),
              vmin=0, vmax=2)
ax.set_yticks(np.arange(inh_mat.shape[1])+.5)
ax.set_yticklabels(np.arange(inh_mat.shape[1]))
ax.autoscale(axis='both', tight=True)
    
    
In [55]:
    
g_mat_het_phased_parent[103010:103100].displayall()
    
    
In [ ]:
    
    
In [ ]: