In [1]:
import sys
sys.path.insert(0, '../..')
import allel; print('allel', allel.__version__)
import h5py
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
callset_dir = '/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3/variation/crosses/ar3/hdf5/'
In [3]:
!ls -lh {callset_dir}
In [4]:
chrom = '2L'
callset = h5py.File(
os.path.join(callset_dir, 'ag1000g.crosses.phase1.ar3sites.pass.%s.h5' % chrom),
mode='r'
)
callset
Out[4]:
In [5]:
samples = list(callset[chrom]['samples'][:])
samples[:10]
Out[5]:
In [6]:
samples_29_2 = samples[:22]
samples_29_2
Out[6]:
In [7]:
genotype = allel.GenotypeChunkedArray(callset[chrom]['calldata']['genotype'])
genotype
Out[7]:
In [8]:
# take samples from a single cross
samples_cross = [samples.index(s) for s in samples_29_2]
genotype_cross = genotype.take(samples_cross, axis=1)
genotype_cross
Out[8]:
In [9]:
pos = callset[chrom]['variants/POS'][:]
pos
Out[9]:
In [10]:
ac_cross = genotype_cross.count_alleles(max_allele=3)[:]
ac_cross
Out[10]:
In [11]:
genotype_cross_seg = genotype_cross.compress(ac_cross.is_segregating(), axis=0)
genotype_cross_seg
Out[11]:
In [12]:
pos_seg = pos.compress(ac_cross.is_segregating(), axis=0)
In [13]:
genotype_phased = allel.stats.phase_by_transmission(genotype_cross_seg, window_size=100)
genotype_phased
Out[13]:
In [14]:
genotype_phased.is_phased
Out[14]:
In [15]:
# check how many genotypes got phased
is_phased = genotype_phased.is_phased
np.count_nonzero(is_phased), is_phased.size
Out[15]:
In [16]:
def plot_inheritance(g, parent, plot=True):
# paint inheritance
inh = allel.stats.paint_transmission(
g[:, parent],
np.column_stack([
g[:, parent], # include parent
g[:, 2:, parent]
])
)
# fix where phase is unknown
inh[:, 0:2][~g.is_phased[:, parent]] = 0
inh[:, 2:][~g.is_phased[:, 2:]] = 0
# take a look...
if plot:
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh.T,
cmap=mpl.colors.ListedColormap(['gray', 'r', 'b', 'orange', 'g', 'k', 'w', 'w']),
vmin=0, vmax=7)
ax.set_yticks(np.arange(inh.shape[1])+.5)
ax.set_yticklabels(np.arange(inh.shape[1]))
ax.autoscale(axis='both', tight=True)
return inh
In [17]:
parent_is_het = genotype_cross_seg[:, :2].is_het()
parent_is_het
Out[17]:
In [18]:
plot_inheritance(genotype_phased[::100], parent=0);
In [19]:
flt = parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=0);
In [20]:
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=0);
In [21]:
flt = parent_is_het[:, 0] & ~parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][100020:100070], parent=0)
plt.show()
genotype_phased[flt][100020:100070].displayall()
In [22]:
plot_inheritance(genotype_phased[::100], parent=1);
In [23]:
flt = parent_is_het[:, 1]
plot_inheritance(genotype_phased[flt][::100], parent=1);
In [24]:
flt = parent_is_het[:, 1] & ~parent_is_het[:, 0]
plot_inheritance(genotype_phased[flt][::100], parent=1);
In [25]:
inh = plot_inheritance(genotype_phased, parent=0, plot=False)
inh
Out[25]:
In [26]:
states = {allel.INHERIT_PARENT1, allel.INHERIT_PARENT2}
In [27]:
switch_points, transitions, observations = allel.opt.stats.state_transitions(inh[:, 0], states=states)
In [28]:
switch_points
Out[28]:
In [29]:
transitions
Out[29]:
In [30]:
observations
Out[30]:
In [31]:
df_switches = allel.tabulate_state_transitions(inh[:, 0], states=states)
df_switches
Out[31]:
In [32]:
df_switches = allel.tabulate_state_transitions(inh[:, 1], states=states)
df_switches
Out[32]:
In [33]:
df_switches = allel.tabulate_state_transitions(inh[:, 2], states=states)
print(df_switches.shape)
df_switches.head()
Out[33]:
In [34]:
df_switches = allel.tabulate_state_transitions(inh[:, 3], states=states)
print(df_switches.shape)
df_switches.head()
Out[34]:
In [35]:
df_blocks = allel.tabulate_state_blocks(inh[:, 0], states=states, pos=pos_seg)
df_blocks
Out[35]:
In [36]:
df_blocks = allel.tabulate_state_blocks(inh[:, 1], states=states, pos=pos_seg)
df_blocks
Out[36]:
In [37]:
df_blocks = allel.tabulate_state_blocks(inh[:, 3], states=states, pos=pos_seg)
print(df_blocks.shape)
df_blocks.head()
Out[37]:
In [38]:
pos_seg.shape
Out[38]:
In [39]:
import matplotlib.pyplot as plt
%matplotlib inline
In [40]:
plt.hist(df_blocks.length_min, bins=np.logspace(1, 8, 40))
plt.xscale('log');
In [41]:
# take only the parental genotypes
genotype_cross_parents = genotype_cross.take([0, 1], axis=1)[:]
genotype_cross_parents
Out[41]:
In [42]:
# find heterozygous genotypes in the parents
is_het_parents = genotype_cross_parents.is_het()
is_het_parents
Out[42]:
In [43]:
# select variants where maternal genotype is het
g_mat_het = genotype_cross.compress(is_het_parents[:, 0] & ~is_het_parents[:, 1])[:]
g_mat_het
Out[43]:
In [44]:
%time _ = allel.stats.phase_progeny_by_transmission(g_mat_het)
In [45]:
# step 1 - phase progeny by transmission
g_mat_het_phased = allel.stats.phase_progeny_by_transmission(g_mat_het)
g_mat_het_phased
Out[45]:
In [46]:
%%time
_ = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)
In [47]:
# step 2 - phase mother by transmission
g_mat_het_phased_parent = allel.stats.phase_parents_by_transmission(g_mat_het_phased, window_size=10)
In [48]:
g_mat_het_phased_parent
Out[48]:
In [49]:
inh_mat = allel.stats.paint_transmission(
g_mat_het_phased_parent[:, 0],
np.column_stack([
g_mat_het_phased_parent[:, 0], # include parent
g_mat_het_phased_parent[:, 2:, 0]
])
)
# fix where phase is unknown
inh_mat[~g_mat_het_phased_parent.is_phased[:, 0]] = 0
inh_mat[~g_mat_het_phased_parent.is_phased] = 0
inh_mat
Out[49]:
In [50]:
# check how many genotypes got phased
is_phased = g_mat_het_phased_parent.is_phased
np.count_nonzero(is_phased), is_phased.size
Out[50]:
In [51]:
np.min(inh_mat)
Out[51]:
In [52]:
np.max(inh_mat)
Out[52]:
In [53]:
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[::100].T, cmap=mpl.colors.ListedColormap(['w', 'r', 'b']))
ax.autoscale(axis='both', tight=True)
In [54]:
# take a look...
fig, ax = plt.subplots(figsize=(10, 6))
ax.pcolormesh(inh_mat[103010:103100].T,
cmap=mpl.colors.ListedColormap(['w', 'r', 'b']),
vmin=0, vmax=2)
ax.set_yticks(np.arange(inh_mat.shape[1])+.5)
ax.set_yticklabels(np.arange(inh_mat.shape[1]))
ax.autoscale(axis='both', tight=True)
In [55]:
g_mat_het_phased_parent[103010:103100].displayall()
In [ ]:
In [ ]: