In [1]:
import sys
sys.path.insert(0, '../..')
import allel; print('allel', allel.__version__)
import h5py
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import zarr
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('talk')
import functools
In [2]:
%%html
<style type="text/css">
.container {
width: 98%;
font-size: 25px;
line-height: 1em;
}
.CodeMirror pre {
line-height: 1.2em;
font-size: 25px;
}
</style>
In [3]:
callset = zarr.group('/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3.1/variation/main/zarr2/ag1000g.phase1.ar3.pass/')
callset
Out[3]:
In [4]:
genotypes = allel.GenotypeChunkedArray(callset['3R/calldata/genotype'])
genotypes
Out[4]:
In [5]:
accessibility = h5py.File('/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3/accessibility/accessibility.h5', mode='r')
accessibility
Out[5]:
In [6]:
@functools.lru_cache(maxsize=None)
def compute_roh(sample_index, chrom, phet_roh=0.001, phet_nonroh=(0.0025, 0.01)):
genotypes = allel.GenotypeChunkedArray(callset[chrom]['calldata/genotype'])
gv = genotypes[:, sample_index]
is_accessible = accessibility[chrom]['is_accessible'][:]
pos = callset[chrom]['variants/POS'][:]
df_roh, froh = allel.roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), is_accessible=is_accessible)
return df_roh, froh
In [7]:
def plot_roh(sample_index, chrom, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), window_size=2000):
genotypes = allel.GenotypeChunkedArray(callset[chrom]['calldata/genotype'])
gv = genotypes[:, sample_index]
# compute windowed heterozygosity
is_het = gv.is_het()
pos = callset[chrom]['variants/POS'][:]
is_accessible = accessibility[chrom]['is_accessible'][:]
windows = allel.equally_accessible_windows(is_accessible, window_size)
wn, _, _ = allel.stats.windowed_statistic(pos, values=is_het, statistic=np.sum, windows=windows)
# compute roh
df_roh, froh = compute_roh(sample_index, chrom, phet_roh=phet_roh, phet_nonroh=phet_nonroh)
# plotting setup
fig, ax = plt.subplots(figsize=(40, 4))
sns.despine(ax=ax, offset=10)
# plot heterozygosity
y = wn / window_size
x = np.mean(windows, axis=1)
ax.plot(x, y, linewidth=.5)
ax.set_ylim(-0.01, 0.03)
ax.set_yticks(np.arange(0, 0.03, 0.005))
ax.set_xlim(0, is_accessible.size)
ax.set_xlabel('position')
ax.set_ylabel('heterozygosity')
# plot roh
xranges = np.column_stack([df_roh.start, df_roh.length])
yrange = (-.008, 0.006)
ax.broken_barh(xranges, yrange, facecolor='gray', linewidth=.5, edgecolor='k')
ax.text(1, 1, '$F_{ROH}$: %.4f\n$ROH$ count: %s' % (froh, len(df_roh)), transform=ax.transAxes, ha='right', va='top')
# plot histogram of roh sizes
x = df_roh.length
fig, ax = plt.subplots(figsize=(10, 4))
sns.despine(ax=ax, offset=10)
ax.hist(x, bins=np.logspace(2, 8, 30))
ax.set_xscale('log')
ax.set_xlabel('$ROH$ length')
ax.set_ylabel('frequency')
return df_roh
In [8]:
df = plot_roh(300, '3R')
In [9]:
df
Out[9]:
In [10]:
plot_roh(301, '3R');
In [11]:
plot_roh(302, '3R');
In [12]:
plot_roh(303, '3R');
In [13]:
plot_roh(304, '3R');
In [14]:
plot_roh(305, '3R');
In [15]:
plot_roh(0, '3R');
In [16]:
plot_roh(700, '3R');
In [17]:
plot_roh(600, '3R');
In [18]:
plot_roh(500, '3R');
In [19]:
plot_roh(400, '3R');
In [ ]: