In [1]:
import sys
sys.path.insert(0, '../..')
import allel; print('allel', allel.__version__)
import h5py
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import zarr
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('talk')
import functools


allel 1.0.0b6.dev0

In [2]:
%%html
<style type="text/css">
.container {
    width: 98%;
    font-size: 25px;
    line-height: 1em;
}
.CodeMirror pre {
    line-height: 1.2em;
    font-size: 25px;
}
</style>



In [3]:
callset = zarr.group('/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3.1/variation/main/zarr2/ag1000g.phase1.ar3.pass/')
callset


Out[3]:
Group(/, 6)
  arrays: 1; samples
  groups: 5; 2L, 2R, 3L, 3R, X
  store: DirectoryStore

In [4]:
genotypes = allel.GenotypeChunkedArray(callset['3R/calldata/genotype'])
genotypes


Out[4]:
<GenotypeChunkedArray shape=(13167162, 765, 2) dtype=int8 chunks=(157284, 51, 2) nbytes=18.8G cbytes=1.1G cratio=17.7 compression=blosc compression_opts={'shuffle': 0, 'clevel': 5, 'cname': 'lz4'} values=zarr.core.Array>
01234...760761762763764
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
131671590/00/00/00/00/0...0/00/00/00/00/0
131671600/00/00/00/00/0...0/00/00/00/00/0
131671610/00/00/00/00/0...0/00/00/00/00/0

In [5]:
accessibility = h5py.File('/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3/accessibility/accessibility.h5', mode='r')
accessibility


Out[5]:
<HDF5 file "accessibility.h5" (mode r)>

In [6]:
@functools.lru_cache(maxsize=None)
def compute_roh(sample_index, chrom, phet_roh=0.001, phet_nonroh=(0.0025, 0.01)):
    genotypes = allel.GenotypeChunkedArray(callset[chrom]['calldata/genotype'])
    gv = genotypes[:, sample_index]
    is_accessible = accessibility[chrom]['is_accessible'][:]
    pos = callset[chrom]['variants/POS'][:]
    df_roh, froh = allel.roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), is_accessible=is_accessible)
    return df_roh, froh

In [7]:
def plot_roh(sample_index, chrom, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), window_size=2000):

    genotypes = allel.GenotypeChunkedArray(callset[chrom]['calldata/genotype'])
    gv = genotypes[:, sample_index]

    # compute windowed heterozygosity
    is_het = gv.is_het()
    pos = callset[chrom]['variants/POS'][:]
    is_accessible = accessibility[chrom]['is_accessible'][:]
    windows = allel.equally_accessible_windows(is_accessible, window_size)
    wn, _, _ = allel.stats.windowed_statistic(pos, values=is_het, statistic=np.sum, windows=windows)

    # compute roh
    df_roh, froh = compute_roh(sample_index, chrom, phet_roh=phet_roh, phet_nonroh=phet_nonroh)
    
    # plotting setup
    fig, ax = plt.subplots(figsize=(40, 4))
    sns.despine(ax=ax, offset=10)

    # plot heterozygosity
    y = wn / window_size
    x = np.mean(windows, axis=1)
    ax.plot(x, y, linewidth=.5)
    ax.set_ylim(-0.01, 0.03)
    ax.set_yticks(np.arange(0, 0.03, 0.005))
    ax.set_xlim(0, is_accessible.size)
    ax.set_xlabel('position')
    ax.set_ylabel('heterozygosity')

    # plot roh
    xranges = np.column_stack([df_roh.start, df_roh.length])
    yrange = (-.008, 0.006)
    ax.broken_barh(xranges, yrange, facecolor='gray', linewidth=.5, edgecolor='k')

    ax.text(1, 1, '$F_{ROH}$: %.4f\n$ROH$ count: %s' % (froh, len(df_roh)), transform=ax.transAxes, ha='right', va='top')
    
    # plot histogram of roh sizes
    x = df_roh.length
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.despine(ax=ax, offset=10)
    ax.hist(x, bins=np.logspace(2, 8, 30))
    ax.set_xscale('log')
    ax.set_xlabel('$ROH$ length')
    ax.set_ylabel('frequency')
    
    return df_roh

In [8]:
df = plot_roh(300, '3R')



In [9]:
df


Out[9]:
start stop length is_marginal
0 1 11424744 11424744 True
1 13743171 14931009 1187839 False
2 16707790 16724602 16813 False
3 16757238 16763219 5982 False
4 17021406 17036802 15397 False
5 17064118 17076327 12210 False
6 17081574 17115222 33649 False
7 17310101 17316219 6119 False
8 18497685 36803361 18305677 False
9 37121693 37127716 6024 False
10 38051314 38133840 82527 False
11 42401560 42410949 9390 False
12 45128579 45135441 6863 False
13 45611618 45618690 7073 False
14 45821347 45831977 10631 False
15 45867586 45891079 23494 False
16 47588961 47642397 53437 False
17 48187057 48235907 48851 False
18 48471890 48481887 9998 False
19 49833187 49837717 4531 False
20 50613256 50617172 3917 False
21 52044699 52116362 71664 False
22 52617338 53041851 424514 False

In [10]:
plot_roh(301, '3R');



In [11]:
plot_roh(302, '3R');



In [12]:
plot_roh(303, '3R');



In [13]:
plot_roh(304, '3R');



In [14]:
plot_roh(305, '3R');



In [15]:
plot_roh(0, '3R');



In [16]:
plot_roh(700, '3R');



In [17]:
plot_roh(600, '3R');



In [18]:
plot_roh(500, '3R');



In [19]:
plot_roh(400, '3R');



In [ ]: