This notebook has analysis of haplotype divergence and homozygosity in moving windows between clusters.

Setup data


In [1]:
%run setup.ipynb
%matplotlib inline
%config InlineBackend.figure_formats = {'retina', 'png'}
rcParams['figure.dpi'] = 120
rcParams['figure.facecolor'] = 'w'
import hapclust



In [2]:
# define the gene region
region = 'PARA'
region_vgsc = '2L', 2358158, 2431617
pos_kdr_s = 2422651
pos_kdr_f = 2422652

In [3]:
# load data
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
ann = callset_haps['ANN']

In [4]:
# define region we're going to analyse
loc_region = pos.locate_range(0, 5000000)
pos_phased_region = pos[loc_region]
pos_phased_region


Out[4]:
<SortedIndex shape=(281834,) dtype=int32>
01234...281829281830281831281832281833
4468844691447324473644756...49965634996572499657349965794996582

In [5]:
ann_phased_region = ann[loc_region]
ann_phased_region


Out[5]:
array([(b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'), ...,
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.'),
       (b'intergenic_region', b'.', b'.')],
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14'), ('HGVS_c', 'S12')])

In [6]:
haps_region = haps[loc_region]
haps_region


Out[6]:
<HaplotypeArray shape=(281834, 1530) dtype=int8>
01234...15251526152715281529
000000...00000
100000...00000
200000...00000
......
28183100000...00000
28183200000...00000
28183300000...00000

In [7]:
# perform allele count - needed to locate singletons
ac_phased_region = haps_region.count_alleles(max_allele=1)

In [8]:
# define types of variants to include in EHH analysis - should be mostly neutral
loc_type_neutral = ((ann_phased_region['Annotation'] == b'intergenic_region') | 
                    (ann_phased_region['Annotation'] == b'intron_variant') |
                    (ann_phased_region['Annotation'] == b'downstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'upstream_gene_variant') |
                    (ann_phased_region['Annotation'] == b'synonymous_variant') |
                    (ann_phased_region['Annotation'] == b'3_prime_UTR_variant') |
                    (ann_phased_region['Annotation'] == b'5_prime_UTR_variant') 
                    )
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape


Out[8]:
(268259, (281834,))

In [9]:
# locate low frequency variants - will exclude from homozygosity analysis
loc_hf = ac_phased_region.min(axis=1) > 1
print(np.count_nonzero(loc_hf))

# these are the variants to use for haplotype homozygosity
loc_hh = loc_type_neutral & loc_hf
print(np.count_nonzero(loc_hh), loc_hh.shape)


193035
185122 (281834,)

In [10]:
haps_hh = haps_region[loc_hh]
haps_hh


Out[10]:
<HaplotypeArray shape=(185122, 1530) dtype=int8>
01234...15251526152715281529
000000...00000
100000...00000
211111...11111
......
18511900000...00000
18512000000...00000
18512100000...00000

In [11]:
pos_hh = pos_phased_region[loc_hh]
pos_hh


Out[11]:
<SortedIndex shape=(185122,) dtype=int32>
01234...185117185118185119185120185121
4469144732447564476744872...49965634996572499657349965794996582

In [12]:
#grab the npy array made in ag1000g paper 1 vgsc notebook
p1clus = np.load('../data/hierarchical_cluster_membership.npy')

#make this into a dictionary

p1list = [a.decode("utf-8") for a in p1clus]
p1list = np.asarray(p1list)
nlist = list(np.unique(p1list))
vgsc_haplogroups = {n: tuple(sorted(np.nonzero(p1list == n)[0])) for n in nlist}

for k, v in vgsc_haplogroups.items():
    print(repr(k), len(v))


'' 362
'F1' 453
'F2' 14
'F3' 38
'F4' 42
'F5' 196
'S1' 108
'S2' 79
'S3' 165
'S4' 37
'S5' 36

Setup analysis functions


In [13]:
%%cython
# cython: wraparound=False
# cython: boundscheck=False


import numpy as np 
cimport numpy as np
import scipy


cdef int pair_diffs(np.int8_t[:] h1, np.int8_t[:] h2):
    cdef:
        int i, n, v
    n = h1.shape[0]
    v = 0
    for i in range(n):
        v += (h1[i] != h2[i])
    return v
    


def pairwise_diffs_between(np.int8_t[:, :] h1, np.int8_t[:, :] h2):
    assert h1.ndim == h2.ndim == 2
    cdef:
        int i, j, n1, n2
        np.int32_t[:, :] out
    n1 = h1.shape[1]
    n2 = h2.shape[1]
    out = np.empty((n1, n2), dtype='i4')
    for i in range(n1):
        for j in range(n2):
            out[i, j] = pair_diffs(h1[:, i], h2[:, j])
    return np.asarray(out)

In [14]:
@functools.lru_cache(maxsize=None)
def scan_pdiffs_within(hap_ixs, startp=1500000, stopp=3500000, window_size=1000, window_step=200):
    
    # setup data
    loc = pos_hh.locate_range(startp, stopp)
    p = pos_hh[loc]
    h = np.asarray(haps_hh[loc].take(hap_ixs, axis=1), order='F')
    
    # compute in moving windows
    midpoints = allel.moving_statistic(values=p, statistic=np.mean, size=window_size, step=window_step)
    windows = list(allel.stats.window.index_windows(h, size=window_size, step=window_step, start=0, stop=None))
    result = np.array([
        allel.pairwise_distance(h[i:j], metric='hamming') * window_size
        for i, j in windows
    ], dtype=int)

    # compress result to save memory
    return midpoints, zarr.array(result)

In [15]:
@functools.lru_cache(maxsize=None)
def scan_pdiffs_between(hap_ixs1, hap_ixs2, startp=1500000, stopp=3500000, window_size=1000, window_step=200):
    
    # setup data
    loc = pos_hh.locate_range(startp, stopp)
    p = pos_hh[loc]
    h1 = np.asarray(haps_hh[loc].take(hap_ixs1, axis=1), order='F')
    h2 = np.asarray(haps_hh[loc].take(hap_ixs2, axis=1), order='F')
    
    # compute in moving windows
    midpoints = allel.moving_statistic(values=p, statistic=np.mean, size=window_size, step=window_step)
    windows = list(allel.stats.window.index_windows(h1, size=window_size, step=window_step, start=0, stop=None))
    result = np.array([
        pairwise_diffs_between(h1[i:j], h2[i:j]).flatten()
        for i, j in windows
    ], dtype=int)

    # compress result to save memory
    return midpoints, zarr.array(result)

In [16]:
palette = sns.color_palette()
region_vgsc = SeqFeature('2L', 2358158, 2431617)


def plot_pdiffs_within(hap_ixs, startp=1500000, stopp=3500000, window_size=1000, window_step=200, n_boot=10, ax=None):
    
    # get data
    midpoints, pdiffs = scan_pdiffs_within(hap_ixs, startp=startp, stopp=stopp, window_size=window_size, window_step=window_step)
    pdiffs = pdiffs[:]

    # broadcast out X coords
    x = np.empty(pdiffs.shape, dtype=int)
    x[:] = midpoints[:, None].astype(int)

    # plot
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 3))
        sns.despine(ax=ax, offset=5)
    ax.axvline(region_vgsc.start, linestyle='--', color='k')
    ax.axvline(region_vgsc.end, linestyle='--', color='k')
    mx = x[:, 0]
    my = pdiffs.mean(axis=1)
    ax.plot(mx, my, color='k')
    sns.regplot(x=x.flatten(), y=pdiffs.flatten(), n_boot=n_boot, scatter=True, fit_reg=False, 
                x_estimator=np.mean, ax=ax, color='k', scatter_kws=dict(s=10))
    ax.set_ylim(bottom=-1)
    ax.set_ylabel('Mean no. SNP differences')
    ax.set_xlabel('Position (bp)')
    ax.set_xticklabels(['{:,}'.format(int(x)) for x in ax.get_xticks()])
    ax.text((region_vgsc.start + region_vgsc.end)/2, ax.get_ylim()[1], 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
    if fig is not None:
        fig.tight_layout()
    
    
def plot_hh_within(hap_ixs, startp=1500000, stopp=3500000, window_size=1000, window_step=200, n_boot=0, ax=None, threshold=1, markersize=5, label=None):
    
    # get data
    midpoints, pdiffs = scan_pdiffs_within(hap_ixs, startp=startp, stopp=stopp, window_size=window_size, window_step=window_step)
    pdiffs = pdiffs[:]

    # broadcast out X coords
    x = np.empty(pdiffs.shape, dtype=int)
    x[:] = midpoints[:, None].astype(int)

    # plot
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 3))
        sns.despine(ax=ax, offset=5)
    ax.axvline(region_vgsc.start, linestyle='--', color='k')
    ax.axvline(region_vgsc.end, linestyle='--', color='k')
    mx = x[:, 0]
    my = (pdiffs <= threshold).sum(axis=1) / pdiffs.shape[1]
    ax.plot(mx, my, color='k', label=label)
    sns.regplot(x=x.flatten(), y=pdiffs.flatten(), n_boot=n_boot, scatter=True, fit_reg=False, 
                x_estimator=lambda v: (np.sum(v <= threshold)) / len(v), ax=ax, color='k',
                scatter_kws=dict(s=markersize))
    ax.set_ylim(-0.05, 1.05)
    ax.set_xticklabels(['{:.1f}'.format(x/1e6) for x in ax.get_xticks()])
    if fig is not None:
        ax.text((region_vgsc.start + region_vgsc.end)/2, ax.get_ylim()[1], 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
        ax.set_ylabel('Haplotype homozygosity')
        ax.set_xlabel('Position (Mbp)')
        fig.tight_layout()
    
    
def plot_hh_between(hap_ixs1, hap_ixs2, startp=1500000, stopp=3500000, window_size=1000, window_step=200, n_boot=0, ax=None, threshold=1, markersize=5, label=None):
    
    # get data
    midpoints, pdiffs = scan_pdiffs_between(hap_ixs1, hap_ixs2, startp=startp, stopp=stopp, window_size=window_size, window_step=window_step)
    pdiffs = pdiffs[:]

    # broadcast out X coords
    x = np.empty(pdiffs.shape, dtype=int)
    x[:] = midpoints[:, None].astype(int)

    # plot
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 3))
        sns.despine(ax=ax, offset=5)
    ax.axvline(region_vgsc.start, linestyle='--', color='k')
    ax.axvline(region_vgsc.end, linestyle='--', color='k')
    mx = x[:, 0]
    my = (pdiffs <= threshold).sum(axis=1) / pdiffs.shape[1]
    ax.plot(mx, my, color='k', label=label)
    sns.regplot(x=x.flatten(), y=pdiffs.flatten(), n_boot=n_boot, scatter=True, fit_reg=False, 
                x_estimator=lambda v: (np.sum(v <= threshold)) / len(v), ax=ax, color='k',
                scatter_kws=dict(s=markersize))
    ax.set_ylim(-0.05, 1.05)
    ax.set_xticklabels(['{:.1f}'.format(x/1e6) for x in ax.get_xticks()])
    if fig is not None:
        ax.text((region_vgsc.start + region_vgsc.end)/2, ax.get_ylim()[1], 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
        ax.set_ylabel('Haplotype homozygosity')
        ax.set_xlabel('Position (Mbp)')
        fig.tight_layout()

In [17]:
# sns.set_style('white')
# sns.set_style('ticks')

Make plots


In [18]:
def fig_S_comparisons():

    window_size = 1000
    window_step = 500
    fig, axs = plt.subplots(nrows=5, ncols=3, sharex=True, sharey=True, figsize=(8, 7))
    for i in range(5):
        ax = axs[i, 0]
        g = 'S%s' % (i + 1)
        plot_hh_within(vgsc_haplogroups[g], window_size=window_size, window_step=window_step, ax=ax, label=g)
        ax.set_title(g, loc='left')
        ax.set_yticks([0, 1])
    #     ax.text(.98, .95, g, ha='right', va='top', transform=ax.transAxes, fontsize=8, fontweight='bold')

    combinations = list(itertools.combinations(range(1, 6), 2))
    axes_ixs = [(i, j) for j in range(1, 3) for i in range(5)]
    for (c1, c2), axes_ix in zip(combinations, axes_ixs):
        ax = axs[axes_ix]
        g1 = 'S%s' % c1
        g2 = 'S%s' % c2
        label = '%s vs %s' % (g1, g2)
        plot_hh_between(vgsc_haplogroups[g1], vgsc_haplogroups[g2], window_size=window_size, window_step=window_step, ax=ax, label=label)
        ax.set_title(label, loc='left')
        ax.set_yticks([0, 1])
    #     ax.text(.98, .95, label, ha='right', va='top', transform=ax.transAxes, fontsize=8, fontweight='bold')

    # annotations
    for i in range(3):
        axs[0, i].text((region_vgsc.start + region_vgsc.end)/2, 1.1, 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
    axs[2, 0].set_ylabel('Haplotype homozygosity')
    axs[-1, 1].set_xlabel('Chromosome arm 2L position (Mbp)')

    fig.tight_layout()
    fig.savefig('../artwork/mhh_S.pdf', bbox_inches='tight')
    
fig_S_comparisons()


/home/chris/Git/agam-vgsc-report/deps/conda/envs/agam-vgsc-report/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: generator 'index_windows' raised StopIteration
  # This is added back by InteractiveShellApp.init_path()
/home/chris/Git/agam-vgsc-report/deps/conda/envs/agam-vgsc-report/lib/python3.6/site-packages/ipykernel_launcher.py:12: DeprecationWarning: generator 'index_windows' raised StopIteration
  if sys.path[0] == '':

In [19]:
def fig_F_comparisons():
    window_size = 1000
    window_step = 500
    fig, axs = plt.subplots(nrows=5, ncols=3, sharex=True, sharey=True, figsize=(8, 7))
    for i in range(5):
        ax = axs[i, 0]
        g = 'F%s' % (i + 1)
        plot_hh_within(vgsc_haplogroups[g], window_size=window_size, window_step=window_step, ax=ax, label=g)
        ax.set_title(g, loc='left')
        ax.set_yticks([0, 1])
    #     ax.text(.98, .95, g, ha='right', va='top', transform=ax.transAxes, fontsize=8, fontweight='bold')

    combinations = list(itertools.combinations(range(1, 6), 2))
    axes_ixs = [(i, j) for j in range(1, 3) for i in range(5)]
    for (c1, c2), axes_ix in zip(combinations, axes_ixs):
        ax = axs[axes_ix]
        g1 = 'F%s' % c1
        g2 = 'F%s' % c2
        label = '%s vs %s' % (g1, g2)
        plot_hh_between(vgsc_haplogroups[g1], vgsc_haplogroups[g2], window_size=window_size, window_step=window_step, ax=ax, label=label)
        ax.set_title(label, loc='left')
        ax.set_yticks([0, 1])
    #     ax.text(.98, .95, label, ha='right', va='top', transform=ax.transAxes, fontsize=8, fontweight='bold')

    # annotations
    for i in range(3):
        axs[0, i].text((region_vgsc.start + region_vgsc.end)/2, 1.1, 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
    axs[2, 0].set_ylabel('Haplotype homozygosity')
    axs[-1, 1].set_xlabel('Chromosome arm 2L position (Mbp)')

    fig.tight_layout()
    fig.savefig('../artwork/mhh_F.pdf', bbox_inches='tight')

fig_F_comparisons()


/home/chris/Git/agam-vgsc-report/deps/conda/envs/agam-vgsc-report/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: generator 'index_windows' raised StopIteration
  # This is added back by InteractiveShellApp.init_path()
/home/chris/Git/agam-vgsc-report/deps/conda/envs/agam-vgsc-report/lib/python3.6/site-packages/ipykernel_launcher.py:12: DeprecationWarning: generator 'index_windows' raised StopIteration
  if sys.path[0] == '':

Fst analysis


In [20]:
region_vgsc


Out[20]:
<SeqFeature 2L:2358158-2431617>

In [21]:
@functools.lru_cache(maxsize=None)
def scan_fst(hap_ixs1, hap_ixs2, startp=2000000, stopp=3000000):
    
    # setup data
    loc = pos_hh.locate_range(startp, stopp)
    p = pos_hh[loc]
    h1 = haps_hh[loc].take(hap_ixs1, axis=1)
    h2 = haps_hh[loc].take(hap_ixs2, axis=1)
    ac1 = h1.count_alleles(max_allele=3)
    ac2 = h2.count_alleles(max_allele=3)
    ac = ac1 + ac2
    loc_seg = ac.is_segregating()
    p, ac1, ac2 = p[loc_seg], ac1[loc_seg], ac2[loc_seg]
    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    return p, fst

In [22]:
def fig_fst_comparisons(prefix, startp, stopp, window_size=10, figsize=(8, 7), tick_snps=False):

    fig, axs = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True, figsize=figsize)

    combinations = list(itertools.combinations(range(1, 6), 2))
    axes_ixs = [(i, j) for j in range(2) for i in range(5)]
    for (c1, c2), axes_ix in zip(combinations, axes_ixs):
        ax = axs[axes_ix]
        g1 = '%s%s' % (prefix, c1)
        g2 = '%s%s' % (prefix, c2)
        label = '%s vs %s' % (g1, g2)
        p, fst = scan_fst(vgsc_haplogroups[g1], vgsc_haplogroups[g2], startp=startp, stopp=stopp)
        ax.plot(p, fst, marker='o', linestyle=' ', mfc='none', mec='k', markersize=2)
        wx = allel.moving_statistic(p, statistic=np.mean, size=window_size)
        wy = allel.moving_statistic(fst, statistic=np.mean, size=window_size)
        ax.plot(wx, wy, linestyle='-', color='k')
        ax.axvline(region_vgsc.start, linestyle='--', color='k')
        ax.axvline(region_vgsc.end, linestyle='--', color='k')
        ax.axvline(2422651, linestyle='-', color='r')
        ax.set_title(label, loc='left')
        ax.set_yticks([0, 1])
        if tick_snps:
            ax.set_xticks(p)
            ax.set_xticklabels(p, rotation=90)
        ax.set_xlim(startp, stopp)
    #     ax.text(.98, .95, label, ha='right', va='top', transform=ax.transAxes, fontsize=8, fontweight='bold')

    # annotations
#     for i in range(2):
#         axs[0, i].text((region_vgsc.start + region_vgsc.end)/2, 1.1, 'Vgsc', ha='center', va='bottom', fontstyle='italic', fontsize=8)
    axs[2, 0].set_ylabel('$F_{ST}$')
    axs[-1, 1].set_xlabel('Chromosome arm 2L position (Mbp)')

    fig.tight_layout()
#     fig.savefig('../artwork/mhh_F.pdf', bbox_inches='tight')

In [23]:
region_vgsc


Out[23]:
<SeqFeature 2L:2358158-2431617>

In [24]:
fig_fst_comparisons('S', startp=region_vgsc.start+45000, stopp=region_vgsc.end-5000, figsize=(34, 7), tick_snps=True)



In [25]:
# 8 SNPs differentiate S4, S5 in this region
2422250 - 2408423


Out[25]:
13827

In [26]:
fig_fst_comparisons('S', startp=region_vgsc.start - 70000, stopp=region_vgsc.end + 70000)



In [27]:
fig_fst_comparisons('S', startp=region_vgsc.start - 170000, stopp=region_vgsc.end + 170000)



In [28]:
fig_fst_comparisons('S', startp=region_vgsc.start - 500000, stopp=region_vgsc.end + 500000, window_size=20)



In [29]:
fig_fst_comparisons('F', startp=region_vgsc.start - 70000, stopp=region_vgsc.end + 70000)