Shared setup

This notebook contains imports, constants and functions which are shared across all other notebooks containing analyses for both the main body of the paper and the supplementary information.

Imports


In [1]:
# python standard library
import sys
# ensure Python 3
assert sys.version_info[0:2] == (3, 5)
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
from bisect import bisect_left, bisect_right
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import re

In [2]:
# docker image
print('docker image', os.environ['DOCKER_IMAGE'])


docker image cggh/biipy:v1.6.0

In [3]:
%%HTML
<style type="text/css">
.container {
    width: 96%;
}
#maintoolbar {
    display: none;
}
#header-container {
    display: none;
}
#notebook {
    padding-top: 0;
}
</style>



In [4]:
# general purpose third party packages

import cython
#%load_ext Cython

import numpy as np
nnz = np.count_nonzero

import scipy
import scipy.stats
import scipy.spatial.distance
from scipy import stats
from scipy.spatial.distance import pdist, squareform

import numexpr

import h5py

import tables

import bcolz

import pandas

import IPython
from IPython.display import clear_output, display, HTML

import rpy2
import rpy2.robjects as ro
%reload_ext rpy2.ipython

import statsmodels

import sklearn
import sklearn.decomposition
import sklearn.manifold

import sh

import sqlalchemy

import pymysql

import psycopg2

import petl as etl
etl.config.display_index_header = True

import humanize
from humanize import naturalsize, intcomma, intword

In [5]:
# plotting setup
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns

sns.set_context('paper')
sns.set_style('white')
sns.set_style('ticks')

rcParams = plt.rcParams
rcParams['font.size'] = 9
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 9
rcParams['axes.linewidth'] = .5
rcParams['lines.linewidth'] = .5
rcParams['patch.linewidth'] = .5
rcParams['font.family'] = 'arial'
rcParams['ytick.direction'] = 'out'
rcParams['xtick.direction'] = 'out'
rcParams['savefig.jpeg_quality'] = 100
rcParams['savefig.dpi'] = 120
rcParams['lines.markeredgewidth'] = .5

golden_ratio = 1.61803398875

In [6]:
# bio third party packages
import Bio
import pyfasta
import pysam
import pysamstats
import petlx
import petlx.bio
import vcf
import vcfnp
import anhima
import allel

Utility functions


In [7]:
def log(*msg):
    msg = str(datetime.datetime.now()) + ' :: ' + ' '.join(map(str, msg))
    print(msg, file=sys.stderr) 
    sys.stderr.flush()
    
from IPython.display import display_html

def pre(msg):
    display_html('<pre style="line-height: 100%; font-size: .7em; display: inline-block; white-space: pre; background-color: #ff6; border: 0px solid #ddd; padding: 1px">' + str(msg).replace('<', '&lt;') + '</pre>', raw=True)
    
def html(content):
    display_html(content, raw=True)

Constants


In [8]:
CROSSES = '3d7_hb3', 'hb3_dd2', '7g8_gb4'
CHROMOSOMES = tuple('Pf3D7_{:02d}_v3'.format(n).encode('ascii') for n in range(1, 15))
PARENT_RUNS = 'ERR019061', 'ERR019054', 'ERR012788', 'ERR012840', 'ERR027099', 'ERR027100'
LABELS = {
    '3d7_hb3': '3D7 x HB3', 
    'hb3_dd2': 'HB3 x Dd2', 
    '7g8_gb4': '7G8 x GB4',
    'ERR019061': '3D7',
    'ERR019054': 'HB3(1)',
    'ERR012788': 'HB3(2)', 
    'ERR012840': 'Dd2',
    'ERR027099': '7G8',
    'ERR027100': 'GB4'
}

In [9]:
# N.B., change this if data are stored at a different location
BASE_DIR = '/data/plasmodium/pfalciparum/pf-crosses'
DATA_DIR = os.path.join(BASE_DIR, 'data')
RELEASE_DIR = os.path.join(DATA_DIR, 'release')
PUBLIC_DIR = os.path.join(DATA_DIR, 'public/20141022')

# public data
GATK_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.vcf.gz')
GATK_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.npz')
CORTEX_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.vcf.gz')
CORTEX_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.npz')
COMBINED_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.vcf.gz')
COMBINED_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.npz')
CORTEX_RAW_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.raw.vcf.gz')
BAM_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'bam/{run}.realigned.bam')
COVERAGE_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.coverage_ext_binned.300.{chrom}.npy')
CNV_TBL_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.txt')
CNV_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.{chrom}.npy')

# reference data
GENOME_DIR = os.path.join(DATA_DIR, 'genome/sanger/version3/September_2012')
FASTA_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.fa')
GFF_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.sorted.ascii.gff.gz')
fasta = pyfasta.Fasta(FASTA_FN)

In [10]:
dup_samples = {
    '3d7_hb3': ['C01/PG0062-C/ERR019070',
                'C02/PG0053-C/ERR019067',
                'C02/PG0055-C/ERR019066',
                'C02/PG0056-C/ERR019068'],
    'hb3_dd2': [],
    '7g8_gb4': ['AUD/PG0112-CW/ERR045639',
                'JC9/PG0111-C/ERR029409',
                'JE11/PG0100-CW/ERR045630',
                'JF6/PG0079-CW/ERR045637',
                'KB8/PG0104-CW/ERR045642',
                'LA10/PG0086-CW/ERR045629',
                'NIC/PG0095-C/ERR027107',
                'QF5/PG0078-CW/ERR045638',
                'XD8/PG0105-CW/ERR045628',
                'XF12/PG0102-CW/ERR045635'],
}


excessive_recomb_samples = {
    '3d7_hb3': [],
    'hb3_dd2': ['SC01/PG0025-C/ERR019045'],
    '7g8_gb4': ['D2/PG0094-CW/ERR045632'],
}

Numpy utilities


In [11]:
def interleave(a, b):
    c = np.empty((a.size + b.size,), dtype=a.dtype)
    c[0::2] = a
    c[1::2] = b
    return c


def unpack_callset(callset):
    V = callset['variants'].view(np.recarray)
    C = callset['calldata'].view(np.recarray)
    C2d = vcfnp.view2d(C).view(np.recarray)
    return V, C, C2d


def filter_variants(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=V)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        indices = np.nonzero(flt)[0]
        ret, exc, tot = indices.size, V.size-indices.size, V.size
        if verbose:
            log('filter variants: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'variants')
        V = np.take(V, indices, axis=0).view(np.recarray)
        C = np.take(C, indices, axis=0).view(np.recarray)
        return {'variants': V, 'calldata': C}


def filter_calls(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        C = C.copy()
        C2d = vcfnp.view2d(C).view(np.recarray)

        # filter calls
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=C2d)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        # only filter non-missing
        excl = (C2d.genotype >= 0) & ~flt

        tot, exc, ret = excl.size, np.count_nonzero(excl), np.count_nonzero(~excl)
        if verbose:
            log('filter calls: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'calls')

        C2d.genotype[excl] = -2
        return {'variants': V, 'calldata': C}    
    

def filter_samples(callset, inclusions=None, exclusions=None, verbose=True):
    if inclusions or exclusions:
        V, C, _ = unpack_callset(callset)
        samples = C.dtype.names
        if inclusions:
            exclusions = [s for s in samples if s not in inclusions]
        elif exclusions:
            inclusions = [s for s in samples if s not in exclusions]
        if verbose:
            log('filter samples: excluding', exclusions, 'including', inclusions)
        C = C[inclusions].copy()
        return {'variants': V, 'calldata': C}
    else:
        return callset
    
    
def filter_callset(callset, 
                   variant_filter=None, 
                   call_filter=None, 
                   sample_inclusions=None, 
                   sample_exclusions=None,
                   verbose=True):
    
    callset = filter_variants(callset, variant_filter, verbose=verbose)
    
    callset = filter_samples(callset, inclusions=sample_inclusions, exclusions=sample_exclusions, verbose=verbose)
    
    callset = filter_calls(callset, call_filter, verbose=verbose)
    
    return callset


def load_callset(callset_fn, 
                 variant_filter=None, 
                 call_filter=None, 
                 sample_inclusions=None, 
                 sample_exclusions=None,
                 verbose=True):
    
    if verbose:
        log('loading', callset_fn)
    callset = np.load(callset_fn)

    callset = filter_callset(callset, 
                             variant_filter=variant_filter, 
                             call_filter=call_filter, 
                             sample_inclusions=sample_inclusions,
                             sample_exclusions=sample_exclusions,
                             verbose=verbose)
    
    return callset


def filter_callsets(callsets,
                    crosses=CROSSES,
                    variant_filter=None, 
                    call_filter=None, 
                    sample_inclusions=dict(), 
                    sample_exclusions=dict(),
                    verbose=True):
    out = dict()
    for cross in callsets.keys():
        out[cross] = filter_callset(callsets[cross], 
                                    variant_filter=variant_filter,
                                    call_filter=call_filter,
                                    sample_inclusions=sample_inclusions.get(cross, None),
                                    sample_exclusions=sample_exclusions.get(cross, None),
                                    verbose=verbose)
    return out
    
    
def load_callsets(callset_fn_template,
                  crosses=CROSSES,
                  variant_filter=None, 
                  call_filter=None, 
                  sample_inclusions=dict(), 
                  sample_exclusions=dict(),
                  verbose=True):
    callsets = dict()
    for cross in CROSSES:
        callset_fn = callset_fn_template.format(cross=cross)
        callsets[cross] = load_callset(callset_fn, 
                                       variant_filter=variant_filter,
                                       call_filter=call_filter,
                                       sample_inclusions=sample_inclusions.get(cross, None),
                                       sample_exclusions=sample_exclusions.get(cross, None),
                                       verbose=verbose)
    return callsets
    
    
def locate_variants(CHROM, POS, chrom, start=None, stop=None):
    if isinstance(chrom, str):
        chrom = chrom.encode('ascii')
    start_index, stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[start_index:stop_index]
    if stop is not None:
        stop_index = start_index + bisect_right(POS, stop)
    if start is not None:
        start_index = start_index + bisect_left(POS, start)
    return start_index, stop_index


def min_haplen_calls(min_haplen):
    def _f(v, c, c2d):
        hl, hs, _ = haplotypes(v, c)
        return hl >= min_haplen
    return _f


def gatk_conf_calls(V, C, C2d):
    return C2d.GQ >= 99


def cortex_conf_calls(V, C, C2d):
    return C2d.GT_CONF >= 50


def combined_conf_calls(v, c, c2d):
    return ((v.set[:,np.newaxis] == b'Cortex') & (c2d.GT_CONF >= 50)
            | (v.set[:,np.newaxis] != b'Cortex') & (c2d.GQ >= 99))

Tables


In [12]:
def display_with_nrows(tbl, n=5, caption=''):
    caption += ' (%s rows)' % tbl.nrows()
    tbl.display(n, caption=caption)

In [13]:
CACHE_DIR = os.path.join(PUBLIC_DIR, 'cache')

In [14]:
def etlcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        etl.topickle(f(), fn)
    return etl.frompickle(fn)
    
    
def pdcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        obj = f()
        pandas.to_pickle(obj, fn)
        return obj
    else:
        return pandas.read_pickle(fn)
    
    
def nocache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if os.path.exists(fn):
        os.remove(fn)
    return f()

Sample metadata


In [15]:
tbl_samples = (etl
    .fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
    .addfield('ID', lambda row: '%s/%s/%s' % (row.clone, row.sample, row.run))
)
run2id = tbl_samples.lookupone('run', 'ID')
tbl_samples


Out[15]:
0|cross 1|clone 2|sample 3|run 4|instrument 5|coverage 6|ID
3d7_hb3 3D7 PG0051-C ERR019061 Illumina Genome Analyzer II 122X 3D7/PG0051-C/ERR019061
3d7_hb3 C01 PG0065-C ERR019064 Illumina Genome Analyzer II 163X C01/PG0065-C/ERR019064
3d7_hb3 C01 PG0062-C ERR019070 Illumina Genome Analyzer II 108X C01/PG0062-C/ERR019070
3d7_hb3 C02 PG0055-C ERR019066 Illumina Genome Analyzer II 102X C02/PG0055-C/ERR019066
3d7_hb3 C02 PG0053-C ERR019067 Illumina Genome Analyzer II 73X C02/PG0053-C/ERR019067

...

Genome region accessibility classifications


In [16]:
@etlcache
def tbl_regions_1b():
    return (etl
        .fromtsv(os.path.join(PUBLIC_DIR, 'regions-20130225.onebased.txt'))
        .pushheader(['chrom', 'start', 'stop', 'type'])
        .convertnumbers()
        .prefixheader('region_')
        .addfield('region_size', lambda row: row.region_stop - row.region_start + 1)
    )
tbl_regions_1b


Out[16]:
0|region_chrom 1|region_start 2|region_stop 3|region_type 4|region_size
Pf3D7_01_v3 1 27336 SubtelomericRepeat 27336
Pf3D7_01_v3 27337 92900 SubtelomericHypervariable 65564
Pf3D7_01_v3 92901 457931 Core 365031
Pf3D7_01_v3 457932 460311 Centromere 2380
Pf3D7_01_v3 460312 575900 Core 115589

...

Genome annotations


In [17]:
@etlcache
def tbl_features():
    return (etl
        .fromgff3(GFF_FN)
        .unpackdict('attributes', ['ID', 'Parent', 'Name', 'previous_systematic_id'])
        .rename({'seqid': 'chrom', 'end': 'stop', 'ID': 'id', 'Parent': 'parent_id', 'Name': 'name', 'previous_systematic_id': 'previous_id'})
        .cutout('source', 'score', 'phase')
        .addfield('length', lambda row: row.stop - row.start, index=4)
        .intervalleftjoin(tbl_regions_1b, lkey='chrom', rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .distinct('id')
        .sort(key=('chrom', 'start', 'type'))
        .prefixheader('feature_')
    )
tbl_features


Out[17]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_type 11|feature_region_size
Pf3D7_01_v3 repeat_region 1 360 359 + Pfalciparum_REP_20 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 361 1418 1057 + Pfalciparum_REP_15 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 2160 3858 1698 + Pfalciparum_REP_35 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 8856 9021 165 + Pfalciparum_REP_5 None None None SubtelomericRepeat 27336
Pf3D7_01_v3 repeat_region 9313 9529 216 + Pfalciparum_REP_25 None None None SubtelomericRepeat 27336

...


In [18]:
lkp_feature = tbl_features.recordlookupone('feature_id')

In [19]:
@etlcache
def tbl_genes():
    return (
        tbl_features
        .selectin('feature_type', ['gene', 'pseudogene'])
        .cutout('feature_parent_id')
    )
tbl_genes


Out[19]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_name 8|feature_previous_id 9|feature_region_type 10|feature_region_size
Pf3D7_01_v3 gene 29510 37126 7616 + PF3D7_0100100 VAR PFA0005w SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 38982 40207 1225 - PF3D7_0100200 RIF PFA0010c SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 42367 46507 4140 - PF3D7_0100300 None PFA0015c SubtelomericHypervariable 65564
Pf3D7_01_v3 gene 50363 51636 1273 + PF3D7_0100400 RIF PFA0020w SubtelomericHypervariable 65564
Pf3D7_01_v3 pseudogene 53169 53280 111 - PF3D7_0100500 var pseudogene, exon 1 PFA0025c SubtelomericHypervariable 65564

...


In [20]:
@etlcache
def tbl_intergenic():
    return (
        tbl_genes
        .addfieldusingcontext('next', lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id, nxt.feature_strand) if nxt and cur.feature_chrom == nxt.feature_chrom else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id', 'next_strand'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v, row: 'intergenic %s/%s' % (row.feature_strand, row.next_strand), pass_row=True)
        # reclassify based on expected number of promoters
        .convert('feature_type', {'intergenic +/-': 'intergenic_0', 'intergenic -/+': 'intergenic_2', 'intergenic +/+': 'intergenic_1', 'intergenic -/-': 'intergenic_1'})
        .cutout('feature_strand', 'next_strand', 'feature_name', 'feature_previous_id')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .gt('feature_length', 0)
    )

tbl_intergenic


Out[20]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_id 6|feature_region_size 7|feature_region_type 8|region_size
Pf3D7_01_v3 intergenic_0 37126 38982 1856 PF3D7_0100100_PF3D7_0100200 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 40207 42367 2160 PF3D7_0100200_PF3D7_0100300 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 46507 50363 3856 PF3D7_0100300_PF3D7_0100400 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 51636 53169 1533 PF3D7_0100400_PF3D7_0100500 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 53280 53778 498 PF3D7_0100500_PF3D7_0100600 65564 SubtelomericHypervariable 65564

...


In [21]:
@etlcache
def tbl_exons():
    return (
        tbl_features
        .eq('feature_type', 'CDS')
        .gt('feature_length', 0)
    )

tbl_exons


Out[21]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_type 11|feature_region_size
Pf3D7_01_v3 CDS 29510 34762 5252 + PF3D7_0100100.1:exon:1 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 35888 37126 1238 + PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 38982 39923 941 - PF3D7_0100200.1:exon:1 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 40154 40207 53 - PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 42367 43617 1250 - PF3D7_0100300.1:exon:1 PF3D7_0100300.1 None None SubtelomericHypervariable 65564

...


In [22]:
@etlcache
def tbl_introns():
    return (
        tbl_exons
        .addfieldusingcontext('next', 
                              lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id) if nxt and cur.feature_parent_id == nxt.feature_parent_id else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v: 'intron')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
    )

tbl_introns


Out[22]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_size 11|feature_region_type 12|region_size
Pf3D7_01_v3 intron 34762 35888 1126 + PF3D7_0100100.1:exon:1_PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 39923 40154 231 - PF3D7_0100200.1:exon:1_PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 43617 43775 158 - PF3D7_0100300.1:exon:1_PF3D7_0100300.1:exon:2 PF3D7_0100300.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 50416 50572 156 + PF3D7_0100400.1:exon:1_PF3D7_0100400.1:exon:2 PF3D7_0100400.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 54788 54938 150 - PF3D7_0100600.1:exon:1_PF3D7_0100600.1:exon:2 PF3D7_0100600.1 None None 65564 SubtelomericHypervariable 65564

...


In [23]:
@etlcache
def tbl_gene_aliases():
    return (etl
        .fromtsv(os.path.join(DATA_DIR, 'genome/plasmodb/release-9.3/Pfalciparum3D7/txt/PlasmoDB-9.3_Pfalciparum3D7_GeneAliases.txt'))
        .pushheader(['feature_id'] + list(map(str, range(10))))
        .convert('feature_id', lambda v: v[:13])  # ignore alternate splice forms
        .cat()  # square up short rows
        .melt('feature_id')
        .rename('value', 'feature_alias')
        .cut('feature_id', 'feature_alias')
        .ne('feature_alias', None)
    )

tbl_gene_aliases


Out[23]:
0|feature_id 1|feature_alias
PF3D7_1314600 814061
PF3D7_1314600 PF13_0083
PF3D7_0209400 8444970
PF3D7_0209400 PF02_0090
PF3D7_0209400 PFB0423c

...

Heterochromatin (Flueck et al. 2009)


In [24]:
@etlcache
def tbl_hp1():
    return (etl
        .fromtext(os.path.join(DATA_DIR, 'reference/flueck_2009/journal.ppat.1000569.s008.txt'))
        .rowslice(3, None)
        .ne('lines', '')
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+(\S+)', ['feature_alias'], include_original=1)
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+\S+.*?(-?\d\.\d{3})', ['hp1'], include_original=1)
        .convert('hp1', float)
        .cutout('lines')
        .leftjoin(tbl_gene_aliases, key='feature_alias')
        .ne('feature_id', None)
        .sort('feature_id')
    )

tbl_hp1


Out[24]:
0|feature_alias 1|hp1 2|feature_id
PFA0005w 4.648 PF3D7_0100100
PFA0010c 3.763 PF3D7_0100200
PFA0015c 5.042 PF3D7_0100300
PFA0020w 3.721 PF3D7_0100400
PFA0030c 4.398 PF3D7_0100600

...


In [ ]:

Plotting


In [25]:
def plot_variants_locator(ax, callset, chrom, start=0, stop=None, step=5):
    V, _, _ = unpack_callset(callset)
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    anhima.loc.plot_variant_locator(POS, ax=ax, step=step, start_position=start, stop_position=stop, line_args=dict(color='k'))
    ax.set_xticks([])
    ax.set_xlabel('')

In [26]:
INHERITANCE_COLORS = [
    'white', # missing
    'grey', # filtered
    'yellow', # parent missing
    'yellow', # parent filtered
    'black', # non-parental
    'orange', # non-segregating ref 
    'green', # non-segregating alt 
    'blue', # parent 1 
    'red' # parent 2 
]


def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):

    V, C, C2d = unpack_callset(callset)
    samples = C.dtype.names
    
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    G = np.take(C2d.genotype, indices, axis=0)
    P = inheritance(G)
    
    if sample_indices is not None:
        P = np.take(P, sample_indices, axis=1)
        samples = np.take(samples, sample_indices, axis=0) 

    anhima.gt.plot_discrete_calldata(P, labels=samples, colors=colors, states=range(len(colors)), ax=ax, **kwargs)

Gene annotations


In [27]:
def get_features(chrom, start=None, stop=None):
    if None not in {start, stop}:
        region = '%s:%s-%s' % (chrom, start, stop)
    else:
        region = chrom
    tbl_features = (etl
        .fromgff3(GFF_FN, region)
        .cutout('source', 'score', 'phase')
        .unpackdict('attributes', ['ID', 'Name', 'previous_systematic_id'], includeoriginal=True)
    )
    tbl_features = tbl_features.setheader(['feature_' + str(f).lower() for f in tbl_features.header()])
    return tbl_features


prog_hv = re.compile('var|rif|stev', flags=re.I) 


def plot_genes(ax, chrom, start=0, stop=None, label_named=False, exclude_labels=[], width=.3, label_ytextpad=.1,
               normal_color='k', hypervariable_color='r', cds_color='gray', divider_linewidth=1,
               annotate_genes=[], annotate_xtext=0, annotate_ytext_fwd=6, annotate_ytext_rev=8, 
               annotate_arrowprops=None, annotate_bbox=None, 
               show_cds=False, gene_labels=dict()):
    if stop is None:
        stop = len(fasta[chrom])
    tbl_local_features = get_features(chrom, start, stop)
    tbl_local_genes = tbl_local_features.selectin('feature_type', {'gene', 'pseudogene'})
    tbl_local_cds = tbl_local_features.eq('feature_type', 'CDS')
    
    # divider
    ax.plot([start, stop], [.5, .5], 'k-', linewidth=divider_linewidth)
    ax.set_ylim(0, 1)
    
    # positive strand
    tbl_fwd_genes = tbl_local_genes.eq('feature_strand', '+')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color 
          for r in tbl_fwd_genes.records()]
    ax.broken_barh(xranges, (.5, width), color=fc)
    if show_cds:
        tbl_fwd_cds = tbl_local_cds.eq('feature_strand', '+')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_cds.records()]
        ax.broken_barh(xranges, (.5, width), color=cds_color)
        
    
    # negative strand
    tbl_rev_genes = tbl_local_genes.eq('feature_strand', '-')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color
          for r in tbl_rev_genes.records()]
    ax.broken_barh(xranges, (.5-width, width), color=fc)
    if show_cds:
        tbl_rev_cds = tbl_local_cds.eq('feature_strand', '-')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_cds.records()]
        ax.broken_barh(xranges, (.5-width, width), color=cds_color)
    
    for s in 'top', 'left', 'right':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_yticks([])
    ax.xaxis.tick_bottom()

    # label named genes
    if label_named:
        for g in tbl_local_genes.records():
            if g.feature_name is not None and g.feature_name not in exclude_labels:
                x = g.feature_start
                y = .5 + width + label_ytextpad if g.feature_strand == '+' else .5 - width - label_ytextpad
                va = 'bottom' if g.feature_strand == '+' else 'top'
                l = g.feature_name
                ax.text(x, y, l, rotation=0, ha='left', va=va, transform=ax.transData)
                
    # annotate specific genes
    if annotate_arrowprops is None:
        annotate_arrowprops = dict(facecolor='k', arrowstyle='-', shrinkA=0, shrinkB=0)
    if annotate_bbox is None:
        annotate_bbox = dict(facecolor='w', alpha=.9)
    for id in annotate_genes:
        g = lkp_feature[id]
        gene_id = g['feature_id']
        l = gene_labels[gene_id] if gene_id in gene_labels else g['feature_name'] if g['feature_name'] is not None else gene_id
        if g['feature_strand'] == '+':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_fwd), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)
        elif g['feature_strand'] == '-':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_rev), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)

Accessibility


In [28]:
CNV_RUNS = {
    '3d7_hb3': ['ERR019061', 'ERR019054'],
    'hb3_dd2': ['ERR012788', 'ERR012840', 'ERR019054'],  # share HB3s
    '7g8_gb4': ['ERR027099', 'ERR027100']
}


accessibility_colors = {
    'Core': 'white', 
    'SubtelomericRepeat': 'yellow',
    'SubtelomericHypervariable': '#d73027',
    'InternalHypervariable': '#fc8d59',
    'Centromere': 'black',
    'CNV': '#91bfdb',
}


def plot_accessibility(ax, chrom, cross=None, start=0, stop=None, linewidth=0.5, **kwargs):
    tbl_lreg = tbl_regions_1b.eq('region_chrom', chrom)
    if stop is None:
        stop = len(fasta[chrom])

    # plot core
    xranges = tbl_lreg.eq('region_type', 'Core').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Core'], 
                   linewidth=linewidth, **kwargs)

    # plot centromere
    xranges = tbl_lreg.eq('region_type', 'Centromere').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Centromere'], 
                   linewidth=linewidth, **kwargs)

    # plot subtelomeric repeat
    xranges = tbl_lreg.eq('region_type', 'SubtelomericRepeat').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericRepeat'], 
                   linewidth=linewidth, **kwargs)
    
    # plot hypervariable
    xranges = tbl_lreg.eq('region_type', 'SubtelomericHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericHypervariable'], 
                   linewidth=linewidth, **kwargs)
    xranges = tbl_lreg.eq('region_type', 'InternalHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['InternalHypervariable'], 
                   linewidth=linewidth, **kwargs)
    
    if cross is not None:
        # plot CNVs
        tbls_cnvs = [etl.fromtsv(CNV_TBL_FN_TEMPLATE.format(run=run)).convertnumbers() for run in CNV_RUNS[cross]]
        tbl_cnvs = (etl
            .cat(*tbls_cnvs)
            .gt('cn', 1)
            .eq('chrom', chrom)
            .intervalleftjoin(tbl_lreg, lkey='chrom', rkey='region_chrom', 
                              lstart='start', lstop='stop', 
                              rstart='region_start', rstop='region_stop')
            .eq('region_type', 'Core')
        )
        xranges = tbl_cnvs.values(['start', 'size'])
        ax.broken_barh(xranges, (.1, .8), color=accessibility_colors['CNV'], 
                       linewidth=linewidth, **kwargs)
    
    # tidy up
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.xaxis.tick_bottom()
    ax.set_ylim(0, 1)

Heterochromatin


In [29]:
def plot_hp1(ax, chrom, start=0, stop=None):

    if stop is None:
        stop = len(fasta[chrom])
        
    tbl_genes_local = get_features(chrom, start, stop).selectin('feature_type', {'gene', 'pseudogene'})
    A = tbl_genes_local.cut('feature_start', 'feature_end', 'feature_id').join(tbl_hp1, key='feature_id').torecarray()
    left = A.feature_start
    height = A.hp1
    width = A.feature_end - A.feature_start
    ax.bar(left, height, width, facecolor='purple', linewidth=1, edgecolor='purple')
    
    ax.set_xlim(start, stop)
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_yticks([])
    ax.set_ylim(-1, 7)
    ax.set_xticks([])
#    ax.plot([start, stop], [0, 0], color='k', linestyle='-')

Inheritance


In [30]:
# INHERITANCE_COLORS = [
#     'white', # missing
#     'grey', # filtered
#     'yellow', # parent missing
#     'yellow', # parent filtered
#     'black', # non-parental
#     'orange', # non-segregating ref 
#     'green', # non-segregating alt 
#     'blue', # parent 1 
#     'red' # parent 2 
# ]


# def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):

#     V, C, C2d = unpack_callset(callset)
#     samples = C.dtype.names
    
#     if stop is None:
#         stop = len(fasta[chrom])
#     indices = np.nonzero((V.CHROM == chrom) & (V.POS >= start) & (V.POS <= stop))[0]
#     POS = np.take(V.POS, indices, axis=0)
#     G = np.take(C2d.genotype, indices, axis=0)
#     P = inheritance(G)
    
#     if sample_indices is not None:
#         P = np.take(P, sample_indices, axis=1)
#         samples = np.take(samples, sample_indices, axis=0) 

#     vcfplt.discrete_calldata_colormesh(P, labels=samples, colors=colors, states=range(len(colors)), alpha=alpha, ax=ax, **kwargs)

Recombination


In [31]:
def marker_span(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    x = POS[-1] - POS[0]
    return x


def marker_offset(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    return POS[0]


def locate_flanking_markers(CHROM, POS, chrom, pos):
    chrom_start_index, chrom_stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[chrom_start_index:chrom_stop_index]
    
    start_index = bisect_left(POS, pos) - 1
    stop_index = bisect_right(POS, pos)
    
    if start_index < 0:
        start_index = None
    else:
        start_index += chrom_start_index
        
    if stop_index >= len(POS):
        stop_index = None
    else:
        stop_index += chrom_start_index
    
    return start_index, stop_index
    

def simulate_random_uniform_crossovers(n_events, callset, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 'co_pos', 'co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)
    
    # map each event onto a chromosome and position
    for x in X:

        # which chromosome?
        chrom_index = bisect_left(cumulative_marker_span, x)
        chrom = chromosomes[chrom_index]

        # which position?
        if chrom_index > 0:
            pos_relative = x - cumulative_marker_span[chrom_index-1]
        else:
            pos_relative = x
        pos = pos_relative + marker_offsets[chrom_index]
        
        # find flanking markers
        lidx, ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, pos)

        pos_min = V.POS[lidx]
        pos_max = V.POS[ridx]
        pos_mid = (pos_min + pos_max)//2
        pos_range = pos_max - pos_min
        
        row = [chrom, pos, pos_mid, pos_min, pos_max, pos_range]
        tbl += [row]
        
    return etl.wrap(tbl)


def simulate_random_uniform_non_crossovers(n_events, callset, phi, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 
            'tract_start', 
            'tract_stop', 
            'tract_length',
            'tract_start_min', 
            'tract_start_mid', 
            'tract_start_max', 
            'tract_stop_min', 
            'tract_stop_mid', 
            'tract_stop_max', 
            'tract_length_min', 
            'tract_length_mid', 
            'tract_length_max', 
            'tract_support']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)

    # generate random geometric tract lengths
    lengths = stats.geom.rvs(1-phi, size=n_events).astype('int32')
    
    # map each event onto a chromosome and position
    for x, length in zip(X, lengths):

        try:
            # which chromosome?
            chrom_index = bisect_left(cumulative_marker_span, x)
            chrom = chromosomes[chrom_index]

            # which start position?
            if chrom_index > 0:
                pos_relative = x - cumulative_marker_span[chrom_index-1]
            else:
                pos_relative = x
            start = pos_relative + marker_offsets[chrom_index]
            stop = start + length

            # find flanking markers
            start_lidx, start_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, start)
            stop_lidx, stop_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, stop)
            
            if None in {start_lidx, start_ridx, stop_lidx, stop_ridx}:
                
                # falls off edge of marker span
                support = start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None

            else:

                support = stop_ridx - start_ridx

                # do we even observe the event?
                if support == 0:
                    
                    start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None

                else:

                    start_min = V.POS[start_lidx]
                    start_max = V.POS[start_ridx]
                    start_mid = (start_min + start_max)//2
                    stop_min = V.POS[stop_lidx]
                    stop_max = V.POS[stop_ridx]
                    stop_mid = (stop_min + stop_max)//2
                    length_min = stop_min - start_max
                    length_max = stop_max - start_min
                    length_mid = (length_min + length_max)//2

            row = [chrom, 
                   start, 
                   stop, 
                   length,
                   start_min, 
                   start_mid, 
                   start_max, 
                   stop_min, 
                   stop_mid, 
                   stop_max, 
                   length_min, 
                   length_mid, 
                   length_max, 
                   support]
            tbl += [row]
    
        except Exception as e:
            log(e)
            log(traceback.format_exc(10))
                    
    return etl.wrap(tbl)

Optimisations


In [32]:
%reload_ext Cython

In [33]:
%%cython

import numpy as np
cimport numpy as np
import petl as etl
import vcfnp


CHROMOSOMES = [('Pf3D7_%02d_v3' % n).encode('ascii') for n in range(1, 15)]

# constants to represent the possible outcomes when attempting to determine the inheritance for any given variant call
INHERITANCE_MISSING = 0  # genotype call is missing
INHERITANCE_FILTERED = 1  # genotype call is filtered
INHERITANCE_PARENT_MISSING = 2  # genotype is called but cannot be phased because one or both parents is missing
INHERITANCE_PARENT_FILTERED = 3  # genotype is called but cannot be phased because one or both parents is filtered
INHERITANCE_NONPARENTAL = 4  # genotype call does not correspond with either parent (i.e., mendelian error sensu strictu)
INHERITANCE_NONSEGREGATING_REF = 5  # genotype call is ref and parents both have reference allele
INHERITANCE_NONSEGREGATING_ALT = 6  # genotype call is alt and parents both have alternate allele
INHERITANCE_PARENT1 = 7  # genotype call corresponds with first parent
INHERITANCE_PARENT2 = 8  # genotype call corresponds with second parent


# functions to determine inheritance states
###########################################


def inheritance(np.ndarray[np.int8_t, ndim=2] G):
    cdef int m = G.shape[0] # number of variants
    cdef int n = G.shape[1] # number of samples
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = np.zeros((m,n), dtype='u1')
    for i in range(m):
        for j in range(n):
            # N.B., parents are *first* two samples
            P[i,j] = inheritance_state(G[i,j], G[i,0], G[i,1])
    return P


cdef int inheritance_state(int genotype_child, int genotype_parent1, int genotype_parent2):
    if genotype_child == -1:
        return INHERITANCE_MISSING
    elif genotype_child == -2:
        return INHERITANCE_FILTERED
    elif genotype_parent1 == -1 or genotype_parent2 == -1:
        return INHERITANCE_PARENT_MISSING
    elif genotype_parent1 == -2 or genotype_parent2 == -2:
        return INHERITANCE_PARENT_FILTERED
    elif genotype_child != genotype_parent1 and genotype_child != genotype_parent2:
        return INHERITANCE_NONPARENTAL
    elif genotype_child == genotype_parent1 == genotype_parent2 == 0:
        return INHERITANCE_NONSEGREGATING_REF
    elif genotype_child == genotype_parent1 == genotype_parent2 and genotype_child > 0:
        return INHERITANCE_NONSEGREGATING_ALT
    elif genotype_child == genotype_parent1:
        return INHERITANCE_PARENT1
    elif genotype_child == genotype_parent2:
        return INHERITANCE_PARENT2
    else:
        return -1 # should never be reached


# functions to calculate haplotype statistics
#############################################


def haplotypes_chrom(V, C, chrom):
    """
    Construct arrays of haplotype length and haplotype support for all variant calls.

    """

    tbl = list()
    header = 'sample', 'chrom', 'start_min', 'start_mid', 'start_max', 'stop_min', 'stop_mid', 'stop_max', 'length_min', \
             'length_mid', 'length_max', 'support', 'prv_inheritance', 'inheritance', 'nxt_inheritance'
    tbl.append(header)

    cdef int i, j, M, N, prv_idx, start_idx, stop_idx, start_min, start_mid, start_max, stop_min, stop_mid, \
        stop_max, length_min, length_mid, length_max, support, prv, cur

    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)

    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V = np.take(V, indices, axis=0)
    C2d = np.take(C2d, indices, axis=0)
    M, N = C2d.shape
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)  # inheritance
#    print M, N

    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']

    cdef np.ndarray[np.int32_t, ndim=2] HL
    cdef np.ndarray[np.int32_t, ndim=2] HS
    HL = np.zeros((M, N), dtype='i4') # haplotype lengths (minimum)
    HS = np.zeros((M, N), dtype='i4') # haplotype support - number of calls supporting the haplotype

    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    cdef np.ndarray[np.int32_t, ndim=2] haplotype_start_idxs
    cdef np.ndarray[np.int32_t, ndim=1] last_haplotype_inheritance
    previous_call_idx = np.array([-1] * N, dtype='i4') # store index of last good call
    haplotype_start_idxs = np.array([[-1, -1]] * N, dtype='i4') # store indices of haplotype start (flanking markers)
    last_haplotype_inheritance = np.array([-1] * N, dtype='i4') # store parental inheritance of upstream block

    # iterate over variants
    for i in range(M):
        # iterate over samples
        for j in range(N):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call, start a haplotype
                    haplotype_start_idxs[j] = (i, i)
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # inheritance switch, haplotype has ended
                        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
                        stop_min_idx, stop_max_idx = prv_idx, i
                        start_min = POS[start_min_idx]
                        start_max = POS[start_max_idx]
                        start_mid = (start_max + start_min) // 2
                        stop_min = POS[stop_min_idx]
                        stop_max = POS[stop_max_idx]
                        stop_mid = (stop_max + stop_min) // 2
                        length_min = stop_min - start_max
                        length_max = stop_max - start_min
                        length_mid = stop_mid - start_mid
                        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == prv)
                        # assign results
                        HL[start_max_idx:stop_min_idx+1, j] = length_min
                        HS[start_max_idx:stop_min_idx+1, j] = support
                        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, \
                              length_min, length_mid, length_max, support, last_haplotype_inheritance[j], prv, cur
                        tbl.append(row)
                        # start new haplotype
                        haplotype_start_idxs[j] = (prv_idx, i)
                        last_haplotype_inheritance[j] = prv
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i

    # final haplotypes ending at chromosome end
    for j in range(N):
        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
        stop_min_idx, stop_max_idx = prv_idx, prv_idx  # nothing subsequent
        start_min = POS[start_min_idx]
        start_max = POS[start_max_idx]
        start_mid = (start_max + start_min) // 2
        stop_min = POS[stop_min_idx]
        stop_max = POS[stop_max_idx]
        stop_mid = (stop_max + stop_min) // 2
        length_min = stop_min - start_max
        length_max = stop_max - start_min
        length_mid = stop_mid - start_mid
        inherit = P[prv_idx, j]
        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == inherit)
        # assign results
        HL[start_max_idx:stop_min_idx+1, j] = length_min
        HS[start_max_idx:stop_min_idx+1, j] = support
        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, length_min, length_mid, \
              length_max, support, last_haplotype_inheritance[j], inherit, -1
        tbl.append(row)

    return HL, HS, etl.wrap(tbl)


def haplotypes(V, C):
    stats = [haplotype_stats_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    HL = np.vstack(s[0] for s in stats)
    HS = np.vstack(s[1] for s in stats)
    tbl = etl.cat(*[s[2] for s in stats])
    return HL, HS, tbl


# backwards compatibility
haplotype_stats_chrom = haplotypes_chrom
haplotype_stats = haplotypes


# functions to tabulate recombination events
############################################


def tabulate_switches_chrom(V, C, chrom):
    tbl = list()
    header = 'sample', 'chrom', 'pos', 'lpos', 'rpos', 'range', 'from', 'to'
    tbl.append(header)

    cdef int i, j, m, n, prv_idx, cur, prv

    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)

    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V, C, C2d = [np.take(X, indices, axis=0) for X in [V, C, C2d]]
    m, n = C2d.shape

    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)

    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']

    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    previous_call_idx = np.array([-1] * n, dtype='i4') # store index of last good call

    # iterate over variants
    for i in range(m):
        # iterate over samples
        for j in range(n):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call
                    pass
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # phase switch
                        lpos = POS[prv_idx]
                        rpos = POS[i]
                        row = samples[j], chrom, (lpos + rpos) // 2, lpos, rpos, rpos-lpos, prv, cur
                        tbl.append(row)
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i

    return etl.wrap(tbl)


def tabulate_switches(V, C):
    tbls = [tabulate_switches_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    return etl.cat(*tbls)

In [ ]:


In [ ]: