This notebook contains imports, constants and functions which are shared across all other notebooks containing analyses for both the main body of the paper and the supplementary information.
In [1]:
    
# python standard library
import sys
# ensure Python 3
assert sys.version_info[0:2] == (3, 5)
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
from bisect import bisect_left, bisect_right
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import re
    
In [2]:
    
# docker image
print('docker image', os.environ['DOCKER_IMAGE'])
    
    
In [3]:
    
%%HTML
<style type="text/css">
.container {
    width: 96%;
}
#maintoolbar {
    display: none;
}
#header-container {
    display: none;
}
#notebook {
    padding-top: 0;
}
</style>
    
    
In [4]:
    
# general purpose third party packages
import cython
#%load_ext Cython
import numpy as np
nnz = np.count_nonzero
import scipy
import scipy.stats
import scipy.spatial.distance
from scipy import stats
from scipy.spatial.distance import pdist, squareform
import numexpr
import h5py
import tables
import bcolz
import pandas
import IPython
from IPython.display import clear_output, display, HTML
import rpy2
import rpy2.robjects as ro
%reload_ext rpy2.ipython
import statsmodels
import sklearn
import sklearn.decomposition
import sklearn.manifold
import sh
import sqlalchemy
import pymysql
import psycopg2
import petl as etl
etl.config.display_index_header = True
import humanize
from humanize import naturalsize, intcomma, intword
    
In [5]:
    
# plotting setup
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')
sns.set_style('ticks')
rcParams = plt.rcParams
rcParams['font.size'] = 9
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 9
rcParams['axes.linewidth'] = .5
rcParams['lines.linewidth'] = .5
rcParams['patch.linewidth'] = .5
rcParams['font.family'] = 'arial'
rcParams['ytick.direction'] = 'out'
rcParams['xtick.direction'] = 'out'
rcParams['savefig.jpeg_quality'] = 100
rcParams['savefig.dpi'] = 120
rcParams['lines.markeredgewidth'] = .5
golden_ratio = 1.61803398875
    
In [6]:
    
# bio third party packages
import Bio
import pyfasta
import pysam
import pysamstats
import petlx
import petlx.bio
import vcf
import vcfnp
import anhima
import allel
    
In [7]:
    
def log(*msg):
    msg = str(datetime.datetime.now()) + ' :: ' + ' '.join(map(str, msg))
    print(msg, file=sys.stderr) 
    sys.stderr.flush()
    
from IPython.display import display_html
def pre(msg):
    display_html('<pre style="line-height: 100%; font-size: .7em; display: inline-block; white-space: pre; background-color: #ff6; border: 0px solid #ddd; padding: 1px">' + str(msg).replace('<', '<') + '</pre>', raw=True)
    
def html(content):
    display_html(content, raw=True)
    
In [8]:
    
CROSSES = '3d7_hb3', 'hb3_dd2', '7g8_gb4'
CHROMOSOMES = tuple('Pf3D7_{:02d}_v3'.format(n).encode('ascii') for n in range(1, 15))
PARENT_RUNS = 'ERR019061', 'ERR019054', 'ERR012788', 'ERR012840', 'ERR027099', 'ERR027100'
LABELS = {
    '3d7_hb3': '3D7 x HB3', 
    'hb3_dd2': 'HB3 x Dd2', 
    '7g8_gb4': '7G8 x GB4',
    'ERR019061': '3D7',
    'ERR019054': 'HB3(1)',
    'ERR012788': 'HB3(2)', 
    'ERR012840': 'Dd2',
    'ERR027099': '7G8',
    'ERR027100': 'GB4'
}
    
In [9]:
    
# N.B., change this if data are stored at a different location
BASE_DIR = '/data/plasmodium/pfalciparum/pf-crosses'
DATA_DIR = os.path.join(BASE_DIR, 'data')
RELEASE_DIR = os.path.join(DATA_DIR, 'release')
PUBLIC_DIR = os.path.join(DATA_DIR, 'public/20141022')
# public data
GATK_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.vcf.gz')
GATK_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.npz')
CORTEX_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.vcf.gz')
CORTEX_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.npz')
COMBINED_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.vcf.gz')
COMBINED_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.npz')
CORTEX_RAW_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.raw.vcf.gz')
BAM_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'bam/{run}.realigned.bam')
COVERAGE_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.coverage_ext_binned.300.{chrom}.npy')
CNV_TBL_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.txt')
CNV_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.{chrom}.npy')
# reference data
GENOME_DIR = os.path.join(DATA_DIR, 'genome/sanger/version3/September_2012')
FASTA_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.fa')
GFF_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.sorted.ascii.gff.gz')
fasta = pyfasta.Fasta(FASTA_FN)
    
In [10]:
    
dup_samples = {
    '3d7_hb3': ['C01/PG0062-C/ERR019070',
                'C02/PG0053-C/ERR019067',
                'C02/PG0055-C/ERR019066',
                'C02/PG0056-C/ERR019068'],
    'hb3_dd2': [],
    '7g8_gb4': ['AUD/PG0112-CW/ERR045639',
                'JC9/PG0111-C/ERR029409',
                'JE11/PG0100-CW/ERR045630',
                'JF6/PG0079-CW/ERR045637',
                'KB8/PG0104-CW/ERR045642',
                'LA10/PG0086-CW/ERR045629',
                'NIC/PG0095-C/ERR027107',
                'QF5/PG0078-CW/ERR045638',
                'XD8/PG0105-CW/ERR045628',
                'XF12/PG0102-CW/ERR045635'],
}
excessive_recomb_samples = {
    '3d7_hb3': [],
    'hb3_dd2': ['SC01/PG0025-C/ERR019045'],
    '7g8_gb4': ['D2/PG0094-CW/ERR045632'],
}
    
In [11]:
    
def interleave(a, b):
    c = np.empty((a.size + b.size,), dtype=a.dtype)
    c[0::2] = a
    c[1::2] = b
    return c
def unpack_callset(callset):
    V = callset['variants'].view(np.recarray)
    C = callset['calldata'].view(np.recarray)
    C2d = vcfnp.view2d(C).view(np.recarray)
    return V, C, C2d
def filter_variants(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=V)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        indices = np.nonzero(flt)[0]
        ret, exc, tot = indices.size, V.size-indices.size, V.size
        if verbose:
            log('filter variants: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'variants')
        V = np.take(V, indices, axis=0).view(np.recarray)
        C = np.take(C, indices, axis=0).view(np.recarray)
        return {'variants': V, 'calldata': C}
def filter_calls(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        C = C.copy()
        C2d = vcfnp.view2d(C).view(np.recarray)
        # filter calls
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=C2d)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        # only filter non-missing
        excl = (C2d.genotype >= 0) & ~flt
        tot, exc, ret = excl.size, np.count_nonzero(excl), np.count_nonzero(~excl)
        if verbose:
            log('filter calls: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'calls')
        C2d.genotype[excl] = -2
        return {'variants': V, 'calldata': C}    
    
def filter_samples(callset, inclusions=None, exclusions=None, verbose=True):
    if inclusions or exclusions:
        V, C, _ = unpack_callset(callset)
        samples = C.dtype.names
        if inclusions:
            exclusions = [s for s in samples if s not in inclusions]
        elif exclusions:
            inclusions = [s for s in samples if s not in exclusions]
        if verbose:
            log('filter samples: excluding', exclusions, 'including', inclusions)
        C = C[inclusions].copy()
        return {'variants': V, 'calldata': C}
    else:
        return callset
    
    
def filter_callset(callset, 
                   variant_filter=None, 
                   call_filter=None, 
                   sample_inclusions=None, 
                   sample_exclusions=None,
                   verbose=True):
    
    callset = filter_variants(callset, variant_filter, verbose=verbose)
    
    callset = filter_samples(callset, inclusions=sample_inclusions, exclusions=sample_exclusions, verbose=verbose)
    
    callset = filter_calls(callset, call_filter, verbose=verbose)
    
    return callset
def load_callset(callset_fn, 
                 variant_filter=None, 
                 call_filter=None, 
                 sample_inclusions=None, 
                 sample_exclusions=None,
                 verbose=True):
    
    if verbose:
        log('loading', callset_fn)
    callset = np.load(callset_fn)
    callset = filter_callset(callset, 
                             variant_filter=variant_filter, 
                             call_filter=call_filter, 
                             sample_inclusions=sample_inclusions,
                             sample_exclusions=sample_exclusions,
                             verbose=verbose)
    
    return callset
def filter_callsets(callsets,
                    crosses=CROSSES,
                    variant_filter=None, 
                    call_filter=None, 
                    sample_inclusions=dict(), 
                    sample_exclusions=dict(),
                    verbose=True):
    out = dict()
    for cross in callsets.keys():
        out[cross] = filter_callset(callsets[cross], 
                                    variant_filter=variant_filter,
                                    call_filter=call_filter,
                                    sample_inclusions=sample_inclusions.get(cross, None),
                                    sample_exclusions=sample_exclusions.get(cross, None),
                                    verbose=verbose)
    return out
    
    
def load_callsets(callset_fn_template,
                  crosses=CROSSES,
                  variant_filter=None, 
                  call_filter=None, 
                  sample_inclusions=dict(), 
                  sample_exclusions=dict(),
                  verbose=True):
    callsets = dict()
    for cross in CROSSES:
        callset_fn = callset_fn_template.format(cross=cross)
        callsets[cross] = load_callset(callset_fn, 
                                       variant_filter=variant_filter,
                                       call_filter=call_filter,
                                       sample_inclusions=sample_inclusions.get(cross, None),
                                       sample_exclusions=sample_exclusions.get(cross, None),
                                       verbose=verbose)
    return callsets
    
    
def locate_variants(CHROM, POS, chrom, start=None, stop=None):
    if isinstance(chrom, str):
        chrom = chrom.encode('ascii')
    start_index, stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[start_index:stop_index]
    if stop is not None:
        stop_index = start_index + bisect_right(POS, stop)
    if start is not None:
        start_index = start_index + bisect_left(POS, start)
    return start_index, stop_index
def min_haplen_calls(min_haplen):
    def _f(v, c, c2d):
        hl, hs, _ = haplotypes(v, c)
        return hl >= min_haplen
    return _f
def gatk_conf_calls(V, C, C2d):
    return C2d.GQ >= 99
def cortex_conf_calls(V, C, C2d):
    return C2d.GT_CONF >= 50
def combined_conf_calls(v, c, c2d):
    return ((v.set[:,np.newaxis] == b'Cortex') & (c2d.GT_CONF >= 50)
            | (v.set[:,np.newaxis] != b'Cortex') & (c2d.GQ >= 99))
    
In [12]:
    
def display_with_nrows(tbl, n=5, caption=''):
    caption += ' (%s rows)' % tbl.nrows()
    tbl.display(n, caption=caption)
    
In [13]:
    
CACHE_DIR = os.path.join(PUBLIC_DIR, 'cache')
    
In [14]:
    
def etlcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        etl.topickle(f(), fn)
    return etl.frompickle(fn)
    
    
def pdcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        obj = f()
        pandas.to_pickle(obj, fn)
        return obj
    else:
        return pandas.read_pickle(fn)
    
    
def nocache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if os.path.exists(fn):
        os.remove(fn)
    return f()
    
In [15]:
    
tbl_samples = (etl
    .fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
    .addfield('ID', lambda row: '%s/%s/%s' % (row.clone, row.sample, row.run))
)
run2id = tbl_samples.lookupone('run', 'ID')
tbl_samples
    
    Out[15]:
In [16]:
    
@etlcache
def tbl_regions_1b():
    return (etl
        .fromtsv(os.path.join(PUBLIC_DIR, 'regions-20130225.onebased.txt'))
        .pushheader(['chrom', 'start', 'stop', 'type'])
        .convertnumbers()
        .prefixheader('region_')
        .addfield('region_size', lambda row: row.region_stop - row.region_start + 1)
    )
tbl_regions_1b
    
    Out[16]:
In [17]:
    
@etlcache
def tbl_features():
    return (etl
        .fromgff3(GFF_FN)
        .unpackdict('attributes', ['ID', 'Parent', 'Name', 'previous_systematic_id'])
        .rename({'seqid': 'chrom', 'end': 'stop', 'ID': 'id', 'Parent': 'parent_id', 'Name': 'name', 'previous_systematic_id': 'previous_id'})
        .cutout('source', 'score', 'phase')
        .addfield('length', lambda row: row.stop - row.start, index=4)
        .intervalleftjoin(tbl_regions_1b, lkey='chrom', rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .distinct('id')
        .sort(key=('chrom', 'start', 'type'))
        .prefixheader('feature_')
    )
tbl_features
    
    Out[17]:
In [18]:
    
lkp_feature = tbl_features.recordlookupone('feature_id')
    
In [19]:
    
@etlcache
def tbl_genes():
    return (
        tbl_features
        .selectin('feature_type', ['gene', 'pseudogene'])
        .cutout('feature_parent_id')
    )
tbl_genes
    
    Out[19]:
In [20]:
    
@etlcache
def tbl_intergenic():
    return (
        tbl_genes
        .addfieldusingcontext('next', lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id, nxt.feature_strand) if nxt and cur.feature_chrom == nxt.feature_chrom else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id', 'next_strand'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v, row: 'intergenic %s/%s' % (row.feature_strand, row.next_strand), pass_row=True)
        # reclassify based on expected number of promoters
        .convert('feature_type', {'intergenic +/-': 'intergenic_0', 'intergenic -/+': 'intergenic_2', 'intergenic +/+': 'intergenic_1', 'intergenic -/-': 'intergenic_1'})
        .cutout('feature_strand', 'next_strand', 'feature_name', 'feature_previous_id')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .gt('feature_length', 0)
    )
tbl_intergenic
    
    Out[20]:
In [21]:
    
@etlcache
def tbl_exons():
    return (
        tbl_features
        .eq('feature_type', 'CDS')
        .gt('feature_length', 0)
    )
tbl_exons
    
    Out[21]:
In [22]:
    
@etlcache
def tbl_introns():
    return (
        tbl_exons
        .addfieldusingcontext('next', 
                              lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id) if nxt and cur.feature_parent_id == nxt.feature_parent_id else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v: 'intron')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
    )
tbl_introns
    
    Out[22]:
In [23]:
    
@etlcache
def tbl_gene_aliases():
    return (etl
        .fromtsv(os.path.join(DATA_DIR, 'genome/plasmodb/release-9.3/Pfalciparum3D7/txt/PlasmoDB-9.3_Pfalciparum3D7_GeneAliases.txt'))
        .pushheader(['feature_id'] + list(map(str, range(10))))
        .convert('feature_id', lambda v: v[:13])  # ignore alternate splice forms
        .cat()  # square up short rows
        .melt('feature_id')
        .rename('value', 'feature_alias')
        .cut('feature_id', 'feature_alias')
        .ne('feature_alias', None)
    )
tbl_gene_aliases
    
    Out[23]:
In [24]:
    
@etlcache
def tbl_hp1():
    return (etl
        .fromtext(os.path.join(DATA_DIR, 'reference/flueck_2009/journal.ppat.1000569.s008.txt'))
        .rowslice(3, None)
        .ne('lines', '')
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+(\S+)', ['feature_alias'], include_original=1)
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+\S+.*?(-?\d\.\d{3})', ['hp1'], include_original=1)
        .convert('hp1', float)
        .cutout('lines')
        .leftjoin(tbl_gene_aliases, key='feature_alias')
        .ne('feature_id', None)
        .sort('feature_id')
    )
tbl_hp1
    
    Out[24]:
In [ ]:
    
    
In [25]:
    
def plot_variants_locator(ax, callset, chrom, start=0, stop=None, step=5):
    V, _, _ = unpack_callset(callset)
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    anhima.loc.plot_variant_locator(POS, ax=ax, step=step, start_position=start, stop_position=stop, line_args=dict(color='k'))
    ax.set_xticks([])
    ax.set_xlabel('')
    
In [26]:
    
INHERITANCE_COLORS = [
    'white', # missing
    'grey', # filtered
    'yellow', # parent missing
    'yellow', # parent filtered
    'black', # non-parental
    'orange', # non-segregating ref 
    'green', # non-segregating alt 
    'blue', # parent 1 
    'red' # parent 2 
]
def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):
    V, C, C2d = unpack_callset(callset)
    samples = C.dtype.names
    
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    G = np.take(C2d.genotype, indices, axis=0)
    P = inheritance(G)
    
    if sample_indices is not None:
        P = np.take(P, sample_indices, axis=1)
        samples = np.take(samples, sample_indices, axis=0) 
    anhima.gt.plot_discrete_calldata(P, labels=samples, colors=colors, states=range(len(colors)), ax=ax, **kwargs)
    
In [27]:
    
def get_features(chrom, start=None, stop=None):
    if None not in {start, stop}:
        region = '%s:%s-%s' % (chrom, start, stop)
    else:
        region = chrom
    tbl_features = (etl
        .fromgff3(GFF_FN, region)
        .cutout('source', 'score', 'phase')
        .unpackdict('attributes', ['ID', 'Name', 'previous_systematic_id'], includeoriginal=True)
    )
    tbl_features = tbl_features.setheader(['feature_' + str(f).lower() for f in tbl_features.header()])
    return tbl_features
prog_hv = re.compile('var|rif|stev', flags=re.I) 
def plot_genes(ax, chrom, start=0, stop=None, label_named=False, exclude_labels=[], width=.3, label_ytextpad=.1,
               normal_color='k', hypervariable_color='r', cds_color='gray', divider_linewidth=1,
               annotate_genes=[], annotate_xtext=0, annotate_ytext_fwd=6, annotate_ytext_rev=8, 
               annotate_arrowprops=None, annotate_bbox=None, 
               show_cds=False, gene_labels=dict()):
    if stop is None:
        stop = len(fasta[chrom])
    tbl_local_features = get_features(chrom, start, stop)
    tbl_local_genes = tbl_local_features.selectin('feature_type', {'gene', 'pseudogene'})
    tbl_local_cds = tbl_local_features.eq('feature_type', 'CDS')
    
    # divider
    ax.plot([start, stop], [.5, .5], 'k-', linewidth=divider_linewidth)
    ax.set_ylim(0, 1)
    
    # positive strand
    tbl_fwd_genes = tbl_local_genes.eq('feature_strand', '+')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color 
          for r in tbl_fwd_genes.records()]
    ax.broken_barh(xranges, (.5, width), color=fc)
    if show_cds:
        tbl_fwd_cds = tbl_local_cds.eq('feature_strand', '+')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_cds.records()]
        ax.broken_barh(xranges, (.5, width), color=cds_color)
        
    
    # negative strand
    tbl_rev_genes = tbl_local_genes.eq('feature_strand', '-')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color
          for r in tbl_rev_genes.records()]
    ax.broken_barh(xranges, (.5-width, width), color=fc)
    if show_cds:
        tbl_rev_cds = tbl_local_cds.eq('feature_strand', '-')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_cds.records()]
        ax.broken_barh(xranges, (.5-width, width), color=cds_color)
    
    for s in 'top', 'left', 'right':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_yticks([])
    ax.xaxis.tick_bottom()
    # label named genes
    if label_named:
        for g in tbl_local_genes.records():
            if g.feature_name is not None and g.feature_name not in exclude_labels:
                x = g.feature_start
                y = .5 + width + label_ytextpad if g.feature_strand == '+' else .5 - width - label_ytextpad
                va = 'bottom' if g.feature_strand == '+' else 'top'
                l = g.feature_name
                ax.text(x, y, l, rotation=0, ha='left', va=va, transform=ax.transData)
                
    # annotate specific genes
    if annotate_arrowprops is None:
        annotate_arrowprops = dict(facecolor='k', arrowstyle='-', shrinkA=0, shrinkB=0)
    if annotate_bbox is None:
        annotate_bbox = dict(facecolor='w', alpha=.9)
    for id in annotate_genes:
        g = lkp_feature[id]
        gene_id = g['feature_id']
        l = gene_labels[gene_id] if gene_id in gene_labels else g['feature_name'] if g['feature_name'] is not None else gene_id
        if g['feature_strand'] == '+':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_fwd), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)
        elif g['feature_strand'] == '-':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_rev), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)
    
In [28]:
    
CNV_RUNS = {
    '3d7_hb3': ['ERR019061', 'ERR019054'],
    'hb3_dd2': ['ERR012788', 'ERR012840', 'ERR019054'],  # share HB3s
    '7g8_gb4': ['ERR027099', 'ERR027100']
}
accessibility_colors = {
    'Core': 'white', 
    'SubtelomericRepeat': 'yellow',
    'SubtelomericHypervariable': '#d73027',
    'InternalHypervariable': '#fc8d59',
    'Centromere': 'black',
    'CNV': '#91bfdb',
}
def plot_accessibility(ax, chrom, cross=None, start=0, stop=None, linewidth=0.5, **kwargs):
    tbl_lreg = tbl_regions_1b.eq('region_chrom', chrom)
    if stop is None:
        stop = len(fasta[chrom])
    # plot core
    xranges = tbl_lreg.eq('region_type', 'Core').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Core'], 
                   linewidth=linewidth, **kwargs)
    # plot centromere
    xranges = tbl_lreg.eq('region_type', 'Centromere').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Centromere'], 
                   linewidth=linewidth, **kwargs)
    # plot subtelomeric repeat
    xranges = tbl_lreg.eq('region_type', 'SubtelomericRepeat').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericRepeat'], 
                   linewidth=linewidth, **kwargs)
    
    # plot hypervariable
    xranges = tbl_lreg.eq('region_type', 'SubtelomericHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericHypervariable'], 
                   linewidth=linewidth, **kwargs)
    xranges = tbl_lreg.eq('region_type', 'InternalHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['InternalHypervariable'], 
                   linewidth=linewidth, **kwargs)
    
    if cross is not None:
        # plot CNVs
        tbls_cnvs = [etl.fromtsv(CNV_TBL_FN_TEMPLATE.format(run=run)).convertnumbers() for run in CNV_RUNS[cross]]
        tbl_cnvs = (etl
            .cat(*tbls_cnvs)
            .gt('cn', 1)
            .eq('chrom', chrom)
            .intervalleftjoin(tbl_lreg, lkey='chrom', rkey='region_chrom', 
                              lstart='start', lstop='stop', 
                              rstart='region_start', rstop='region_stop')
            .eq('region_type', 'Core')
        )
        xranges = tbl_cnvs.values(['start', 'size'])
        ax.broken_barh(xranges, (.1, .8), color=accessibility_colors['CNV'], 
                       linewidth=linewidth, **kwargs)
    
    # tidy up
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.xaxis.tick_bottom()
    ax.set_ylim(0, 1)
    
In [29]:
    
def plot_hp1(ax, chrom, start=0, stop=None):
    if stop is None:
        stop = len(fasta[chrom])
        
    tbl_genes_local = get_features(chrom, start, stop).selectin('feature_type', {'gene', 'pseudogene'})
    A = tbl_genes_local.cut('feature_start', 'feature_end', 'feature_id').join(tbl_hp1, key='feature_id').torecarray()
    left = A.feature_start
    height = A.hp1
    width = A.feature_end - A.feature_start
    ax.bar(left, height, width, facecolor='purple', linewidth=1, edgecolor='purple')
    
    ax.set_xlim(start, stop)
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_yticks([])
    ax.set_ylim(-1, 7)
    ax.set_xticks([])
#    ax.plot([start, stop], [0, 0], color='k', linestyle='-')
    
In [30]:
    
# INHERITANCE_COLORS = [
#     'white', # missing
#     'grey', # filtered
#     'yellow', # parent missing
#     'yellow', # parent filtered
#     'black', # non-parental
#     'orange', # non-segregating ref 
#     'green', # non-segregating alt 
#     'blue', # parent 1 
#     'red' # parent 2 
# ]
# def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):
#     V, C, C2d = unpack_callset(callset)
#     samples = C.dtype.names
    
#     if stop is None:
#         stop = len(fasta[chrom])
#     indices = np.nonzero((V.CHROM == chrom) & (V.POS >= start) & (V.POS <= stop))[0]
#     POS = np.take(V.POS, indices, axis=0)
#     G = np.take(C2d.genotype, indices, axis=0)
#     P = inheritance(G)
    
#     if sample_indices is not None:
#         P = np.take(P, sample_indices, axis=1)
#         samples = np.take(samples, sample_indices, axis=0) 
#     vcfplt.discrete_calldata_colormesh(P, labels=samples, colors=colors, states=range(len(colors)), alpha=alpha, ax=ax, **kwargs)
    
In [31]:
    
def marker_span(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    x = POS[-1] - POS[0]
    return x
def marker_offset(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    return POS[0]
def locate_flanking_markers(CHROM, POS, chrom, pos):
    chrom_start_index, chrom_stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[chrom_start_index:chrom_stop_index]
    
    start_index = bisect_left(POS, pos) - 1
    stop_index = bisect_right(POS, pos)
    
    if start_index < 0:
        start_index = None
    else:
        start_index += chrom_start_index
        
    if stop_index >= len(POS):
        stop_index = None
    else:
        stop_index += chrom_start_index
    
    return start_index, stop_index
    
def simulate_random_uniform_crossovers(n_events, callset, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 'co_pos', 'co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)
    
    # map each event onto a chromosome and position
    for x in X:
        # which chromosome?
        chrom_index = bisect_left(cumulative_marker_span, x)
        chrom = chromosomes[chrom_index]
        # which position?
        if chrom_index > 0:
            pos_relative = x - cumulative_marker_span[chrom_index-1]
        else:
            pos_relative = x
        pos = pos_relative + marker_offsets[chrom_index]
        
        # find flanking markers
        lidx, ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, pos)
        pos_min = V.POS[lidx]
        pos_max = V.POS[ridx]
        pos_mid = (pos_min + pos_max)//2
        pos_range = pos_max - pos_min
        
        row = [chrom, pos, pos_mid, pos_min, pos_max, pos_range]
        tbl += [row]
        
    return etl.wrap(tbl)
def simulate_random_uniform_non_crossovers(n_events, callset, phi, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 
            'tract_start', 
            'tract_stop', 
            'tract_length',
            'tract_start_min', 
            'tract_start_mid', 
            'tract_start_max', 
            'tract_stop_min', 
            'tract_stop_mid', 
            'tract_stop_max', 
            'tract_length_min', 
            'tract_length_mid', 
            'tract_length_max', 
            'tract_support']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)
    # generate random geometric tract lengths
    lengths = stats.geom.rvs(1-phi, size=n_events).astype('int32')
    
    # map each event onto a chromosome and position
    for x, length in zip(X, lengths):
        try:
            # which chromosome?
            chrom_index = bisect_left(cumulative_marker_span, x)
            chrom = chromosomes[chrom_index]
            # which start position?
            if chrom_index > 0:
                pos_relative = x - cumulative_marker_span[chrom_index-1]
            else:
                pos_relative = x
            start = pos_relative + marker_offsets[chrom_index]
            stop = start + length
            # find flanking markers
            start_lidx, start_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, start)
            stop_lidx, stop_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, stop)
            
            if None in {start_lidx, start_ridx, stop_lidx, stop_ridx}:
                
                # falls off edge of marker span
                support = start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None
            else:
                support = stop_ridx - start_ridx
                # do we even observe the event?
                if support == 0:
                    
                    start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None
                else:
                    start_min = V.POS[start_lidx]
                    start_max = V.POS[start_ridx]
                    start_mid = (start_min + start_max)//2
                    stop_min = V.POS[stop_lidx]
                    stop_max = V.POS[stop_ridx]
                    stop_mid = (stop_min + stop_max)//2
                    length_min = stop_min - start_max
                    length_max = stop_max - start_min
                    length_mid = (length_min + length_max)//2
            row = [chrom, 
                   start, 
                   stop, 
                   length,
                   start_min, 
                   start_mid, 
                   start_max, 
                   stop_min, 
                   stop_mid, 
                   stop_max, 
                   length_min, 
                   length_mid, 
                   length_max, 
                   support]
            tbl += [row]
    
        except Exception as e:
            log(e)
            log(traceback.format_exc(10))
                    
    return etl.wrap(tbl)
    
In [32]:
    
%reload_ext Cython
    
In [33]:
    
%%cython
import numpy as np
cimport numpy as np
import petl as etl
import vcfnp
CHROMOSOMES = [('Pf3D7_%02d_v3' % n).encode('ascii') for n in range(1, 15)]
# constants to represent the possible outcomes when attempting to determine the inheritance for any given variant call
INHERITANCE_MISSING = 0  # genotype call is missing
INHERITANCE_FILTERED = 1  # genotype call is filtered
INHERITANCE_PARENT_MISSING = 2  # genotype is called but cannot be phased because one or both parents is missing
INHERITANCE_PARENT_FILTERED = 3  # genotype is called but cannot be phased because one or both parents is filtered
INHERITANCE_NONPARENTAL = 4  # genotype call does not correspond with either parent (i.e., mendelian error sensu strictu)
INHERITANCE_NONSEGREGATING_REF = 5  # genotype call is ref and parents both have reference allele
INHERITANCE_NONSEGREGATING_ALT = 6  # genotype call is alt and parents both have alternate allele
INHERITANCE_PARENT1 = 7  # genotype call corresponds with first parent
INHERITANCE_PARENT2 = 8  # genotype call corresponds with second parent
# functions to determine inheritance states
###########################################
def inheritance(np.ndarray[np.int8_t, ndim=2] G):
    cdef int m = G.shape[0] # number of variants
    cdef int n = G.shape[1] # number of samples
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = np.zeros((m,n), dtype='u1')
    for i in range(m):
        for j in range(n):
            # N.B., parents are *first* two samples
            P[i,j] = inheritance_state(G[i,j], G[i,0], G[i,1])
    return P
cdef int inheritance_state(int genotype_child, int genotype_parent1, int genotype_parent2):
    if genotype_child == -1:
        return INHERITANCE_MISSING
    elif genotype_child == -2:
        return INHERITANCE_FILTERED
    elif genotype_parent1 == -1 or genotype_parent2 == -1:
        return INHERITANCE_PARENT_MISSING
    elif genotype_parent1 == -2 or genotype_parent2 == -2:
        return INHERITANCE_PARENT_FILTERED
    elif genotype_child != genotype_parent1 and genotype_child != genotype_parent2:
        return INHERITANCE_NONPARENTAL
    elif genotype_child == genotype_parent1 == genotype_parent2 == 0:
        return INHERITANCE_NONSEGREGATING_REF
    elif genotype_child == genotype_parent1 == genotype_parent2 and genotype_child > 0:
        return INHERITANCE_NONSEGREGATING_ALT
    elif genotype_child == genotype_parent1:
        return INHERITANCE_PARENT1
    elif genotype_child == genotype_parent2:
        return INHERITANCE_PARENT2
    else:
        return -1 # should never be reached
# functions to calculate haplotype statistics
#############################################
def haplotypes_chrom(V, C, chrom):
    """
    Construct arrays of haplotype length and haplotype support for all variant calls.
    """
    tbl = list()
    header = 'sample', 'chrom', 'start_min', 'start_mid', 'start_max', 'stop_min', 'stop_mid', 'stop_max', 'length_min', \
             'length_mid', 'length_max', 'support', 'prv_inheritance', 'inheritance', 'nxt_inheritance'
    tbl.append(header)
    cdef int i, j, M, N, prv_idx, start_idx, stop_idx, start_min, start_mid, start_max, stop_min, stop_mid, \
        stop_max, length_min, length_mid, length_max, support, prv, cur
    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)
    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V = np.take(V, indices, axis=0)
    C2d = np.take(C2d, indices, axis=0)
    M, N = C2d.shape
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)  # inheritance
#    print M, N
    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']
    cdef np.ndarray[np.int32_t, ndim=2] HL
    cdef np.ndarray[np.int32_t, ndim=2] HS
    HL = np.zeros((M, N), dtype='i4') # haplotype lengths (minimum)
    HS = np.zeros((M, N), dtype='i4') # haplotype support - number of calls supporting the haplotype
    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    cdef np.ndarray[np.int32_t, ndim=2] haplotype_start_idxs
    cdef np.ndarray[np.int32_t, ndim=1] last_haplotype_inheritance
    previous_call_idx = np.array([-1] * N, dtype='i4') # store index of last good call
    haplotype_start_idxs = np.array([[-1, -1]] * N, dtype='i4') # store indices of haplotype start (flanking markers)
    last_haplotype_inheritance = np.array([-1] * N, dtype='i4') # store parental inheritance of upstream block
    # iterate over variants
    for i in range(M):
        # iterate over samples
        for j in range(N):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call, start a haplotype
                    haplotype_start_idxs[j] = (i, i)
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # inheritance switch, haplotype has ended
                        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
                        stop_min_idx, stop_max_idx = prv_idx, i
                        start_min = POS[start_min_idx]
                        start_max = POS[start_max_idx]
                        start_mid = (start_max + start_min) // 2
                        stop_min = POS[stop_min_idx]
                        stop_max = POS[stop_max_idx]
                        stop_mid = (stop_max + stop_min) // 2
                        length_min = stop_min - start_max
                        length_max = stop_max - start_min
                        length_mid = stop_mid - start_mid
                        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == prv)
                        # assign results
                        HL[start_max_idx:stop_min_idx+1, j] = length_min
                        HS[start_max_idx:stop_min_idx+1, j] = support
                        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, \
                              length_min, length_mid, length_max, support, last_haplotype_inheritance[j], prv, cur
                        tbl.append(row)
                        # start new haplotype
                        haplotype_start_idxs[j] = (prv_idx, i)
                        last_haplotype_inheritance[j] = prv
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i
    # final haplotypes ending at chromosome end
    for j in range(N):
        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
        stop_min_idx, stop_max_idx = prv_idx, prv_idx  # nothing subsequent
        start_min = POS[start_min_idx]
        start_max = POS[start_max_idx]
        start_mid = (start_max + start_min) // 2
        stop_min = POS[stop_min_idx]
        stop_max = POS[stop_max_idx]
        stop_mid = (stop_max + stop_min) // 2
        length_min = stop_min - start_max
        length_max = stop_max - start_min
        length_mid = stop_mid - start_mid
        inherit = P[prv_idx, j]
        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == inherit)
        # assign results
        HL[start_max_idx:stop_min_idx+1, j] = length_min
        HS[start_max_idx:stop_min_idx+1, j] = support
        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, length_min, length_mid, \
              length_max, support, last_haplotype_inheritance[j], inherit, -1
        tbl.append(row)
    return HL, HS, etl.wrap(tbl)
def haplotypes(V, C):
    stats = [haplotype_stats_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    HL = np.vstack(s[0] for s in stats)
    HS = np.vstack(s[1] for s in stats)
    tbl = etl.cat(*[s[2] for s in stats])
    return HL, HS, tbl
# backwards compatibility
haplotype_stats_chrom = haplotypes_chrom
haplotype_stats = haplotypes
# functions to tabulate recombination events
############################################
def tabulate_switches_chrom(V, C, chrom):
    tbl = list()
    header = 'sample', 'chrom', 'pos', 'lpos', 'rpos', 'range', 'from', 'to'
    tbl.append(header)
    cdef int i, j, m, n, prv_idx, cur, prv
    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)
    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V, C, C2d = [np.take(X, indices, axis=0) for X in [V, C, C2d]]
    m, n = C2d.shape
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)
    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']
    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    previous_call_idx = np.array([-1] * n, dtype='i4') # store index of last good call
    # iterate over variants
    for i in range(m):
        # iterate over samples
        for j in range(n):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call
                    pass
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # phase switch
                        lpos = POS[prv_idx]
                        rpos = POS[i]
                        row = samples[j], chrom, (lpos + rpos) // 2, lpos, rpos, rpos-lpos, prv, cur
                        tbl.append(row)
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i
    return etl.wrap(tbl)
def tabulate_switches(V, C):
    tbls = [tabulate_switches_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    return etl.cat(*tbls)
    
In [ ]:
    
    
In [ ]: