Shared setup

This notebook contains imports, constants and functions which are shared across all other notebooks containing analyses for both the main body of the paper and the supplementary information.

Imports



In [1]:

    
# python standard library
import sys
# ensure Python 3
assert sys.version_info[0:2] == (3, 5)
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
from bisect import bisect_left, bisect_right
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import re



In [2]:

    
# docker image
print('docker image', os.environ['DOCKER_IMAGE'])









    



docker image cggh/biipy:v1.6.0



In [3]:

    
%%HTML
<style type="text/css">
.container {
    width: 96%;
}
#maintoolbar {
    display: none;
}
#header-container {
    display: none;
}
#notebook {
    padding-top: 0;
}
</style>



In [4]:

    
# general purpose third party packages

import cython
#%load_ext Cython

import numpy as np
nnz = np.count_nonzero

import scipy
import scipy.stats
import scipy.spatial.distance
from scipy import stats
from scipy.spatial.distance import pdist, squareform

import numexpr

import h5py

import tables

import bcolz

import pandas

import IPython
from IPython.display import clear_output, display, HTML

import rpy2
import rpy2.robjects as ro
%reload_ext rpy2.ipython

import statsmodels

import sklearn
import sklearn.decomposition
import sklearn.manifold

import sh

import sqlalchemy

import pymysql

import psycopg2

import petl as etl
etl.config.display_index_header = True

import humanize
from humanize import naturalsize, intcomma, intword



In [5]:

    
# plotting setup
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns

sns.set_context('paper')
sns.set_style('white')
sns.set_style('ticks')

rcParams = plt.rcParams
rcParams['font.size'] = 9
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 9
rcParams['axes.linewidth'] = .5
rcParams['lines.linewidth'] = .5
rcParams['patch.linewidth'] = .5
rcParams['font.family'] = 'arial'
rcParams['ytick.direction'] = 'out'
rcParams['xtick.direction'] = 'out'
rcParams['savefig.jpeg_quality'] = 100
rcParams['savefig.dpi'] = 120
rcParams['lines.markeredgewidth'] = .5

golden_ratio = 1.61803398875



In [6]:

    
# bio third party packages
import Bio
import pyfasta
import pysam
import pysamstats
import petlx
import petlx.bio
import vcf
import vcfnp
import anhima
import allel

Utility functions



In [7]:

    
def log(*msg):
    msg = str(datetime.datetime.now()) + ' :: ' + ' '.join(map(str, msg))
    print(msg, file=sys.stderr) 
    sys.stderr.flush()
    
from IPython.display import display_html

def pre(msg):
    display_html('<pre style="line-height: 100%; font-size: .7em; display: inline-block; white-space: pre; background-color: #ff6; border: 0px solid #ddd; padding: 1px">' + str(msg).replace('<', '&lt;') + '</pre>', raw=True)
    
def html(content):
    display_html(content, raw=True)

Constants



In [8]:

    
CROSSES = '3d7_hb3', 'hb3_dd2', '7g8_gb4'
CHROMOSOMES = tuple('Pf3D7_{:02d}_v3'.format(n).encode('ascii') for n in range(1, 15))
PARENT_RUNS = 'ERR019061', 'ERR019054', 'ERR012788', 'ERR012840', 'ERR027099', 'ERR027100'
LABELS = {
    '3d7_hb3': '3D7 x HB3', 
    'hb3_dd2': 'HB3 x Dd2', 
    '7g8_gb4': '7G8 x GB4',
    'ERR019061': '3D7',
    'ERR019054': 'HB3(1)',
    'ERR012788': 'HB3(2)', 
    'ERR012840': 'Dd2',
    'ERR027099': '7G8',
    'ERR027100': 'GB4'
}



In [9]:

    
# N.B., change this if data are stored at a different location
BASE_DIR = '/data/plasmodium/pfalciparum/pf-crosses'
DATA_DIR = os.path.join(BASE_DIR, 'data')
RELEASE_DIR = os.path.join(DATA_DIR, 'release')
PUBLIC_DIR = os.path.join(DATA_DIR, 'public/20141022')

# public data
GATK_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.vcf.gz')
GATK_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.npz')
CORTEX_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.vcf.gz')
CORTEX_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.npz')
COMBINED_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.vcf.gz')
COMBINED_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.npz')
CORTEX_RAW_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.raw.vcf.gz')
BAM_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'bam/{run}.realigned.bam')
COVERAGE_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.coverage_ext_binned.300.{chrom}.npy')
CNV_TBL_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.txt')
CNV_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.{chrom}.npy')

# reference data
GENOME_DIR = os.path.join(DATA_DIR, 'genome/sanger/version3/September_2012')
FASTA_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.fa')
GFF_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.sorted.ascii.gff.gz')
fasta = pyfasta.Fasta(FASTA_FN)



In [10]:

    
dup_samples = {
    '3d7_hb3': ['C01/PG0062-C/ERR019070',
                'C02/PG0053-C/ERR019067',
                'C02/PG0055-C/ERR019066',
                'C02/PG0056-C/ERR019068'],
    'hb3_dd2': [],
    '7g8_gb4': ['AUD/PG0112-CW/ERR045639',
                'JC9/PG0111-C/ERR029409',
                'JE11/PG0100-CW/ERR045630',
                'JF6/PG0079-CW/ERR045637',
                'KB8/PG0104-CW/ERR045642',
                'LA10/PG0086-CW/ERR045629',
                'NIC/PG0095-C/ERR027107',
                'QF5/PG0078-CW/ERR045638',
                'XD8/PG0105-CW/ERR045628',
                'XF12/PG0102-CW/ERR045635'],
}


excessive_recomb_samples = {
    '3d7_hb3': [],
    'hb3_dd2': ['SC01/PG0025-C/ERR019045'],
    '7g8_gb4': ['D2/PG0094-CW/ERR045632'],
}

Numpy utilities



In [11]:

    
def interleave(a, b):
    c = np.empty((a.size + b.size,), dtype=a.dtype)
    c[0::2] = a
    c[1::2] = b
    return c


def unpack_callset(callset):
    V = callset['variants'].view(np.recarray)
    C = callset['calldata'].view(np.recarray)
    C2d = vcfnp.view2d(C).view(np.recarray)
    return V, C, C2d


def filter_variants(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=V)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        indices = np.nonzero(flt)[0]
        ret, exc, tot = indices.size, V.size-indices.size, V.size
        if verbose:
            log('filter variants: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'variants')
        V = np.take(V, indices, axis=0).view(np.recarray)
        C = np.take(C, indices, axis=0).view(np.recarray)
        return {'variants': V, 'calldata': C}


def filter_calls(callset, query=None, verbose=True):
    if query is None:
        return callset
    else:
        V, C, _ = unpack_callset(callset)
        C = C.copy()
        C2d = vcfnp.view2d(C).view(np.recarray)

        # filter calls
        if isinstance(query, str):
            flt = numexpr.evaluate(query, local_dict=C2d)
        elif callable(query):
            flt = query(*unpack_callset(callset))
        else:
            flt = np.asarray(query)
        # only filter non-missing
        excl = (C2d.genotype >= 0) & ~flt

        tot, exc, ret = excl.size, np.count_nonzero(excl), np.count_nonzero(~excl)
        if verbose:
            log('filter calls: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'calls')

        C2d.genotype[excl] = -2
        return {'variants': V, 'calldata': C}    
    

def filter_samples(callset, inclusions=None, exclusions=None, verbose=True):
    if inclusions or exclusions:
        V, C, _ = unpack_callset(callset)
        samples = C.dtype.names
        if inclusions:
            exclusions = [s for s in samples if s not in inclusions]
        elif exclusions:
            inclusions = [s for s in samples if s not in exclusions]
        if verbose:
            log('filter samples: excluding', exclusions, 'including', inclusions)
        C = C[inclusions].copy()
        return {'variants': V, 'calldata': C}
    else:
        return callset
    
    
def filter_callset(callset, 
                   variant_filter=None, 
                   call_filter=None, 
                   sample_inclusions=None, 
                   sample_exclusions=None,
                   verbose=True):
    
    callset = filter_variants(callset, variant_filter, verbose=verbose)
    
    callset = filter_samples(callset, inclusions=sample_inclusions, exclusions=sample_exclusions, verbose=verbose)
    
    callset = filter_calls(callset, call_filter, verbose=verbose)
    
    return callset


def load_callset(callset_fn, 
                 variant_filter=None, 
                 call_filter=None, 
                 sample_inclusions=None, 
                 sample_exclusions=None,
                 verbose=True):
    
    if verbose:
        log('loading', callset_fn)
    callset = np.load(callset_fn)

    callset = filter_callset(callset, 
                             variant_filter=variant_filter, 
                             call_filter=call_filter, 
                             sample_inclusions=sample_inclusions,
                             sample_exclusions=sample_exclusions,
                             verbose=verbose)
    
    return callset


def filter_callsets(callsets,
                    crosses=CROSSES,
                    variant_filter=None, 
                    call_filter=None, 
                    sample_inclusions=dict(), 
                    sample_exclusions=dict(),
                    verbose=True):
    out = dict()
    for cross in callsets.keys():
        out[cross] = filter_callset(callsets[cross], 
                                    variant_filter=variant_filter,
                                    call_filter=call_filter,
                                    sample_inclusions=sample_inclusions.get(cross, None),
                                    sample_exclusions=sample_exclusions.get(cross, None),
                                    verbose=verbose)
    return out
    
    
def load_callsets(callset_fn_template,
                  crosses=CROSSES,
                  variant_filter=None, 
                  call_filter=None, 
                  sample_inclusions=dict(), 
                  sample_exclusions=dict(),
                  verbose=True):
    callsets = dict()
    for cross in CROSSES:
        callset_fn = callset_fn_template.format(cross=cross)
        callsets[cross] = load_callset(callset_fn, 
                                       variant_filter=variant_filter,
                                       call_filter=call_filter,
                                       sample_inclusions=sample_inclusions.get(cross, None),
                                       sample_exclusions=sample_exclusions.get(cross, None),
                                       verbose=verbose)
    return callsets
    
    
def locate_variants(CHROM, POS, chrom, start=None, stop=None):
    if isinstance(chrom, str):
        chrom = chrom.encode('ascii')
    start_index, stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[start_index:stop_index]
    if stop is not None:
        stop_index = start_index + bisect_right(POS, stop)
    if start is not None:
        start_index = start_index + bisect_left(POS, start)
    return start_index, stop_index


def min_haplen_calls(min_haplen):
    def _f(v, c, c2d):
        hl, hs, _ = haplotypes(v, c)
        return hl >= min_haplen
    return _f


def gatk_conf_calls(V, C, C2d):
    return C2d.GQ >= 99


def cortex_conf_calls(V, C, C2d):
    return C2d.GT_CONF >= 50


def combined_conf_calls(v, c, c2d):
    return ((v.set[:,np.newaxis] == b'Cortex') & (c2d.GT_CONF >= 50)
            | (v.set[:,np.newaxis] != b'Cortex') & (c2d.GQ >= 99))

Tables



In [12]:

    
def display_with_nrows(tbl, n=5, caption=''):
    caption += ' (%s rows)' % tbl.nrows()
    tbl.display(n, caption=caption)



In [13]:

    
CACHE_DIR = os.path.join(PUBLIC_DIR, 'cache')



In [14]:

    
def etlcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        etl.topickle(f(), fn)
    return etl.frompickle(fn)
    
    
def pdcache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if not os.path.exists(fn):
        obj = f()
        pandas.to_pickle(obj, fn)
        return obj
    else:
        return pandas.read_pickle(fn)
    
    
def nocache(f):
    fn = os.path.join(CACHE_DIR, f.__name__)
    if os.path.exists(fn):
        os.remove(fn)
    return f()

Sample metadata



In [15]:

    
tbl_samples = (etl
    .fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
    .addfield('ID', lambda row: '%s/%s/%s' % (row.clone, row.sample, row.run))
)
run2id = tbl_samples.lookupone('run', 'ID')
tbl_samples









    Out[15]:







0|cross
1|clone
2|sample
3|run
4|instrument
5|coverage
6|ID




3d7_hb3
3D7
PG0051-C
ERR019061
Illumina Genome Analyzer II
122X
3D7/PG0051-C/ERR019061


3d7_hb3
C01
PG0065-C
ERR019064
Illumina Genome Analyzer II
163X
C01/PG0065-C/ERR019064


3d7_hb3
C01
PG0062-C
ERR019070
Illumina Genome Analyzer II
108X
C01/PG0062-C/ERR019070


3d7_hb3
C02
PG0055-C
ERR019066
Illumina Genome Analyzer II
102X
C02/PG0055-C/ERR019066


3d7_hb3
C02
PG0053-C
ERR019067
Illumina Genome Analyzer II
73X
C02/PG0053-C/ERR019067



...

Genome region accessibility classifications



In [16]:

    
@etlcache
def tbl_regions_1b():
    return (etl
        .fromtsv(os.path.join(PUBLIC_DIR, 'regions-20130225.onebased.txt'))
        .pushheader(['chrom', 'start', 'stop', 'type'])
        .convertnumbers()
        .prefixheader('region_')
        .addfield('region_size', lambda row: row.region_stop - row.region_start + 1)
    )
tbl_regions_1b









    Out[16]:







0|region_chrom
1|region_start
2|region_stop
3|region_type
4|region_size




Pf3D7_01_v3
1
27336
SubtelomericRepeat
27336


Pf3D7_01_v3
27337
92900
SubtelomericHypervariable
65564


Pf3D7_01_v3
92901
457931
Core
365031


Pf3D7_01_v3
457932
460311
Centromere
2380


Pf3D7_01_v3
460312
575900
Core
115589



...

Genome annotations



In [17]:

    
@etlcache
def tbl_features():
    return (etl
        .fromgff3(GFF_FN)
        .unpackdict('attributes', ['ID', 'Parent', 'Name', 'previous_systematic_id'])
        .rename({'seqid': 'chrom', 'end': 'stop', 'ID': 'id', 'Parent': 'parent_id', 'Name': 'name', 'previous_systematic_id': 'previous_id'})
        .cutout('source', 'score', 'phase')
        .addfield('length', lambda row: row.stop - row.start, index=4)
        .intervalleftjoin(tbl_regions_1b, lkey='chrom', rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .distinct('id')
        .sort(key=('chrom', 'start', 'type'))
        .prefixheader('feature_')
    )
tbl_features









    Out[17]:







0|feature_chrom
1|feature_type
2|feature_start
3|feature_stop
4|feature_length
5|feature_strand
6|feature_id
7|feature_parent_id
8|feature_name
9|feature_previous_id
10|feature_region_type
11|feature_region_size




Pf3D7_01_v3
repeat_region
1
360
359
+
Pfalciparum_REP_20
None
None
None
SubtelomericRepeat
27336


Pf3D7_01_v3
repeat_region
361
1418
1057
+
Pfalciparum_REP_15
None
None
None
SubtelomericRepeat
27336


Pf3D7_01_v3
repeat_region
2160
3858
1698
+
Pfalciparum_REP_35
None
None
None
SubtelomericRepeat
27336


Pf3D7_01_v3
repeat_region
8856
9021
165
+
Pfalciparum_REP_5
None
None
None
SubtelomericRepeat
27336


Pf3D7_01_v3
repeat_region
9313
9529
216
+
Pfalciparum_REP_25
None
None
None
SubtelomericRepeat
27336



...



In [18]:

    
lkp_feature = tbl_features.recordlookupone('feature_id')



In [19]:

    
@etlcache
def tbl_genes():
    return (
        tbl_features
        .selectin('feature_type', ['gene', 'pseudogene'])
        .cutout('feature_parent_id')
    )
tbl_genes









    Out[19]:







0|feature_chrom
1|feature_type
2|feature_start
3|feature_stop
4|feature_length
5|feature_strand
6|feature_id
7|feature_name
8|feature_previous_id
9|feature_region_type
10|feature_region_size




Pf3D7_01_v3
gene
29510
37126
7616
+
PF3D7_0100100
VAR
PFA0005w
SubtelomericHypervariable
65564


Pf3D7_01_v3
gene
38982
40207
1225
-
PF3D7_0100200
RIF
PFA0010c
SubtelomericHypervariable
65564


Pf3D7_01_v3
gene
42367
46507
4140
-
PF3D7_0100300
None
PFA0015c
SubtelomericHypervariable
65564


Pf3D7_01_v3
gene
50363
51636
1273
+
PF3D7_0100400
RIF
PFA0020w
SubtelomericHypervariable
65564


Pf3D7_01_v3
pseudogene
53169
53280
111
-
PF3D7_0100500
var pseudogene, exon 1
PFA0025c
SubtelomericHypervariable
65564



...



In [20]:

    
@etlcache
def tbl_intergenic():
    return (
        tbl_genes
        .addfieldusingcontext('next', lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id, nxt.feature_strand) if nxt and cur.feature_chrom == nxt.feature_chrom else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id', 'next_strand'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v, row: 'intergenic %s/%s' % (row.feature_strand, row.next_strand), pass_row=True)
        # reclassify based on expected number of promoters
        .convert('feature_type', {'intergenic +/-': 'intergenic_0', 'intergenic -/+': 'intergenic_2', 'intergenic +/+': 'intergenic_1', 'intergenic -/-': 'intergenic_1'})
        .cutout('feature_strand', 'next_strand', 'feature_name', 'feature_previous_id')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
        .gt('feature_length', 0)
    )

tbl_intergenic









    Out[20]:







0|feature_chrom
1|feature_type
2|feature_start
3|feature_stop
4|feature_length
5|feature_id
6|feature_region_size
7|feature_region_type
8|region_size




Pf3D7_01_v3
intergenic_0
37126
38982
1856
PF3D7_0100100_PF3D7_0100200
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intergenic_1
40207
42367
2160
PF3D7_0100200_PF3D7_0100300
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intergenic_2
46507
50363
3856
PF3D7_0100300_PF3D7_0100400
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intergenic_0
51636
53169
1533
PF3D7_0100400_PF3D7_0100500
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intergenic_1
53280
53778
498
PF3D7_0100500_PF3D7_0100600
65564
SubtelomericHypervariable
65564



...



In [21]:

    
@etlcache
def tbl_exons():
    return (
        tbl_features
        .eq('feature_type', 'CDS')
        .gt('feature_length', 0)
    )

tbl_exons









    Out[21]:







0|feature_chrom
1|feature_type
2|feature_start
3|feature_stop
4|feature_length
5|feature_strand
6|feature_id
7|feature_parent_id
8|feature_name
9|feature_previous_id
10|feature_region_type
11|feature_region_size




Pf3D7_01_v3
CDS
29510
34762
5252
+
PF3D7_0100100.1:exon:1
PF3D7_0100100.1
None
None
SubtelomericHypervariable
65564


Pf3D7_01_v3
CDS
35888
37126
1238
+
PF3D7_0100100.1:exon:2
PF3D7_0100100.1
None
None
SubtelomericHypervariable
65564


Pf3D7_01_v3
CDS
38982
39923
941
-
PF3D7_0100200.1:exon:1
PF3D7_0100200.1
None
None
SubtelomericHypervariable
65564


Pf3D7_01_v3
CDS
40154
40207
53
-
PF3D7_0100200.1:exon:2
PF3D7_0100200.1
None
None
SubtelomericHypervariable
65564


Pf3D7_01_v3
CDS
42367
43617
1250
-
PF3D7_0100300.1:exon:1
PF3D7_0100300.1
None
None
SubtelomericHypervariable
65564



...



In [22]:

    
@etlcache
def tbl_introns():
    return (
        tbl_exons
        .addfieldusingcontext('next', 
                              lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id) if nxt and cur.feature_parent_id == nxt.feature_parent_id else None)
        .notnone('next')
        .unpack('next', ['next_start', 'next_id'])
        .convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
        .cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
        .rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
        .movefield('feature_stop', 3)
        .addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
        .convert('feature_type', lambda v: 'intron')
        .intervalleftjoin(tbl_regions_1b, 
                          lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
                          rkey='region_chrom', rstart='region_start', rstop='region_stop')
        .rename('region_type', 'feature_region_type')
        .cutout('region_chrom', 'region_start', 'region_stop')
    )

tbl_introns









    Out[22]:







0|feature_chrom
1|feature_type
2|feature_start
3|feature_stop
4|feature_length
5|feature_strand
6|feature_id
7|feature_parent_id
8|feature_name
9|feature_previous_id
10|feature_region_size
11|feature_region_type
12|region_size




Pf3D7_01_v3
intron
34762
35888
1126
+
PF3D7_0100100.1:exon:1_PF3D7_0100100.1:exon:2
PF3D7_0100100.1
None
None
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intron
39923
40154
231
-
PF3D7_0100200.1:exon:1_PF3D7_0100200.1:exon:2
PF3D7_0100200.1
None
None
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intron
43617
43775
158
-
PF3D7_0100300.1:exon:1_PF3D7_0100300.1:exon:2
PF3D7_0100300.1
None
None
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intron
50416
50572
156
+
PF3D7_0100400.1:exon:1_PF3D7_0100400.1:exon:2
PF3D7_0100400.1
None
None
65564
SubtelomericHypervariable
65564


Pf3D7_01_v3
intron
54788
54938
150
-
PF3D7_0100600.1:exon:1_PF3D7_0100600.1:exon:2
PF3D7_0100600.1
None
None
65564
SubtelomericHypervariable
65564



...



In [23]:

    
@etlcache
def tbl_gene_aliases():
    return (etl
        .fromtsv(os.path.join(DATA_DIR, 'genome/plasmodb/release-9.3/Pfalciparum3D7/txt/PlasmoDB-9.3_Pfalciparum3D7_GeneAliases.txt'))
        .pushheader(['feature_id'] + list(map(str, range(10))))
        .convert('feature_id', lambda v: v[:13])  # ignore alternate splice forms
        .cat()  # square up short rows
        .melt('feature_id')
        .rename('value', 'feature_alias')
        .cut('feature_id', 'feature_alias')
        .ne('feature_alias', None)
    )

tbl_gene_aliases









    Out[23]:







0|feature_id
1|feature_alias




PF3D7_1314600
814061


PF3D7_1314600
PF13_0083


PF3D7_0209400
8444970


PF3D7_0209400
PF02_0090


PF3D7_0209400
PFB0423c



...

Heterochromatin (Flueck et al. 2009)



In [24]:

    
@etlcache
def tbl_hp1():
    return (etl
        .fromtext(os.path.join(DATA_DIR, 'reference/flueck_2009/journal.ppat.1000569.s008.txt'))
        .rowslice(3, None)
        .ne('lines', '')
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+(\S+)', ['feature_alias'], include_original=1)
        .capture('lines', r'^\S+\s+\d+\s+\d+\s+\S+.*?(-?\d\.\d{3})', ['hp1'], include_original=1)
        .convert('hp1', float)
        .cutout('lines')
        .leftjoin(tbl_gene_aliases, key='feature_alias')
        .ne('feature_id', None)
        .sort('feature_id')
    )

tbl_hp1









    Out[24]:







0|feature_alias
1|hp1
2|feature_id




PFA0005w
4.648
PF3D7_0100100


PFA0010c
3.763
PF3D7_0100200


PFA0015c
5.042
PF3D7_0100300


PFA0020w
3.721
PF3D7_0100400


PFA0030c
4.398
PF3D7_0100600



...



In [ ]:

Plotting



In [25]:

    
def plot_variants_locator(ax, callset, chrom, start=0, stop=None, step=5):
    V, _, _ = unpack_callset(callset)
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    anhima.loc.plot_variant_locator(POS, ax=ax, step=step, start_position=start, stop_position=stop, line_args=dict(color='k'))
    ax.set_xticks([])
    ax.set_xlabel('')



In [26]:

    
INHERITANCE_COLORS = [
    'white', # missing
    'grey', # filtered
    'yellow', # parent missing
    'yellow', # parent filtered
    'black', # non-parental
    'orange', # non-segregating ref 
    'green', # non-segregating alt 
    'blue', # parent 1 
    'red' # parent 2 
]


def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):

    V, C, C2d = unpack_callset(callset)
    samples = C.dtype.names
    
    if stop is None:
        stop = len(fasta[chrom])
    indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
    POS = np.take(V.POS, indices, axis=0)
    G = np.take(C2d.genotype, indices, axis=0)
    P = inheritance(G)
    
    if sample_indices is not None:
        P = np.take(P, sample_indices, axis=1)
        samples = np.take(samples, sample_indices, axis=0) 

    anhima.gt.plot_discrete_calldata(P, labels=samples, colors=colors, states=range(len(colors)), ax=ax, **kwargs)

Gene annotations



In [27]:

    
def get_features(chrom, start=None, stop=None):
    if None not in {start, stop}:
        region = '%s:%s-%s' % (chrom, start, stop)
    else:
        region = chrom
    tbl_features = (etl
        .fromgff3(GFF_FN, region)
        .cutout('source', 'score', 'phase')
        .unpackdict('attributes', ['ID', 'Name', 'previous_systematic_id'], includeoriginal=True)
    )
    tbl_features = tbl_features.setheader(['feature_' + str(f).lower() for f in tbl_features.header()])
    return tbl_features


prog_hv = re.compile('var|rif|stev', flags=re.I) 


def plot_genes(ax, chrom, start=0, stop=None, label_named=False, exclude_labels=[], width=.3, label_ytextpad=.1,
               normal_color='k', hypervariable_color='r', cds_color='gray', divider_linewidth=1,
               annotate_genes=[], annotate_xtext=0, annotate_ytext_fwd=6, annotate_ytext_rev=8, 
               annotate_arrowprops=None, annotate_bbox=None, 
               show_cds=False, gene_labels=dict()):
    if stop is None:
        stop = len(fasta[chrom])
    tbl_local_features = get_features(chrom, start, stop)
    tbl_local_genes = tbl_local_features.selectin('feature_type', {'gene', 'pseudogene'})
    tbl_local_cds = tbl_local_features.eq('feature_type', 'CDS')
    
    # divider
    ax.plot([start, stop], [.5, .5], 'k-', linewidth=divider_linewidth)
    ax.set_ylim(0, 1)
    
    # positive strand
    tbl_fwd_genes = tbl_local_genes.eq('feature_strand', '+')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color 
          for r in tbl_fwd_genes.records()]
    ax.broken_barh(xranges, (.5, width), color=fc)
    if show_cds:
        tbl_fwd_cds = tbl_local_cds.eq('feature_strand', '+')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_cds.records()]
        ax.broken_barh(xranges, (.5, width), color=cds_color)
        
    
    # negative strand
    tbl_rev_genes = tbl_local_genes.eq('feature_strand', '-')
    xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_genes.records()]
    fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color
          for r in tbl_rev_genes.records()]
    ax.broken_barh(xranges, (.5-width, width), color=fc)
    if show_cds:
        tbl_rev_cds = tbl_local_cds.eq('feature_strand', '-')
        xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_cds.records()]
        ax.broken_barh(xranges, (.5-width, width), color=cds_color)
    
    for s in 'top', 'left', 'right':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_yticks([])
    ax.xaxis.tick_bottom()

    # label named genes
    if label_named:
        for g in tbl_local_genes.records():
            if g.feature_name is not None and g.feature_name not in exclude_labels:
                x = g.feature_start
                y = .5 + width + label_ytextpad if g.feature_strand == '+' else .5 - width - label_ytextpad
                va = 'bottom' if g.feature_strand == '+' else 'top'
                l = g.feature_name
                ax.text(x, y, l, rotation=0, ha='left', va=va, transform=ax.transData)
                
    # annotate specific genes
    if annotate_arrowprops is None:
        annotate_arrowprops = dict(facecolor='k', arrowstyle='-', shrinkA=0, shrinkB=0)
    if annotate_bbox is None:
        annotate_bbox = dict(facecolor='w', alpha=.9)
    for id in annotate_genes:
        g = lkp_feature[id]
        gene_id = g['feature_id']
        l = gene_labels[gene_id] if gene_id in gene_labels else g['feature_name'] if g['feature_name'] is not None else gene_id
        if g['feature_strand'] == '+':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_fwd), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)
        elif g['feature_strand'] == '-':
            ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_rev), 
                        xycoords='data', textcoords='offset points', 
                        arrowprops=annotate_arrowprops,
                        bbox=annotate_bbox)

Accessibility



In [28]:

    
CNV_RUNS = {
    '3d7_hb3': ['ERR019061', 'ERR019054'],
    'hb3_dd2': ['ERR012788', 'ERR012840', 'ERR019054'],  # share HB3s
    '7g8_gb4': ['ERR027099', 'ERR027100']
}


accessibility_colors = {
    'Core': 'white', 
    'SubtelomericRepeat': 'yellow',
    'SubtelomericHypervariable': '#d73027',
    'InternalHypervariable': '#fc8d59',
    'Centromere': 'black',
    'CNV': '#91bfdb',
}


def plot_accessibility(ax, chrom, cross=None, start=0, stop=None, linewidth=0.5, **kwargs):
    tbl_lreg = tbl_regions_1b.eq('region_chrom', chrom)
    if stop is None:
        stop = len(fasta[chrom])

    # plot core
    xranges = tbl_lreg.eq('region_type', 'Core').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Core'], 
                   linewidth=linewidth, **kwargs)

    # plot centromere
    xranges = tbl_lreg.eq('region_type', 'Centromere').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Centromere'], 
                   linewidth=linewidth, **kwargs)

    # plot subtelomeric repeat
    xranges = tbl_lreg.eq('region_type', 'SubtelomericRepeat').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericRepeat'], 
                   linewidth=linewidth, **kwargs)
    
    # plot hypervariable
    xranges = tbl_lreg.eq('region_type', 'SubtelomericHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericHypervariable'], 
                   linewidth=linewidth, **kwargs)
    xranges = tbl_lreg.eq('region_type', 'InternalHypervariable').values(['region_start', 'region_size'])
    ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['InternalHypervariable'], 
                   linewidth=linewidth, **kwargs)
    
    if cross is not None:
        # plot CNVs
        tbls_cnvs = [etl.fromtsv(CNV_TBL_FN_TEMPLATE.format(run=run)).convertnumbers() for run in CNV_RUNS[cross]]
        tbl_cnvs = (etl
            .cat(*tbls_cnvs)
            .gt('cn', 1)
            .eq('chrom', chrom)
            .intervalleftjoin(tbl_lreg, lkey='chrom', rkey='region_chrom', 
                              lstart='start', lstop='stop', 
                              rstart='region_start', rstop='region_stop')
            .eq('region_type', 'Core')
        )
        xranges = tbl_cnvs.values(['start', 'size'])
        ax.broken_barh(xranges, (.1, .8), color=accessibility_colors['CNV'], 
                       linewidth=linewidth, **kwargs)
    
    # tidy up
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_xlim(start, stop)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.xaxis.tick_bottom()
    ax.set_ylim(0, 1)

Heterochromatin



In [29]:

    
def plot_hp1(ax, chrom, start=0, stop=None):

    if stop is None:
        stop = len(fasta[chrom])
        
    tbl_genes_local = get_features(chrom, start, stop).selectin('feature_type', {'gene', 'pseudogene'})
    A = tbl_genes_local.cut('feature_start', 'feature_end', 'feature_id').join(tbl_hp1, key='feature_id').torecarray()
    left = A.feature_start
    height = A.hp1
    width = A.feature_end - A.feature_start
    ax.bar(left, height, width, facecolor='purple', linewidth=1, edgecolor='purple')
    
    ax.set_xlim(start, stop)
    for s in 'top', 'left', 'right', 'bottom':
        ax.spines[s].set_visible(False)
    ax.set_yticks([])
    ax.set_ylim(-1, 7)
    ax.set_xticks([])
#    ax.plot([start, stop], [0, 0], color='k', linestyle='-')

Inheritance



In [30]:

    
# INHERITANCE_COLORS = [
#     'white', # missing
#     'grey', # filtered
#     'yellow', # parent missing
#     'yellow', # parent filtered
#     'black', # non-parental
#     'orange', # non-segregating ref 
#     'green', # non-segregating alt 
#     'blue', # parent 1 
#     'red' # parent 2 
# ]


# def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):

#     V, C, C2d = unpack_callset(callset)
#     samples = C.dtype.names
    
#     if stop is None:
#         stop = len(fasta[chrom])
#     indices = np.nonzero((V.CHROM == chrom) & (V.POS >= start) & (V.POS <= stop))[0]
#     POS = np.take(V.POS, indices, axis=0)
#     G = np.take(C2d.genotype, indices, axis=0)
#     P = inheritance(G)
    
#     if sample_indices is not None:
#         P = np.take(P, sample_indices, axis=1)
#         samples = np.take(samples, sample_indices, axis=0) 

#     vcfplt.discrete_calldata_colormesh(P, labels=samples, colors=colors, states=range(len(colors)), alpha=alpha, ax=ax, **kwargs)

Recombination



In [31]:

    
def marker_span(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    x = POS[-1] - POS[0]
    return x


def marker_offset(callset, chrom):
    V, _, _ = unpack_callset(callset)
    start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
    POS = V.POS[start_index:stop_index]
    return POS[0]


def locate_flanking_markers(CHROM, POS, chrom, pos):
    chrom_start_index, chrom_stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
    POS = POS[chrom_start_index:chrom_stop_index]
    
    start_index = bisect_left(POS, pos) - 1
    stop_index = bisect_right(POS, pos)
    
    if start_index < 0:
        start_index = None
    else:
        start_index += chrom_start_index
        
    if stop_index >= len(POS):
        stop_index = None
    else:
        stop_index += chrom_start_index
    
    return start_index, stop_index
    

def simulate_random_uniform_crossovers(n_events, callset, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 'co_pos', 'co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)
    
    # map each event onto a chromosome and position
    for x in X:

        # which chromosome?
        chrom_index = bisect_left(cumulative_marker_span, x)
        chrom = chromosomes[chrom_index]

        # which position?
        if chrom_index > 0:
            pos_relative = x - cumulative_marker_span[chrom_index-1]
        else:
            pos_relative = x
        pos = pos_relative + marker_offsets[chrom_index]
        
        # find flanking markers
        lidx, ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, pos)

        pos_min = V.POS[lidx]
        pos_max = V.POS[ridx]
        pos_mid = (pos_min + pos_max)//2
        pos_range = pos_max - pos_min
        
        row = [chrom, pos, pos_mid, pos_min, pos_max, pos_range]
        tbl += [row]
        
    return etl.wrap(tbl)


def simulate_random_uniform_non_crossovers(n_events, callset, phi, chromosomes=CHROMOSOMES):
    
    # table of results
    tbl = [['chrom', 
            'tract_start', 
            'tract_stop', 
            'tract_length',
            'tract_start_min', 
            'tract_start_mid', 
            'tract_start_max', 
            'tract_stop_min', 
            'tract_stop_mid', 
            'tract_stop_max', 
            'tract_length_min', 
            'tract_length_mid', 
            'tract_length_max', 
            'tract_support']]
    
    # utility variables
    V = callset['variants']
    marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
    marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
    cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
    total_marker_span = np.sum(marker_spans)
    
    # generate random integers over the total marker span
    X = np.random.randint(low=0, high=total_marker_span, size=n_events)

    # generate random geometric tract lengths
    lengths = stats.geom.rvs(1-phi, size=n_events).astype('int32')
    
    # map each event onto a chromosome and position
    for x, length in zip(X, lengths):

        try:
            # which chromosome?
            chrom_index = bisect_left(cumulative_marker_span, x)
            chrom = chromosomes[chrom_index]

            # which start position?
            if chrom_index > 0:
                pos_relative = x - cumulative_marker_span[chrom_index-1]
            else:
                pos_relative = x
            start = pos_relative + marker_offsets[chrom_index]
            stop = start + length

            # find flanking markers
            start_lidx, start_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, start)
            stop_lidx, stop_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, stop)
            
            if None in {start_lidx, start_ridx, stop_lidx, stop_ridx}:
                
                # falls off edge of marker span
                support = start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None

            else:

                support = stop_ridx - start_ridx

                # do we even observe the event?
                if support == 0:
                    
                    start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None

                else:

                    start_min = V.POS[start_lidx]
                    start_max = V.POS[start_ridx]
                    start_mid = (start_min + start_max)//2
                    stop_min = V.POS[stop_lidx]
                    stop_max = V.POS[stop_ridx]
                    stop_mid = (stop_min + stop_max)//2
                    length_min = stop_min - start_max
                    length_max = stop_max - start_min
                    length_mid = (length_min + length_max)//2

            row = [chrom, 
                   start, 
                   stop, 
                   length,
                   start_min, 
                   start_mid, 
                   start_max, 
                   stop_min, 
                   stop_mid, 
                   stop_max, 
                   length_min, 
                   length_mid, 
                   length_max, 
                   support]
            tbl += [row]
    
        except Exception as e:
            log(e)
            log(traceback.format_exc(10))
                    
    return etl.wrap(tbl)

Optimisations



In [32]:

    
%reload_ext Cython



In [33]:

    
%%cython

import numpy as np
cimport numpy as np
import petl as etl
import vcfnp


CHROMOSOMES = [('Pf3D7_%02d_v3' % n).encode('ascii') for n in range(1, 15)]

# constants to represent the possible outcomes when attempting to determine the inheritance for any given variant call
INHERITANCE_MISSING = 0  # genotype call is missing
INHERITANCE_FILTERED = 1  # genotype call is filtered
INHERITANCE_PARENT_MISSING = 2  # genotype is called but cannot be phased because one or both parents is missing
INHERITANCE_PARENT_FILTERED = 3  # genotype is called but cannot be phased because one or both parents is filtered
INHERITANCE_NONPARENTAL = 4  # genotype call does not correspond with either parent (i.e., mendelian error sensu strictu)
INHERITANCE_NONSEGREGATING_REF = 5  # genotype call is ref and parents both have reference allele
INHERITANCE_NONSEGREGATING_ALT = 6  # genotype call is alt and parents both have alternate allele
INHERITANCE_PARENT1 = 7  # genotype call corresponds with first parent
INHERITANCE_PARENT2 = 8  # genotype call corresponds with second parent


# functions to determine inheritance states
###########################################


def inheritance(np.ndarray[np.int8_t, ndim=2] G):
    cdef int m = G.shape[0] # number of variants
    cdef int n = G.shape[1] # number of samples
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = np.zeros((m,n), dtype='u1')
    for i in range(m):
        for j in range(n):
            # N.B., parents are *first* two samples
            P[i,j] = inheritance_state(G[i,j], G[i,0], G[i,1])
    return P


cdef int inheritance_state(int genotype_child, int genotype_parent1, int genotype_parent2):
    if genotype_child == -1:
        return INHERITANCE_MISSING
    elif genotype_child == -2:
        return INHERITANCE_FILTERED
    elif genotype_parent1 == -1 or genotype_parent2 == -1:
        return INHERITANCE_PARENT_MISSING
    elif genotype_parent1 == -2 or genotype_parent2 == -2:
        return INHERITANCE_PARENT_FILTERED
    elif genotype_child != genotype_parent1 and genotype_child != genotype_parent2:
        return INHERITANCE_NONPARENTAL
    elif genotype_child == genotype_parent1 == genotype_parent2 == 0:
        return INHERITANCE_NONSEGREGATING_REF
    elif genotype_child == genotype_parent1 == genotype_parent2 and genotype_child > 0:
        return INHERITANCE_NONSEGREGATING_ALT
    elif genotype_child == genotype_parent1:
        return INHERITANCE_PARENT1
    elif genotype_child == genotype_parent2:
        return INHERITANCE_PARENT2
    else:
        return -1 # should never be reached


# functions to calculate haplotype statistics
#############################################


def haplotypes_chrom(V, C, chrom):
    """
    Construct arrays of haplotype length and haplotype support for all variant calls.

    """

    tbl = list()
    header = 'sample', 'chrom', 'start_min', 'start_mid', 'start_max', 'stop_min', 'stop_mid', 'stop_max', 'length_min', \
             'length_mid', 'length_max', 'support', 'prv_inheritance', 'inheritance', 'nxt_inheritance'
    tbl.append(header)

    cdef int i, j, M, N, prv_idx, start_idx, stop_idx, start_min, start_mid, start_max, stop_min, stop_mid, \
        stop_max, length_min, length_mid, length_max, support, prv, cur

    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)

    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V = np.take(V, indices, axis=0)
    C2d = np.take(C2d, indices, axis=0)
    M, N = C2d.shape
    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)  # inheritance
#    print M, N

    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']

    cdef np.ndarray[np.int32_t, ndim=2] HL
    cdef np.ndarray[np.int32_t, ndim=2] HS
    HL = np.zeros((M, N), dtype='i4') # haplotype lengths (minimum)
    HS = np.zeros((M, N), dtype='i4') # haplotype support - number of calls supporting the haplotype

    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    cdef np.ndarray[np.int32_t, ndim=2] haplotype_start_idxs
    cdef np.ndarray[np.int32_t, ndim=1] last_haplotype_inheritance
    previous_call_idx = np.array([-1] * N, dtype='i4') # store index of last good call
    haplotype_start_idxs = np.array([[-1, -1]] * N, dtype='i4') # store indices of haplotype start (flanking markers)
    last_haplotype_inheritance = np.array([-1] * N, dtype='i4') # store parental inheritance of upstream block

    # iterate over variants
    for i in range(M):
        # iterate over samples
        for j in range(N):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call, start a haplotype
                    haplotype_start_idxs[j] = (i, i)
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # inheritance switch, haplotype has ended
                        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
                        stop_min_idx, stop_max_idx = prv_idx, i
                        start_min = POS[start_min_idx]
                        start_max = POS[start_max_idx]
                        start_mid = (start_max + start_min) // 2
                        stop_min = POS[stop_min_idx]
                        stop_max = POS[stop_max_idx]
                        stop_mid = (stop_max + stop_min) // 2
                        length_min = stop_min - start_max
                        length_max = stop_max - start_min
                        length_mid = stop_mid - start_mid
                        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == prv)
                        # assign results
                        HL[start_max_idx:stop_min_idx+1, j] = length_min
                        HS[start_max_idx:stop_min_idx+1, j] = support
                        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, \
                              length_min, length_mid, length_max, support, last_haplotype_inheritance[j], prv, cur
                        tbl.append(row)
                        # start new haplotype
                        haplotype_start_idxs[j] = (prv_idx, i)
                        last_haplotype_inheritance[j] = prv
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i

    # final haplotypes ending at chromosome end
    for j in range(N):
        start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
        stop_min_idx, stop_max_idx = prv_idx, prv_idx  # nothing subsequent
        start_min = POS[start_min_idx]
        start_max = POS[start_max_idx]
        start_mid = (start_max + start_min) // 2
        stop_min = POS[stop_min_idx]
        stop_max = POS[stop_max_idx]
        stop_mid = (stop_max + stop_min) // 2
        length_min = stop_min - start_max
        length_max = stop_max - start_min
        length_mid = stop_mid - start_mid
        inherit = P[prv_idx, j]
        support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == inherit)
        # assign results
        HL[start_max_idx:stop_min_idx+1, j] = length_min
        HS[start_max_idx:stop_min_idx+1, j] = support
        row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, length_min, length_mid, \
              length_max, support, last_haplotype_inheritance[j], inherit, -1
        tbl.append(row)

    return HL, HS, etl.wrap(tbl)


def haplotypes(V, C):
    stats = [haplotype_stats_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    HL = np.vstack(s[0] for s in stats)
    HS = np.vstack(s[1] for s in stats)
    tbl = etl.cat(*[s[2] for s in stats])
    return HL, HS, tbl


# backwards compatibility
haplotype_stats_chrom = haplotypes_chrom
haplotype_stats = haplotypes


# functions to tabulate recombination events
############################################


def tabulate_switches_chrom(V, C, chrom):
    tbl = list()
    header = 'sample', 'chrom', 'pos', 'lpos', 'rpos', 'range', 'from', 'to'
    tbl.append(header)

    cdef int i, j, m, n, prv_idx, cur, prv

    samples = C.dtype.names
    C2d = vcfnp.view2d(C).view(np.recarray)

    # only deal with one chromosome, make life simpler
    flt = V.CHROM == chrom
    indices = np.nonzero(flt)[0]
    V, C, C2d = [np.take(X, indices, axis=0) for X in [V, C, C2d]]
    m, n = C2d.shape

    cdef np.ndarray[np.uint8_t, ndim=2] P
    P = inheritance(C2d.genotype)

    cdef np.ndarray[np.int32_t, ndim=1] POS
    POS = V['POS']

    cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
    previous_call_idx = np.array([-1] * n, dtype='i4') # store index of last good call

    # iterate over variants
    for i in range(m):
        # iterate over samples
        for j in range(n):
            # extract genotype call for current variant/sample
            cur = P[i, j]
            if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
                # index of last good call for this sample
                prv_idx = previous_call_idx[j]
                if prv_idx < 0:
                    # this is the first good call
                    pass
                else:
                    # extract last good genotype call for current sample
                    prv = P[prv_idx, j]
                    if cur != prv:
                        # phase switch
                        lpos = POS[prv_idx]
                        rpos = POS[i]
                        row = samples[j], chrom, (lpos + rpos) // 2, lpos, rpos, rpos-lpos, prv, cur
                        tbl.append(row)
                    else:
                        # haplotype continues
                        pass
                # store last good call index
                previous_call_idx[j] = i

    return etl.wrap(tbl)


def tabulate_switches(V, C):
    tbls = [tabulate_switches_chrom(V, C, chrom) for chrom in CHROMOSOMES]
    return etl.cat(*tbls)



In [ ]:



In [ ]:

0\|cross	1\|clone	2\|sample	3\|run	4\|instrument	5\|coverage	6\|ID
3d7_hb3	3D7	PG0051-C	ERR019061	Illumina Genome Analyzer II	122X	3D7/PG0051-C/ERR019061
3d7_hb3	C01	PG0065-C	ERR019064	Illumina Genome Analyzer II	163X	C01/PG0065-C/ERR019064
3d7_hb3	C01	PG0062-C	ERR019070	Illumina Genome Analyzer II	108X	C01/PG0062-C/ERR019070
3d7_hb3	C02	PG0055-C	ERR019066	Illumina Genome Analyzer II	102X	C02/PG0055-C/ERR019066
3d7_hb3	C02	PG0053-C	ERR019067	Illumina Genome Analyzer II	73X	C02/PG0053-C/ERR019067

0\|region_chrom	1\|region_start	2\|region_stop	3\|region_type	4\|region_size
Pf3D7_01_v3	1	27336	SubtelomericRepeat	27336
Pf3D7_01_v3	27337	92900	SubtelomericHypervariable	65564
Pf3D7_01_v3	92901	457931	Core	365031
Pf3D7_01_v3	457932	460311	Centromere	2380
Pf3D7_01_v3	460312	575900	Core	115589

0\|feature_chrom	1\|feature_type	2\|feature_start	3\|feature_stop	4\|feature_length	5\|feature_strand	6\|feature_id	7\|feature_parent_id	8\|feature_name	9\|feature_previous_id	10\|feature_region_type	11\|feature_region_size
Pf3D7_01_v3	repeat_region	1	360	359	+	Pfalciparum_REP_20	None	None	None	SubtelomericRepeat	27336
Pf3D7_01_v3	repeat_region	361	1418	1057	+	Pfalciparum_REP_15	None	None	None	SubtelomericRepeat	27336
Pf3D7_01_v3	repeat_region	2160	3858	1698	+	Pfalciparum_REP_35	None	None	None	SubtelomericRepeat	27336
Pf3D7_01_v3	repeat_region	8856	9021	165	+	Pfalciparum_REP_5	None	None	None	SubtelomericRepeat	27336
Pf3D7_01_v3	repeat_region	9313	9529	216	+	Pfalciparum_REP_25	None	None	None	SubtelomericRepeat	27336

0\|feature_chrom	1\|feature_type	2\|feature_start	3\|feature_stop	4\|feature_length	5\|feature_strand	6\|feature_id	7\|feature_name	8\|feature_previous_id	9\|feature_region_type	10\|feature_region_size
Pf3D7_01_v3	gene	29510	37126	7616	+	PF3D7_0100100	VAR	PFA0005w	SubtelomericHypervariable	65564
Pf3D7_01_v3	gene	38982	40207	1225	-	PF3D7_0100200	RIF	PFA0010c	SubtelomericHypervariable	65564
Pf3D7_01_v3	gene	42367	46507	4140	-	PF3D7_0100300	None	PFA0015c	SubtelomericHypervariable	65564
Pf3D7_01_v3	gene	50363	51636	1273	+	PF3D7_0100400	RIF	PFA0020w	SubtelomericHypervariable	65564
Pf3D7_01_v3	pseudogene	53169	53280	111	-	PF3D7_0100500	var pseudogene, exon 1	PFA0025c	SubtelomericHypervariable	65564

0\|feature_chrom	1\|feature_type	2\|feature_start	3\|feature_stop	4\|feature_length	5\|feature_id	6\|feature_region_size	7\|feature_region_type	8\|region_size
Pf3D7_01_v3	intergenic_0	37126	38982	1856	PF3D7_0100100_PF3D7_0100200	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intergenic_1	40207	42367	2160	PF3D7_0100200_PF3D7_0100300	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intergenic_2	46507	50363	3856	PF3D7_0100300_PF3D7_0100400	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intergenic_0	51636	53169	1533	PF3D7_0100400_PF3D7_0100500	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intergenic_1	53280	53778	498	PF3D7_0100500_PF3D7_0100600	65564	SubtelomericHypervariable	65564

0\|feature_chrom	1\|feature_type	2\|feature_start	3\|feature_stop	4\|feature_length	5\|feature_strand	6\|feature_id	7\|feature_parent_id	8\|feature_name	9\|feature_previous_id	10\|feature_region_type	11\|feature_region_size
Pf3D7_01_v3	CDS	29510	34762	5252	+	PF3D7_0100100.1:exon:1	PF3D7_0100100.1	None	None	SubtelomericHypervariable	65564
Pf3D7_01_v3	CDS	35888	37126	1238	+	PF3D7_0100100.1:exon:2	PF3D7_0100100.1	None	None	SubtelomericHypervariable	65564
Pf3D7_01_v3	CDS	38982	39923	941	-	PF3D7_0100200.1:exon:1	PF3D7_0100200.1	None	None	SubtelomericHypervariable	65564
Pf3D7_01_v3	CDS	40154	40207	53	-	PF3D7_0100200.1:exon:2	PF3D7_0100200.1	None	None	SubtelomericHypervariable	65564
Pf3D7_01_v3	CDS	42367	43617	1250	-	PF3D7_0100300.1:exon:1	PF3D7_0100300.1	None	None	SubtelomericHypervariable	65564

0\|feature_chrom	1\|feature_type	2\|feature_start	3\|feature_stop	4\|feature_length	5\|feature_strand	6\|feature_id	7\|feature_parent_id	8\|feature_name	9\|feature_previous_id	10\|feature_region_size	11\|feature_region_type	12\|region_size
Pf3D7_01_v3	intron	34762	35888	1126	+	PF3D7_0100100.1:exon:1_PF3D7_0100100.1:exon:2	PF3D7_0100100.1	None	None	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intron	39923	40154	231	-	PF3D7_0100200.1:exon:1_PF3D7_0100200.1:exon:2	PF3D7_0100200.1	None	None	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intron	43617	43775	158	-	PF3D7_0100300.1:exon:1_PF3D7_0100300.1:exon:2	PF3D7_0100300.1	None	None	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intron	50416	50572	156	+	PF3D7_0100400.1:exon:1_PF3D7_0100400.1:exon:2	PF3D7_0100400.1	None	None	65564	SubtelomericHypervariable	65564
Pf3D7_01_v3	intron	54788	54938	150	-	PF3D7_0100600.1:exon:1_PF3D7_0100600.1:exon:2	PF3D7_0100600.1	None	None	65564	SubtelomericHypervariable	65564

0\|feature_id	1\|feature_alias
PF3D7_1314600	814061
PF3D7_1314600	PF13_0083
PF3D7_0209400	8444970
PF3D7_0209400	PF02_0090
PF3D7_0209400	PFB0423c

0\|feature_alias	1\|hp1	2\|feature_id
PFA0005w	4.648	PF3D7_0100100
PFA0010c	3.763	PF3D7_0100200
PFA0015c	5.042	PF3D7_0100300
PFA0020w	3.721	PF3D7_0100400
PFA0030c	4.398	PF3D7_0100600