This notebook contains imports, constants and functions which are shared across all other notebooks containing analyses for both the main body of the paper and the supplementary information.
In [1]:
# python standard library
import sys
# ensure Python 3
assert sys.version_info[0:2] == (3, 5)
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
from bisect import bisect_left, bisect_right
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import re
In [2]:
# docker image
print('docker image', os.environ['DOCKER_IMAGE'])
In [3]:
%%HTML
<style type="text/css">
.container {
width: 96%;
}
#maintoolbar {
display: none;
}
#header-container {
display: none;
}
#notebook {
padding-top: 0;
}
</style>
In [4]:
# general purpose third party packages
import cython
#%load_ext Cython
import numpy as np
nnz = np.count_nonzero
import scipy
import scipy.stats
import scipy.spatial.distance
from scipy import stats
from scipy.spatial.distance import pdist, squareform
import numexpr
import h5py
import tables
import bcolz
import pandas
import IPython
from IPython.display import clear_output, display, HTML
import rpy2
import rpy2.robjects as ro
%reload_ext rpy2.ipython
import statsmodels
import sklearn
import sklearn.decomposition
import sklearn.manifold
import sh
import sqlalchemy
import pymysql
import psycopg2
import petl as etl
etl.config.display_index_header = True
import humanize
from humanize import naturalsize, intcomma, intword
In [5]:
# plotting setup
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import matplotlib_venn as venn
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')
sns.set_style('ticks')
rcParams = plt.rcParams
rcParams['font.size'] = 9
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 9
rcParams['axes.linewidth'] = .5
rcParams['lines.linewidth'] = .5
rcParams['patch.linewidth'] = .5
rcParams['font.family'] = 'arial'
rcParams['ytick.direction'] = 'out'
rcParams['xtick.direction'] = 'out'
rcParams['savefig.jpeg_quality'] = 100
rcParams['savefig.dpi'] = 120
rcParams['lines.markeredgewidth'] = .5
golden_ratio = 1.61803398875
In [6]:
# bio third party packages
import Bio
import pyfasta
import pysam
import pysamstats
import petlx
import petlx.bio
import vcf
import vcfnp
import anhima
import allel
In [7]:
def log(*msg):
msg = str(datetime.datetime.now()) + ' :: ' + ' '.join(map(str, msg))
print(msg, file=sys.stderr)
sys.stderr.flush()
from IPython.display import display_html
def pre(msg):
display_html('<pre style="line-height: 100%; font-size: .7em; display: inline-block; white-space: pre; background-color: #ff6; border: 0px solid #ddd; padding: 1px">' + str(msg).replace('<', '<') + '</pre>', raw=True)
def html(content):
display_html(content, raw=True)
In [8]:
CROSSES = '3d7_hb3', 'hb3_dd2', '7g8_gb4'
CHROMOSOMES = tuple('Pf3D7_{:02d}_v3'.format(n).encode('ascii') for n in range(1, 15))
PARENT_RUNS = 'ERR019061', 'ERR019054', 'ERR012788', 'ERR012840', 'ERR027099', 'ERR027100'
LABELS = {
'3d7_hb3': '3D7 x HB3',
'hb3_dd2': 'HB3 x Dd2',
'7g8_gb4': '7G8 x GB4',
'ERR019061': '3D7',
'ERR019054': 'HB3(1)',
'ERR012788': 'HB3(2)',
'ERR012840': 'Dd2',
'ERR027099': '7G8',
'ERR027100': 'GB4'
}
In [9]:
# N.B., change this if data are stored at a different location
BASE_DIR = '/data/plasmodium/pfalciparum/pf-crosses'
DATA_DIR = os.path.join(BASE_DIR, 'data')
RELEASE_DIR = os.path.join(DATA_DIR, 'release')
PUBLIC_DIR = os.path.join(DATA_DIR, 'public/20141022')
# public data
GATK_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.vcf.gz')
GATK_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.gatk.final.npz')
CORTEX_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.vcf.gz')
CORTEX_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.final.npz')
COMBINED_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.vcf.gz')
COMBINED_CALLSET_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.combined.final.npz')
CORTEX_RAW_VCF_FN_TEMPLATE = os.path.join(PUBLIC_DIR, '{cross}.cortex.raw.vcf.gz')
BAM_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'bam/{run}.realigned.bam')
COVERAGE_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.coverage_ext_binned.300.{chrom}.npy')
CNV_TBL_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.txt')
CNV_FN_TEMPLATE = os.path.join(PUBLIC_DIR, 'cnv/{run}.cnv.300.{chrom}.npy')
# reference data
GENOME_DIR = os.path.join(DATA_DIR, 'genome/sanger/version3/September_2012')
FASTA_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.fa')
GFF_FN = os.path.join(GENOME_DIR, 'Pf3D7_v3.sorted.ascii.gff.gz')
fasta = pyfasta.Fasta(FASTA_FN)
In [10]:
dup_samples = {
'3d7_hb3': ['C01/PG0062-C/ERR019070',
'C02/PG0053-C/ERR019067',
'C02/PG0055-C/ERR019066',
'C02/PG0056-C/ERR019068'],
'hb3_dd2': [],
'7g8_gb4': ['AUD/PG0112-CW/ERR045639',
'JC9/PG0111-C/ERR029409',
'JE11/PG0100-CW/ERR045630',
'JF6/PG0079-CW/ERR045637',
'KB8/PG0104-CW/ERR045642',
'LA10/PG0086-CW/ERR045629',
'NIC/PG0095-C/ERR027107',
'QF5/PG0078-CW/ERR045638',
'XD8/PG0105-CW/ERR045628',
'XF12/PG0102-CW/ERR045635'],
}
excessive_recomb_samples = {
'3d7_hb3': [],
'hb3_dd2': ['SC01/PG0025-C/ERR019045'],
'7g8_gb4': ['D2/PG0094-CW/ERR045632'],
}
In [11]:
def interleave(a, b):
c = np.empty((a.size + b.size,), dtype=a.dtype)
c[0::2] = a
c[1::2] = b
return c
def unpack_callset(callset):
V = callset['variants'].view(np.recarray)
C = callset['calldata'].view(np.recarray)
C2d = vcfnp.view2d(C).view(np.recarray)
return V, C, C2d
def filter_variants(callset, query=None, verbose=True):
if query is None:
return callset
else:
V, C, _ = unpack_callset(callset)
if isinstance(query, str):
flt = numexpr.evaluate(query, local_dict=V)
elif callable(query):
flt = query(*unpack_callset(callset))
else:
flt = np.asarray(query)
indices = np.nonzero(flt)[0]
ret, exc, tot = indices.size, V.size-indices.size, V.size
if verbose:
log('filter variants: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'variants')
V = np.take(V, indices, axis=0).view(np.recarray)
C = np.take(C, indices, axis=0).view(np.recarray)
return {'variants': V, 'calldata': C}
def filter_calls(callset, query=None, verbose=True):
if query is None:
return callset
else:
V, C, _ = unpack_callset(callset)
C = C.copy()
C2d = vcfnp.view2d(C).view(np.recarray)
# filter calls
if isinstance(query, str):
flt = numexpr.evaluate(query, local_dict=C2d)
elif callable(query):
flt = query(*unpack_callset(callset))
else:
flt = np.asarray(query)
# only filter non-missing
excl = (C2d.genotype >= 0) & ~flt
tot, exc, ret = excl.size, np.count_nonzero(excl), np.count_nonzero(~excl)
if verbose:
log('filter calls: excluding %s (%.1f%%)' % (exc, exc*100./tot), 'retaining %s (%.1f%%)' % (ret, ret*100./tot), 'of', tot, 'calls')
C2d.genotype[excl] = -2
return {'variants': V, 'calldata': C}
def filter_samples(callset, inclusions=None, exclusions=None, verbose=True):
if inclusions or exclusions:
V, C, _ = unpack_callset(callset)
samples = C.dtype.names
if inclusions:
exclusions = [s for s in samples if s not in inclusions]
elif exclusions:
inclusions = [s for s in samples if s not in exclusions]
if verbose:
log('filter samples: excluding', exclusions, 'including', inclusions)
C = C[inclusions].copy()
return {'variants': V, 'calldata': C}
else:
return callset
def filter_callset(callset,
variant_filter=None,
call_filter=None,
sample_inclusions=None,
sample_exclusions=None,
verbose=True):
callset = filter_variants(callset, variant_filter, verbose=verbose)
callset = filter_samples(callset, inclusions=sample_inclusions, exclusions=sample_exclusions, verbose=verbose)
callset = filter_calls(callset, call_filter, verbose=verbose)
return callset
def load_callset(callset_fn,
variant_filter=None,
call_filter=None,
sample_inclusions=None,
sample_exclusions=None,
verbose=True):
if verbose:
log('loading', callset_fn)
callset = np.load(callset_fn)
callset = filter_callset(callset,
variant_filter=variant_filter,
call_filter=call_filter,
sample_inclusions=sample_inclusions,
sample_exclusions=sample_exclusions,
verbose=verbose)
return callset
def filter_callsets(callsets,
crosses=CROSSES,
variant_filter=None,
call_filter=None,
sample_inclusions=dict(),
sample_exclusions=dict(),
verbose=True):
out = dict()
for cross in callsets.keys():
out[cross] = filter_callset(callsets[cross],
variant_filter=variant_filter,
call_filter=call_filter,
sample_inclusions=sample_inclusions.get(cross, None),
sample_exclusions=sample_exclusions.get(cross, None),
verbose=verbose)
return out
def load_callsets(callset_fn_template,
crosses=CROSSES,
variant_filter=None,
call_filter=None,
sample_inclusions=dict(),
sample_exclusions=dict(),
verbose=True):
callsets = dict()
for cross in CROSSES:
callset_fn = callset_fn_template.format(cross=cross)
callsets[cross] = load_callset(callset_fn,
variant_filter=variant_filter,
call_filter=call_filter,
sample_inclusions=sample_inclusions.get(cross, None),
sample_exclusions=sample_exclusions.get(cross, None),
verbose=verbose)
return callsets
def locate_variants(CHROM, POS, chrom, start=None, stop=None):
if isinstance(chrom, str):
chrom = chrom.encode('ascii')
start_index, stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
POS = POS[start_index:stop_index]
if stop is not None:
stop_index = start_index + bisect_right(POS, stop)
if start is not None:
start_index = start_index + bisect_left(POS, start)
return start_index, stop_index
def min_haplen_calls(min_haplen):
def _f(v, c, c2d):
hl, hs, _ = haplotypes(v, c)
return hl >= min_haplen
return _f
def gatk_conf_calls(V, C, C2d):
return C2d.GQ >= 99
def cortex_conf_calls(V, C, C2d):
return C2d.GT_CONF >= 50
def combined_conf_calls(v, c, c2d):
return ((v.set[:,np.newaxis] == b'Cortex') & (c2d.GT_CONF >= 50)
| (v.set[:,np.newaxis] != b'Cortex') & (c2d.GQ >= 99))
In [12]:
def display_with_nrows(tbl, n=5, caption=''):
caption += ' (%s rows)' % tbl.nrows()
tbl.display(n, caption=caption)
In [13]:
CACHE_DIR = os.path.join(PUBLIC_DIR, 'cache')
In [14]:
def etlcache(f):
fn = os.path.join(CACHE_DIR, f.__name__)
if not os.path.exists(fn):
etl.topickle(f(), fn)
return etl.frompickle(fn)
def pdcache(f):
fn = os.path.join(CACHE_DIR, f.__name__)
if not os.path.exists(fn):
obj = f()
pandas.to_pickle(obj, fn)
return obj
else:
return pandas.read_pickle(fn)
def nocache(f):
fn = os.path.join(CACHE_DIR, f.__name__)
if os.path.exists(fn):
os.remove(fn)
return f()
In [15]:
tbl_samples = (etl
.fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
.addfield('ID', lambda row: '%s/%s/%s' % (row.clone, row.sample, row.run))
)
run2id = tbl_samples.lookupone('run', 'ID')
tbl_samples
Out[15]:
In [16]:
@etlcache
def tbl_regions_1b():
return (etl
.fromtsv(os.path.join(PUBLIC_DIR, 'regions-20130225.onebased.txt'))
.pushheader(['chrom', 'start', 'stop', 'type'])
.convertnumbers()
.prefixheader('region_')
.addfield('region_size', lambda row: row.region_stop - row.region_start + 1)
)
tbl_regions_1b
Out[16]:
In [17]:
@etlcache
def tbl_features():
return (etl
.fromgff3(GFF_FN)
.unpackdict('attributes', ['ID', 'Parent', 'Name', 'previous_systematic_id'])
.rename({'seqid': 'chrom', 'end': 'stop', 'ID': 'id', 'Parent': 'parent_id', 'Name': 'name', 'previous_systematic_id': 'previous_id'})
.cutout('source', 'score', 'phase')
.addfield('length', lambda row: row.stop - row.start, index=4)
.intervalleftjoin(tbl_regions_1b, lkey='chrom', rkey='region_chrom', rstart='region_start', rstop='region_stop')
.cutout('region_chrom', 'region_start', 'region_stop')
.distinct('id')
.sort(key=('chrom', 'start', 'type'))
.prefixheader('feature_')
)
tbl_features
Out[17]:
In [18]:
lkp_feature = tbl_features.recordlookupone('feature_id')
In [19]:
@etlcache
def tbl_genes():
return (
tbl_features
.selectin('feature_type', ['gene', 'pseudogene'])
.cutout('feature_parent_id')
)
tbl_genes
Out[19]:
In [20]:
@etlcache
def tbl_intergenic():
return (
tbl_genes
.addfieldusingcontext('next', lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id, nxt.feature_strand) if nxt and cur.feature_chrom == nxt.feature_chrom else None)
.notnone('next')
.unpack('next', ['next_start', 'next_id', 'next_strand'])
.convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
.cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
.rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
.movefield('feature_stop', 3)
.addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
.convert('feature_type', lambda v, row: 'intergenic %s/%s' % (row.feature_strand, row.next_strand), pass_row=True)
# reclassify based on expected number of promoters
.convert('feature_type', {'intergenic +/-': 'intergenic_0', 'intergenic -/+': 'intergenic_2', 'intergenic +/+': 'intergenic_1', 'intergenic -/-': 'intergenic_1'})
.cutout('feature_strand', 'next_strand', 'feature_name', 'feature_previous_id')
.intervalleftjoin(tbl_regions_1b,
lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
rkey='region_chrom', rstart='region_start', rstop='region_stop')
.rename('region_type', 'feature_region_type')
.cutout('region_chrom', 'region_start', 'region_stop')
.gt('feature_length', 0)
)
tbl_intergenic
Out[20]:
In [21]:
@etlcache
def tbl_exons():
return (
tbl_features
.eq('feature_type', 'CDS')
.gt('feature_length', 0)
)
tbl_exons
Out[21]:
In [22]:
@etlcache
def tbl_introns():
return (
tbl_exons
.addfieldusingcontext('next',
lambda prv, cur, nxt: (nxt.feature_start, nxt.feature_id) if nxt and cur.feature_parent_id == nxt.feature_parent_id else None)
.notnone('next')
.unpack('next', ['next_start', 'next_id'])
.convert('feature_id', lambda v, row: '%s_%s' % (row.feature_id, row.next_id), pass_row=True)
.cutout('feature_start', 'feature_length', 'feature_region_type', 'next_id')
.rename({'feature_stop': 'feature_start', 'next_start': 'feature_stop'})
.movefield('feature_stop', 3)
.addfield('feature_length', lambda row: row.feature_stop - row.feature_start, index=4)
.convert('feature_type', lambda v: 'intron')
.intervalleftjoin(tbl_regions_1b,
lkey='feature_chrom', lstart='feature_start', lstop='feature_stop',
rkey='region_chrom', rstart='region_start', rstop='region_stop')
.rename('region_type', 'feature_region_type')
.cutout('region_chrom', 'region_start', 'region_stop')
)
tbl_introns
Out[22]:
In [23]:
@etlcache
def tbl_gene_aliases():
return (etl
.fromtsv(os.path.join(DATA_DIR, 'genome/plasmodb/release-9.3/Pfalciparum3D7/txt/PlasmoDB-9.3_Pfalciparum3D7_GeneAliases.txt'))
.pushheader(['feature_id'] + list(map(str, range(10))))
.convert('feature_id', lambda v: v[:13]) # ignore alternate splice forms
.cat() # square up short rows
.melt('feature_id')
.rename('value', 'feature_alias')
.cut('feature_id', 'feature_alias')
.ne('feature_alias', None)
)
tbl_gene_aliases
Out[23]:
In [24]:
@etlcache
def tbl_hp1():
return (etl
.fromtext(os.path.join(DATA_DIR, 'reference/flueck_2009/journal.ppat.1000569.s008.txt'))
.rowslice(3, None)
.ne('lines', '')
.capture('lines', r'^\S+\s+\d+\s+\d+\s+(\S+)', ['feature_alias'], include_original=1)
.capture('lines', r'^\S+\s+\d+\s+\d+\s+\S+.*?(-?\d\.\d{3})', ['hp1'], include_original=1)
.convert('hp1', float)
.cutout('lines')
.leftjoin(tbl_gene_aliases, key='feature_alias')
.ne('feature_id', None)
.sort('feature_id')
)
tbl_hp1
Out[24]:
In [ ]:
In [25]:
def plot_variants_locator(ax, callset, chrom, start=0, stop=None, step=5):
V, _, _ = unpack_callset(callset)
if stop is None:
stop = len(fasta[chrom])
indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
POS = np.take(V.POS, indices, axis=0)
anhima.loc.plot_variant_locator(POS, ax=ax, step=step, start_position=start, stop_position=stop, line_args=dict(color='k'))
ax.set_xticks([])
ax.set_xlabel('')
In [26]:
INHERITANCE_COLORS = [
'white', # missing
'grey', # filtered
'yellow', # parent missing
'yellow', # parent filtered
'black', # non-parental
'orange', # non-segregating ref
'green', # non-segregating alt
'blue', # parent 1
'red' # parent 2
]
def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):
V, C, C2d = unpack_callset(callset)
samples = C.dtype.names
if stop is None:
stop = len(fasta[chrom])
indices = np.nonzero((V.CHROM == chrom.encode('ascii')) & (V.POS >= start) & (V.POS <= stop))[0]
POS = np.take(V.POS, indices, axis=0)
G = np.take(C2d.genotype, indices, axis=0)
P = inheritance(G)
if sample_indices is not None:
P = np.take(P, sample_indices, axis=1)
samples = np.take(samples, sample_indices, axis=0)
anhima.gt.plot_discrete_calldata(P, labels=samples, colors=colors, states=range(len(colors)), ax=ax, **kwargs)
In [27]:
def get_features(chrom, start=None, stop=None):
if None not in {start, stop}:
region = '%s:%s-%s' % (chrom, start, stop)
else:
region = chrom
tbl_features = (etl
.fromgff3(GFF_FN, region)
.cutout('source', 'score', 'phase')
.unpackdict('attributes', ['ID', 'Name', 'previous_systematic_id'], includeoriginal=True)
)
tbl_features = tbl_features.setheader(['feature_' + str(f).lower() for f in tbl_features.header()])
return tbl_features
prog_hv = re.compile('var|rif|stev', flags=re.I)
def plot_genes(ax, chrom, start=0, stop=None, label_named=False, exclude_labels=[], width=.3, label_ytextpad=.1,
normal_color='k', hypervariable_color='r', cds_color='gray', divider_linewidth=1,
annotate_genes=[], annotate_xtext=0, annotate_ytext_fwd=6, annotate_ytext_rev=8,
annotate_arrowprops=None, annotate_bbox=None,
show_cds=False, gene_labels=dict()):
if stop is None:
stop = len(fasta[chrom])
tbl_local_features = get_features(chrom, start, stop)
tbl_local_genes = tbl_local_features.selectin('feature_type', {'gene', 'pseudogene'})
tbl_local_cds = tbl_local_features.eq('feature_type', 'CDS')
# divider
ax.plot([start, stop], [.5, .5], 'k-', linewidth=divider_linewidth)
ax.set_ylim(0, 1)
# positive strand
tbl_fwd_genes = tbl_local_genes.eq('feature_strand', '+')
xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_genes.records()]
fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color
for r in tbl_fwd_genes.records()]
ax.broken_barh(xranges, (.5, width), color=fc)
if show_cds:
tbl_fwd_cds = tbl_local_cds.eq('feature_strand', '+')
xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_fwd_cds.records()]
ax.broken_barh(xranges, (.5, width), color=cds_color)
# negative strand
tbl_rev_genes = tbl_local_genes.eq('feature_strand', '-')
xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_genes.records()]
fc = [hypervariable_color if r.feature_name is not None and prog_hv.search(r.feature_name) is not None else normal_color
for r in tbl_rev_genes.records()]
ax.broken_barh(xranges, (.5-width, width), color=fc)
if show_cds:
tbl_rev_cds = tbl_local_cds.eq('feature_strand', '-')
xranges = [(r.feature_start, r.feature_end-r.feature_start) for r in tbl_rev_cds.records()]
ax.broken_barh(xranges, (.5-width, width), color=cds_color)
for s in 'top', 'left', 'right':
ax.spines[s].set_visible(False)
ax.set_xlim(start, stop)
ax.set_yticks([])
ax.xaxis.tick_bottom()
# label named genes
if label_named:
for g in tbl_local_genes.records():
if g.feature_name is not None and g.feature_name not in exclude_labels:
x = g.feature_start
y = .5 + width + label_ytextpad if g.feature_strand == '+' else .5 - width - label_ytextpad
va = 'bottom' if g.feature_strand == '+' else 'top'
l = g.feature_name
ax.text(x, y, l, rotation=0, ha='left', va=va, transform=ax.transData)
# annotate specific genes
if annotate_arrowprops is None:
annotate_arrowprops = dict(facecolor='k', arrowstyle='-', shrinkA=0, shrinkB=0)
if annotate_bbox is None:
annotate_bbox = dict(facecolor='w', alpha=.9)
for id in annotate_genes:
g = lkp_feature[id]
gene_id = g['feature_id']
l = gene_labels[gene_id] if gene_id in gene_labels else g['feature_name'] if g['feature_name'] is not None else gene_id
if g['feature_strand'] == '+':
ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_fwd),
xycoords='data', textcoords='offset points',
arrowprops=annotate_arrowprops,
bbox=annotate_bbox)
elif g['feature_strand'] == '-':
ax.annotate(l, xy=(g['feature_start'], .5), ha='left', xytext=(annotate_xtext, annotate_ytext_rev),
xycoords='data', textcoords='offset points',
arrowprops=annotate_arrowprops,
bbox=annotate_bbox)
In [28]:
CNV_RUNS = {
'3d7_hb3': ['ERR019061', 'ERR019054'],
'hb3_dd2': ['ERR012788', 'ERR012840', 'ERR019054'], # share HB3s
'7g8_gb4': ['ERR027099', 'ERR027100']
}
accessibility_colors = {
'Core': 'white',
'SubtelomericRepeat': 'yellow',
'SubtelomericHypervariable': '#d73027',
'InternalHypervariable': '#fc8d59',
'Centromere': 'black',
'CNV': '#91bfdb',
}
def plot_accessibility(ax, chrom, cross=None, start=0, stop=None, linewidth=0.5, **kwargs):
tbl_lreg = tbl_regions_1b.eq('region_chrom', chrom)
if stop is None:
stop = len(fasta[chrom])
# plot core
xranges = tbl_lreg.eq('region_type', 'Core').values(['region_start', 'region_size'])
ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Core'],
linewidth=linewidth, **kwargs)
# plot centromere
xranges = tbl_lreg.eq('region_type', 'Centromere').values(['region_start', 'region_size'])
ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['Centromere'],
linewidth=linewidth, **kwargs)
# plot subtelomeric repeat
xranges = tbl_lreg.eq('region_type', 'SubtelomericRepeat').values(['region_start', 'region_size'])
ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericRepeat'],
linewidth=linewidth, **kwargs)
# plot hypervariable
xranges = tbl_lreg.eq('region_type', 'SubtelomericHypervariable').values(['region_start', 'region_size'])
ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['SubtelomericHypervariable'],
linewidth=linewidth, **kwargs)
xranges = tbl_lreg.eq('region_type', 'InternalHypervariable').values(['region_start', 'region_size'])
ax.broken_barh(xranges, (.1, .8), facecolor=accessibility_colors['InternalHypervariable'],
linewidth=linewidth, **kwargs)
if cross is not None:
# plot CNVs
tbls_cnvs = [etl.fromtsv(CNV_TBL_FN_TEMPLATE.format(run=run)).convertnumbers() for run in CNV_RUNS[cross]]
tbl_cnvs = (etl
.cat(*tbls_cnvs)
.gt('cn', 1)
.eq('chrom', chrom)
.intervalleftjoin(tbl_lreg, lkey='chrom', rkey='region_chrom',
lstart='start', lstop='stop',
rstart='region_start', rstop='region_stop')
.eq('region_type', 'Core')
)
xranges = tbl_cnvs.values(['start', 'size'])
ax.broken_barh(xranges, (.1, .8), color=accessibility_colors['CNV'],
linewidth=linewidth, **kwargs)
# tidy up
for s in 'top', 'left', 'right', 'bottom':
ax.spines[s].set_visible(False)
ax.set_xlim(start, stop)
ax.set_xticks([])
ax.set_yticks([])
ax.xaxis.tick_bottom()
ax.set_ylim(0, 1)
In [29]:
def plot_hp1(ax, chrom, start=0, stop=None):
if stop is None:
stop = len(fasta[chrom])
tbl_genes_local = get_features(chrom, start, stop).selectin('feature_type', {'gene', 'pseudogene'})
A = tbl_genes_local.cut('feature_start', 'feature_end', 'feature_id').join(tbl_hp1, key='feature_id').torecarray()
left = A.feature_start
height = A.hp1
width = A.feature_end - A.feature_start
ax.bar(left, height, width, facecolor='purple', linewidth=1, edgecolor='purple')
ax.set_xlim(start, stop)
for s in 'top', 'left', 'right', 'bottom':
ax.spines[s].set_visible(False)
ax.set_yticks([])
ax.set_ylim(-1, 7)
ax.set_xticks([])
# ax.plot([start, stop], [0, 0], color='k', linestyle='-')
In [30]:
# INHERITANCE_COLORS = [
# 'white', # missing
# 'grey', # filtered
# 'yellow', # parent missing
# 'yellow', # parent filtered
# 'black', # non-parental
# 'orange', # non-segregating ref
# 'green', # non-segregating alt
# 'blue', # parent 1
# 'red' # parent 2
# ]
# def plot_inheritance(ax, callset, chrom, start=0, stop=None, sample_indices=None, colors=INHERITANCE_COLORS, alpha=1, **kwargs):
# V, C, C2d = unpack_callset(callset)
# samples = C.dtype.names
# if stop is None:
# stop = len(fasta[chrom])
# indices = np.nonzero((V.CHROM == chrom) & (V.POS >= start) & (V.POS <= stop))[0]
# POS = np.take(V.POS, indices, axis=0)
# G = np.take(C2d.genotype, indices, axis=0)
# P = inheritance(G)
# if sample_indices is not None:
# P = np.take(P, sample_indices, axis=1)
# samples = np.take(samples, sample_indices, axis=0)
# vcfplt.discrete_calldata_colormesh(P, labels=samples, colors=colors, states=range(len(colors)), alpha=alpha, ax=ax, **kwargs)
In [31]:
def marker_span(callset, chrom):
V, _, _ = unpack_callset(callset)
start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
POS = V.POS[start_index:stop_index]
x = POS[-1] - POS[0]
return x
def marker_offset(callset, chrom):
V, _, _ = unpack_callset(callset)
start_index, stop_index = locate_variants(V.CHROM, V.POS, chrom)
POS = V.POS[start_index:stop_index]
return POS[0]
def locate_flanking_markers(CHROM, POS, chrom, pos):
chrom_start_index, chrom_stop_index = bisect_left(CHROM, chrom), bisect_right(CHROM, chrom)
POS = POS[chrom_start_index:chrom_stop_index]
start_index = bisect_left(POS, pos) - 1
stop_index = bisect_right(POS, pos)
if start_index < 0:
start_index = None
else:
start_index += chrom_start_index
if stop_index >= len(POS):
stop_index = None
else:
stop_index += chrom_start_index
return start_index, stop_index
def simulate_random_uniform_crossovers(n_events, callset, chromosomes=CHROMOSOMES):
# table of results
tbl = [['chrom', 'co_pos', 'co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range']]
# utility variables
V = callset['variants']
marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
total_marker_span = np.sum(marker_spans)
# generate random integers over the total marker span
X = np.random.randint(low=0, high=total_marker_span, size=n_events)
# map each event onto a chromosome and position
for x in X:
# which chromosome?
chrom_index = bisect_left(cumulative_marker_span, x)
chrom = chromosomes[chrom_index]
# which position?
if chrom_index > 0:
pos_relative = x - cumulative_marker_span[chrom_index-1]
else:
pos_relative = x
pos = pos_relative + marker_offsets[chrom_index]
# find flanking markers
lidx, ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, pos)
pos_min = V.POS[lidx]
pos_max = V.POS[ridx]
pos_mid = (pos_min + pos_max)//2
pos_range = pos_max - pos_min
row = [chrom, pos, pos_mid, pos_min, pos_max, pos_range]
tbl += [row]
return etl.wrap(tbl)
def simulate_random_uniform_non_crossovers(n_events, callset, phi, chromosomes=CHROMOSOMES):
# table of results
tbl = [['chrom',
'tract_start',
'tract_stop',
'tract_length',
'tract_start_min',
'tract_start_mid',
'tract_start_max',
'tract_stop_min',
'tract_stop_mid',
'tract_stop_max',
'tract_length_min',
'tract_length_mid',
'tract_length_max',
'tract_support']]
# utility variables
V = callset['variants']
marker_spans = [marker_span(callset, chrom) for chrom in chromosomes]
marker_offsets = [marker_offset(callset, chrom) for chrom in chromosomes]
cumulative_marker_span = [np.sum(marker_spans[:i+1]) for i in range(len(marker_spans))]
total_marker_span = np.sum(marker_spans)
# generate random integers over the total marker span
X = np.random.randint(low=0, high=total_marker_span, size=n_events)
# generate random geometric tract lengths
lengths = stats.geom.rvs(1-phi, size=n_events).astype('int32')
# map each event onto a chromosome and position
for x, length in zip(X, lengths):
try:
# which chromosome?
chrom_index = bisect_left(cumulative_marker_span, x)
chrom = chromosomes[chrom_index]
# which start position?
if chrom_index > 0:
pos_relative = x - cumulative_marker_span[chrom_index-1]
else:
pos_relative = x
start = pos_relative + marker_offsets[chrom_index]
stop = start + length
# find flanking markers
start_lidx, start_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, start)
stop_lidx, stop_ridx = locate_flanking_markers(V.CHROM, V.POS, chrom, stop)
if None in {start_lidx, start_ridx, stop_lidx, stop_ridx}:
# falls off edge of marker span
support = start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None
else:
support = stop_ridx - start_ridx
# do we even observe the event?
if support == 0:
start_min = start_max = start_mid = stop_min = stop_max = stop_mid = length_min = length_max = length_mid = None
else:
start_min = V.POS[start_lidx]
start_max = V.POS[start_ridx]
start_mid = (start_min + start_max)//2
stop_min = V.POS[stop_lidx]
stop_max = V.POS[stop_ridx]
stop_mid = (stop_min + stop_max)//2
length_min = stop_min - start_max
length_max = stop_max - start_min
length_mid = (length_min + length_max)//2
row = [chrom,
start,
stop,
length,
start_min,
start_mid,
start_max,
stop_min,
stop_mid,
stop_max,
length_min,
length_mid,
length_max,
support]
tbl += [row]
except Exception as e:
log(e)
log(traceback.format_exc(10))
return etl.wrap(tbl)
In [32]:
%reload_ext Cython
In [33]:
%%cython
import numpy as np
cimport numpy as np
import petl as etl
import vcfnp
CHROMOSOMES = [('Pf3D7_%02d_v3' % n).encode('ascii') for n in range(1, 15)]
# constants to represent the possible outcomes when attempting to determine the inheritance for any given variant call
INHERITANCE_MISSING = 0 # genotype call is missing
INHERITANCE_FILTERED = 1 # genotype call is filtered
INHERITANCE_PARENT_MISSING = 2 # genotype is called but cannot be phased because one or both parents is missing
INHERITANCE_PARENT_FILTERED = 3 # genotype is called but cannot be phased because one or both parents is filtered
INHERITANCE_NONPARENTAL = 4 # genotype call does not correspond with either parent (i.e., mendelian error sensu strictu)
INHERITANCE_NONSEGREGATING_REF = 5 # genotype call is ref and parents both have reference allele
INHERITANCE_NONSEGREGATING_ALT = 6 # genotype call is alt and parents both have alternate allele
INHERITANCE_PARENT1 = 7 # genotype call corresponds with first parent
INHERITANCE_PARENT2 = 8 # genotype call corresponds with second parent
# functions to determine inheritance states
###########################################
def inheritance(np.ndarray[np.int8_t, ndim=2] G):
cdef int m = G.shape[0] # number of variants
cdef int n = G.shape[1] # number of samples
cdef np.ndarray[np.uint8_t, ndim=2] P
P = np.zeros((m,n), dtype='u1')
for i in range(m):
for j in range(n):
# N.B., parents are *first* two samples
P[i,j] = inheritance_state(G[i,j], G[i,0], G[i,1])
return P
cdef int inheritance_state(int genotype_child, int genotype_parent1, int genotype_parent2):
if genotype_child == -1:
return INHERITANCE_MISSING
elif genotype_child == -2:
return INHERITANCE_FILTERED
elif genotype_parent1 == -1 or genotype_parent2 == -1:
return INHERITANCE_PARENT_MISSING
elif genotype_parent1 == -2 or genotype_parent2 == -2:
return INHERITANCE_PARENT_FILTERED
elif genotype_child != genotype_parent1 and genotype_child != genotype_parent2:
return INHERITANCE_NONPARENTAL
elif genotype_child == genotype_parent1 == genotype_parent2 == 0:
return INHERITANCE_NONSEGREGATING_REF
elif genotype_child == genotype_parent1 == genotype_parent2 and genotype_child > 0:
return INHERITANCE_NONSEGREGATING_ALT
elif genotype_child == genotype_parent1:
return INHERITANCE_PARENT1
elif genotype_child == genotype_parent2:
return INHERITANCE_PARENT2
else:
return -1 # should never be reached
# functions to calculate haplotype statistics
#############################################
def haplotypes_chrom(V, C, chrom):
"""
Construct arrays of haplotype length and haplotype support for all variant calls.
"""
tbl = list()
header = 'sample', 'chrom', 'start_min', 'start_mid', 'start_max', 'stop_min', 'stop_mid', 'stop_max', 'length_min', \
'length_mid', 'length_max', 'support', 'prv_inheritance', 'inheritance', 'nxt_inheritance'
tbl.append(header)
cdef int i, j, M, N, prv_idx, start_idx, stop_idx, start_min, start_mid, start_max, stop_min, stop_mid, \
stop_max, length_min, length_mid, length_max, support, prv, cur
samples = C.dtype.names
C2d = vcfnp.view2d(C).view(np.recarray)
# only deal with one chromosome, make life simpler
flt = V.CHROM == chrom
indices = np.nonzero(flt)[0]
V = np.take(V, indices, axis=0)
C2d = np.take(C2d, indices, axis=0)
M, N = C2d.shape
cdef np.ndarray[np.uint8_t, ndim=2] P
P = inheritance(C2d.genotype) # inheritance
# print M, N
cdef np.ndarray[np.int32_t, ndim=1] POS
POS = V['POS']
cdef np.ndarray[np.int32_t, ndim=2] HL
cdef np.ndarray[np.int32_t, ndim=2] HS
HL = np.zeros((M, N), dtype='i4') # haplotype lengths (minimum)
HS = np.zeros((M, N), dtype='i4') # haplotype support - number of calls supporting the haplotype
cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
cdef np.ndarray[np.int32_t, ndim=2] haplotype_start_idxs
cdef np.ndarray[np.int32_t, ndim=1] last_haplotype_inheritance
previous_call_idx = np.array([-1] * N, dtype='i4') # store index of last good call
haplotype_start_idxs = np.array([[-1, -1]] * N, dtype='i4') # store indices of haplotype start (flanking markers)
last_haplotype_inheritance = np.array([-1] * N, dtype='i4') # store parental inheritance of upstream block
# iterate over variants
for i in range(M):
# iterate over samples
for j in range(N):
# extract genotype call for current variant/sample
cur = P[i, j]
if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
# index of last good call for this sample
prv_idx = previous_call_idx[j]
if prv_idx < 0:
# this is the first good call, start a haplotype
haplotype_start_idxs[j] = (i, i)
else:
# extract last good genotype call for current sample
prv = P[prv_idx, j]
if cur != prv:
# inheritance switch, haplotype has ended
start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
stop_min_idx, stop_max_idx = prv_idx, i
start_min = POS[start_min_idx]
start_max = POS[start_max_idx]
start_mid = (start_max + start_min) // 2
stop_min = POS[stop_min_idx]
stop_max = POS[stop_max_idx]
stop_mid = (stop_max + stop_min) // 2
length_min = stop_min - start_max
length_max = stop_max - start_min
length_mid = stop_mid - start_mid
support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == prv)
# assign results
HL[start_max_idx:stop_min_idx+1, j] = length_min
HS[start_max_idx:stop_min_idx+1, j] = support
row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, \
length_min, length_mid, length_max, support, last_haplotype_inheritance[j], prv, cur
tbl.append(row)
# start new haplotype
haplotype_start_idxs[j] = (prv_idx, i)
last_haplotype_inheritance[j] = prv
else:
# haplotype continues
pass
# store last good call index
previous_call_idx[j] = i
# final haplotypes ending at chromosome end
for j in range(N):
start_min_idx, start_max_idx = tuple(haplotype_start_idxs[j])
stop_min_idx, stop_max_idx = prv_idx, prv_idx # nothing subsequent
start_min = POS[start_min_idx]
start_max = POS[start_max_idx]
start_mid = (start_max + start_min) // 2
stop_min = POS[stop_min_idx]
stop_max = POS[stop_max_idx]
stop_mid = (stop_max + stop_min) // 2
length_min = stop_min - start_max
length_max = stop_max - start_min
length_mid = stop_mid - start_mid
inherit = P[prv_idx, j]
support = np.count_nonzero(P[start_max_idx:stop_min_idx+1, j] == inherit)
# assign results
HL[start_max_idx:stop_min_idx+1, j] = length_min
HS[start_max_idx:stop_min_idx+1, j] = support
row = samples[j], chrom, start_min, start_mid, start_max, stop_min, stop_mid, stop_max, length_min, length_mid, \
length_max, support, last_haplotype_inheritance[j], inherit, -1
tbl.append(row)
return HL, HS, etl.wrap(tbl)
def haplotypes(V, C):
stats = [haplotype_stats_chrom(V, C, chrom) for chrom in CHROMOSOMES]
HL = np.vstack(s[0] for s in stats)
HS = np.vstack(s[1] for s in stats)
tbl = etl.cat(*[s[2] for s in stats])
return HL, HS, tbl
# backwards compatibility
haplotype_stats_chrom = haplotypes_chrom
haplotype_stats = haplotypes
# functions to tabulate recombination events
############################################
def tabulate_switches_chrom(V, C, chrom):
tbl = list()
header = 'sample', 'chrom', 'pos', 'lpos', 'rpos', 'range', 'from', 'to'
tbl.append(header)
cdef int i, j, m, n, prv_idx, cur, prv
samples = C.dtype.names
C2d = vcfnp.view2d(C).view(np.recarray)
# only deal with one chromosome, make life simpler
flt = V.CHROM == chrom
indices = np.nonzero(flt)[0]
V, C, C2d = [np.take(X, indices, axis=0) for X in [V, C, C2d]]
m, n = C2d.shape
cdef np.ndarray[np.uint8_t, ndim=2] P
P = inheritance(C2d.genotype)
cdef np.ndarray[np.int32_t, ndim=1] POS
POS = V['POS']
cdef np.ndarray[np.int32_t, ndim=1] previous_call_idx
previous_call_idx = np.array([-1] * n, dtype='i4') # store index of last good call
# iterate over variants
for i in range(m):
# iterate over samples
for j in range(n):
# extract genotype call for current variant/sample
cur = P[i, j]
if cur == INHERITANCE_PARENT1 or cur == INHERITANCE_PARENT2: # current call is good
# index of last good call for this sample
prv_idx = previous_call_idx[j]
if prv_idx < 0:
# this is the first good call
pass
else:
# extract last good genotype call for current sample
prv = P[prv_idx, j]
if cur != prv:
# phase switch
lpos = POS[prv_idx]
rpos = POS[i]
row = samples[j], chrom, (lpos + rpos) // 2, lpos, rpos, rpos-lpos, prv, cur
tbl.append(row)
else:
# haplotype continues
pass
# store last good call index
previous_call_idx[j] = i
return etl.wrap(tbl)
def tabulate_switches(V, C):
tbls = [tabulate_switches_chrom(V, C, chrom) for chrom in CHROMOSOMES]
return etl.cat(*tbls)
In [ ]:
In [ ]: