In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb
In [2]:
tbl_cp = (etl
.fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS4.xls')
.skip(2)
.setheader(['egasp_score', 'chrom', 'strand', 'pos', 'gene'])
.convert('chrom', lambda v: 'Pf3D7_%02d' % v)
.convert('pos', int)
.convert('strand', lambda v: '+' if v > 0 else '-')
)
tbl_cp.display(5)
In [3]:
# need to liftover to v3 coords
cp_v2_coords_fn = '../../../data/reference/ponts_2011/cp_v2_coords.bed'
cp_v3_coords_fn = '../../../data/reference/ponts_2011/cp_v3_coords.bed'
if not os.path.exists(cp_v3_coords_fn):
tbl_cp.cut(1, 3, 3, 2, 4).convert(2, lambda v: v+1).data().totsv(cp_v2_coords_fn)
!head {cp_v2_coords_fn}
liftover = sh.Command('../../../opt/liftover/liftOver')
chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
liftover(cp_v2_coords_fn, chain_2to3, cp_v3_coords_fn, cp_v3_coords_fn + '.unmapped')
!echo ------------------
!head {cp_v3_coords_fn}
!echo ------------------
!head {cp_v3_coords_fn}.unmapped
tbl_cp_v3 = (etl
.fromtsv(cp_v3_coords_fn)
.cut(0, 1, 3, 4)
.pushheader(['chrom', 'pos', 'strand', 'gene'])
.convert('pos', int)
.sort(key=('chrom', 'pos'))
)
tbl_cp_v3
Out[3]:
In [4]:
tbl_cp_v3.nrows()
Out[4]:
In [5]:
# build interval trees from the core promoter sites so we can find nearest
cp_trees = (
tbl_cp_v3
# need start and end fields, make them the same
.rename('pos', 'start')
.addfield('end', lambda v: v.start)
.recordtrees('chrom', 'start', 'end')
)
cp_trees
Out[5]:
In [6]:
# example of how to find nearest CP
cp_trees['Pf3D7_01_v3'].after(20000, max_dist=10000)
Out[6]:
In [7]:
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
In [91]:
def tabulate_intergenic_variants(cross, max_cp_dist=2000):
V, _, _ = unpack_callset(callsets[cross])
tbl_variants = (etl
.fromarray(V[['CHROM', 'POS', 'is_snp', 'num_alleles', 'svlen', 'STR', 'RU', 'REF', 'ALT']])
.convert(('svlen', 'ALT'), operator.itemgetter(0))
.annex(etl
.fromarray(V['EFF'][['Effect']])
)
.eq('Effect', 'INTERGENIC')
.addfield('cp_before', lambda v: cp_trees[v.CHROM].before(v.POS, max_dist=max_cp_dist))
.addfield('cp_after', lambda v: cp_trees[v.CHROM].after(v.POS, max_dist=max_cp_dist))
.convert(('cp_before', 'cp_after'), lambda v: v[0] if v else tuple())
.addfield('cp_before_dist', lambda v: np.abs(v.cp_before.start - v.POS) if v.cp_before else max_cp_dist)
.addfield('cp_after_dist', lambda v: np.abs(v.cp_after.start - v.POS) if v.cp_after else max_cp_dist)
.addfield('cp', lambda v: v.cp_before if v.cp_before_dist < v.cp_after_dist else v.cp_after)
.cutout('cp_after', 'cp_before', 'cp_after_dist', 'cp_before_dist')
.unpack('cp', ['cp_chrom', 'cp_pos', 'cp_strand', 'cp_gene'])
.cutout('cp_chrom')
.addfield('cp_dist', lambda v: v.POS - v.cp_pos if v.cp_strand == '+' else v.cp_pos - v.POS if v.cp_strand == '-' else None)
.notnone('cp_dist')
.addfield('cross', cross, index=0)
)
return tbl_variants
In [93]:
tabulate_intergenic_variants('3d7_hb3').display(5)
In [94]:
tbl_variants = etl.cat(*[tabulate_intergenic_variants(cross) for cross in CROSSES])
tbl_variants
Out[94]:
In [95]:
df_variants = tbl_variants.todataframe()
df_variants
Out[95]:
In [98]:
df_variants.describe()
Out[98]:
In [100]:
df_variants.cross.value_counts()
Out[100]:
Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest CP for each position.
In [19]:
tbl_regions_0b = (etl
.fromtsv('../../../meta/regions-20130225.txt')
.pushheader(['chrom', 'start', 'stop', 'region'])
.convertnumbers()
)
tbl_regions_0b
Out[19]:
In [20]:
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
for chrom in CHROMOSOMES}
for region in tbl_regions_0b.eq('region', 'Core').records():
genome_is_core[region.chrom][region.start:region.stop] = True
In [21]:
tbl_genes = (etl
.fromgff3(GFF_FN)
.selectin('type', {'gene', 'pseudogene'})
)
tbl_genes
Out[21]:
In [22]:
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
for chrom in CHROMOSOMES}
for gene in tbl_genes.records():
genome_is_genic[gene.seqid][gene.start-1:gene.end] = True
In [23]:
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
for chrom in CHROMOSOMES}
genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
for chrom in CHROMOSOMES}
genome_pos_core_intergenic
Out[23]:
In [24]:
max_cp_dist = 2000
genome_cp_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
for chrom in sorted(genome.keys())}
for chrom in CHROMOSOMES:
log(chrom)
pos = genome_pos_core_intergenic[chrom]
for i, p in enumerate(pos):
cp_before = cp_trees[chrom].before(p, max_dist=max_cp_dist)
cp_before = cp_before[0] if cp_before else tuple()
cp_after = cp_trees[chrom].after(p, max_dist=max_cp_dist)
cp_after = cp_after[0] if cp_after else tuple()
cp_before_dist = np.abs(cp_before.start - p) if cp_before else max_cp_dist
cp_after_dist = np.abs(cp_after.start - p) if cp_after else max_cp_dist
cp = cp_before if cp_before_dist < cp_after_dist else cp_after
dist = max_cp_dist
if cp:
dist = p - cp.start if cp.strand == '+' else cp.start - p if cp.strand == '-' else max_cp_dist
genome_cp_dist[chrom][i] = dist
In [25]:
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]
Out[25]:
In [26]:
genome_cp_dist['Pf3D7_01_v3'][2000:]
Out[26]:
In [30]:
tbl_cp_v3.display(20)
In [116]:
def plot_cp_variant_density(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
if ax is None:
fig, ax = plt.subplots(figsize=(10, 3))
if cross:
ax.set_title(LABELS[cross])
else:
ax.set_title('all crosses')
sns.despine(ax=ax)
sns.offset_spines(ax=ax)
# x coords
x = (bins[:-1] + bins[1:]) / 2
# genomic cp distances
a1 = np.concatenate(genome_cp_dist.values())
h1, _ = np.histogram(a1, bins=bins)
# obtain variants
df = df_variants
if cross:
df = df[df.cross == cross]
# SNPs
a2 = df[df.is_snp].cp_dist
h2, _ = np.histogram(a2, bins=bins)
y = h2 / h1
ax.plot(x, y, label='SNPs', lw=2)
# indels (STR)
a2 = df[~df.is_snp & df.STR].cp_dist
h2, _ = np.histogram(a2, bins=bins)
y = h2 / h1
ax.plot(x, y, label='indels (STR)', lw=2)
# indels (non-STR)
a2 = df[~df.is_snp & ~df.STR].cp_dist
h2, _ = np.histogram(a2, bins=bins)
y = h2 / h1
ax.plot(x, y, label='indels (non-STR)', lw=2)
# tidy up
ax.legend()
ax.set_xlabel('distance to core promoter')
ax.set_ylabel('relative variant density')
# ax.set_ylim(0, 0.006)
return ax
In [125]:
plot_cp_variant_density(bins=np.arange(-600, 600, 20));
In [132]:
def plot_cp_variant_density_hist(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
if ax is None:
fig, ax = plt.subplots(figsize=(10, 3))
if cross:
ax.set_title(LABELS[cross])
else:
ax.set_title('all crosses')
sns.despine(ax=ax)
sns.offset_spines(ax=ax)
# x coords
x = (bins[:-1] + bins[1:]) / 2
# genomic cp distances
a1 = np.concatenate(genome_cp_dist.values())
h1, _ = np.histogram(a1, bins=bins)
# obtain variants
df = df_variants
if cross:
df = df[df.cross == cross]
bottom = 0
width = bins[1] - bins[0]
# SNPs
a2 = df[df.is_snp].cp_dist
h2, _ = np.histogram(a2, bins=bins)
y2 = h2 / h1
ax.bar(x, y2, width=width, bottom=bottom, label='SNPs', color='b')
bottom += y2
# indels (STR)
a3 = df[~df.is_snp & df.STR].cp_dist
h3, _ = np.histogram(a3, bins=bins)
y3 = h3 / h1
ax.bar(x, y3, width=width, bottom=bottom, label='indels (STR)', color='g')
bottom += y3
# indels (non-STR)
a4 = df[~df.is_snp & ~df.STR].cp_dist
h4, _ = np.histogram(a4, bins=bins)
y4 = h4 / h1
ax.bar(x, y4, width=width, bottom=bottom, label='indels (non-STR)', color='r')
bottom += y4
# tidy up
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
ax.set_xlabel('distance to core promoter')
ax.set_ylabel('relative variant density')
# ax.set_ylim(0, 0.006)
return ax
In [135]:
plot_cp_variant_density_hist(bins=np.arange(-600, 600, 30));
In [119]:
for cross in CROSSES:
plot_cp_variant_density(cross, bins=np.arange(-600, 600, 20))
plt.show()
In [55]:
for cross in CROSSES:
plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 20))
plt.show()
In [56]:
for cross in CROSSES:
plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 100))
plt.show()
In [88]:
def plot_cp_svlen(cross, ax=None):
if ax is None:
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(LABELS[cross])
sns.despine(ax=ax)
sns.offset_spines(ax=ax)
# obtain variants
variants = tabulate_intergenic_variants(cross).cut('is_snp', 'num_alleles', 'svlen', 'cp_dist').todataframe()
# coords
flt = ~variants.is_snp & (variants.num_alleles == 2)
x = variants[flt].cp_dist.values
y = variants[flt].svlen.values
# ax.plot(x, y, 'bo', alpha=.1)
ax.hexbin(x, y, extent=(-1000, 1000, -20, 20), gridsize=20)
# sns.kdeplot(x, y, shade=True, ax=ax)
return ax
plot_cp_svlen('3d7_hb3');
In [ ]:
tbl_variants = tabulate_intergenic_variants('3d7_hb3')
tbl_variants
In [63]:
tbl_variants.selectrangeopen('cp_dist', -100, 100).display(100)
In [ ]: