In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb
In [2]:
tbl_tss = (etl
.fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS2.xls')
.skip(2)
.setheader(['gene', 'description', 'chrom', 'pos', 'strand'])
.convert('chrom', lambda v: 'Pf3D7_%02d' % v)
.convert('pos', int)
.convert('strand', lambda v: '+' if v > 0 else '-')
.cut('chrom', 'pos', 'strand', 'gene')
)
tbl_tss.display(caption='TSS (Ponts et al. 2011, supplementary table S2)')
N.B., need to lift over to Pf3D7 version 3 coordinates.
In [3]:
tss_v2_coords_fn = '../../../data/reference/ponts_2011/tss_v2_coords.bed'
tss_v3_coords_fn = '../../../data/reference/ponts_2011/tss_v3_coords.bed'
if not os.path.exists(tss_v3_coords_fn):
tbl_tss.cut(0, 1, 1, 2, 3).convert(2, lambda v: v+1).data().totsv(tss_v2_coords_fn)
# !head {tss_v2_coords_fn}
liftover = sh.Command('../../../opt/liftover/liftOver')
chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
liftover(tss_v2_coords_fn, chain_2to3, tss_v3_coords_fn, tss_v3_coords_fn + '.unmapped')
# !head {tss_v3_coords_fn}
# !head {tss_v3_coords_fn}.unmapped
tbl_tss_v3 = (etl
.fromtsv(tss_v3_coords_fn)
.cut(0, 1, 3, 4)
.pushheader(['chrom', 'pos', 'strand', 'gene'])
.convert('pos', int)
.sort(key=('chrom', 'pos'))
)
tbl_tss_v3
Out[3]:
In [42]:
tbl_tss_v3.nrows()
Out[42]:
In [4]:
# build interval trees from the transcription start sites so we can find nearest
tss_trees = (
tbl_tss_v3
# need start and end fields, make them the same
.rename('pos', 'start')
.addfield('end', lambda v: v.start)
.recordtrees('chrom', 'start', 'end')
)
tss_trees
Out[4]:
In [5]:
# example of how to find nearest TSS
tss_trees['Pf3D7_01_v3'].after(60000, max_dist=10000)
Out[5]:
In [6]:
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
In [11]:
def tabulate_intergenic_variants(cross, max_tss_dist=2000):
V, _, _ = unpack_callset(callsets[cross])
tbl_variants = (etl
.fromarray(V[['CHROM', 'POS', 'is_snp', 'svlen', 'STR', 'RU']])
.annex(etl
.fromarray(V['EFF'][['Effect']])
)
.eq('Effect', 'INTERGENIC')
.addfield('tss_before', lambda v: tss_trees[v.CHROM].before(v.POS, max_dist=max_tss_dist))
.addfield('tss_after', lambda v: tss_trees[v.CHROM].after(v.POS, max_dist=max_tss_dist))
.convert(('tss_before', 'tss_after'), lambda v: v[0] if v else tuple())
.addfield('tss_before_dist', lambda v: np.abs(v.tss_before.start - v.POS) if v.tss_before else max_tss_dist)
.addfield('tss_after_dist', lambda v: np.abs(v.tss_after.start - v.POS) if v.tss_after else max_tss_dist)
.addfield('tss', lambda v: v.tss_before if v.tss_before_dist < v.tss_after_dist else v.tss_after)
.cutout('tss_after', 'tss_before', 'tss_after_dist', 'tss_before_dist')
.unpack('tss', ['tss_chrom', 'tss_pos', 'tss_strand', 'tss_gene'])
.cutout('tss_chrom')
.addfield('tss_dist', lambda v: v.POS - v.tss_pos if v.tss_strand == '+' else v.tss_pos - v.POS if v.tss_strand == '-' else None)
)
return tbl_variants
In [13]:
tabulate_intergenic_variants('3d7_hb3').display(20)
Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest TSS for each position.
In [14]:
tbl_regions_0b = (etl
.fromtsv('../../../meta/regions-20130225.txt')
.pushheader(['chrom', 'start', 'stop', 'region'])
.convertnumbers()
)
tbl_regions_0b
Out[14]:
In [17]:
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
for chrom in CHROMOSOMES}
for region in tbl_regions_0b.eq('region', 'Core').records():
genome_is_core[region.chrom][region.start:region.stop] = True
In [18]:
tbl_genes = (etl
.fromgff3(GFF_FN)
.selectin('type', {'gene', 'pseudogene'})
)
tbl_genes
Out[18]:
In [20]:
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
for chrom in CHROMOSOMES}
for gene in tbl_genes.records():
genome_is_genic[gene.seqid][gene.start-1:gene.end] = True
In [22]:
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
for chrom in CHROMOSOMES}
genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
for chrom in CHROMOSOMES}
genome_pos_core_intergenic
Out[22]:
In [23]:
max_tss_dist = 2000
genome_tss_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
for chrom in sorted(genome.keys())}
for chrom in CHROMOSOMES:
log(chrom)
pos = genome_pos_core_intergenic[chrom]
for i, p in enumerate(pos):
tss_before = tss_trees[chrom].before(p, max_dist=max_tss_dist)
tss_before = tss_before[0] if tss_before else tuple()
tss_after = tss_trees[chrom].after(p, max_dist=max_tss_dist)
tss_after = tss_after[0] if tss_after else tuple()
tss_before_dist = np.abs(tss_before.start - p) if tss_before else max_tss_dist
tss_after_dist = np.abs(tss_after.start - p) if tss_after else max_tss_dist
tss = tss_before if tss_before_dist < tss_after_dist else tss_after
dist = max_tss_dist
if tss:
dist = p - tss.start if tss.strand == '+' else tss.start - p if tss.strand == '-' else max_tss_dist
genome_tss_dist[chrom][i] = dist
In [29]:
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]
Out[29]:
In [30]:
genome_tss_dist['Pf3D7_01_v3'][2000:]
Out[30]:
In [28]:
tbl_tss_v3
Out[28]:
In [38]:
def plot_tss_variant_density(cross, bins=np.arange(-1025, 1026, 50), ax=None):
if ax is None:
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title(LABELS[cross])
sns.despine(ax=ax)
sns.offset_spines(ax=ax)
# x coords
x = (bins[:-1] + bins[1:]) / 2
# genomic tss distances
a1 = np.concatenate(genome_tss_dist.values())
h1, _ = np.histogram(a1, bins=bins)
# obtain variants
df_variants = tabulate_intergenic_variants(cross).todataframe()
# SNPs
a2 = df_variants[df_variants.is_snp].tss_dist
h2, _ = np.histogram(a2, bins=bins)
y = h2 / h1
ax.plot(x, y, label='SNPs', lw=2)
# indels
a2 = df_variants[~df_variants.is_snp].tss_dist
h2, _ = np.histogram(a2, bins=bins)
y = h2 / h1
ax.plot(x, y, label='indels', lw=2)
# tidy up
ax.legend()
ax.set_xlabel('distance to transcription start site')
ax.set_ylabel('variant density')
ax.set_ylim(0, 0.003)
return ax
In [39]:
plot_tss_variant_density('3d7_hb3');
In [40]:
plot_tss_variant_density('hb3_dd2');
In [41]:
plot_tss_variant_density('7g8_gb4');
In [ ]: