In [1]:
    
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb
    
In [2]:
    
tbl_tss = (etl
    .fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS2.xls')
    .skip(2)
    .setheader(['gene', 'description', 'chrom', 'pos', 'strand'])
    .convert('chrom', lambda v: 'Pf3D7_%02d' % v)
    .convert('pos', int)
    .convert('strand', lambda v: '+' if v > 0 else '-')
    .cut('chrom', 'pos', 'strand', 'gene')
)
tbl_tss.display(caption='TSS (Ponts et al. 2011, supplementary table S2)')
    
    
N.B., need to lift over to Pf3D7 version 3 coordinates.
In [3]:
    
tss_v2_coords_fn = '../../../data/reference/ponts_2011/tss_v2_coords.bed'
tss_v3_coords_fn = '../../../data/reference/ponts_2011/tss_v3_coords.bed'
if not os.path.exists(tss_v3_coords_fn):
    tbl_tss.cut(0, 1, 1, 2, 3).convert(2, lambda v: v+1).data().totsv(tss_v2_coords_fn)
#     !head {tss_v2_coords_fn}
    liftover = sh.Command('../../../opt/liftover/liftOver')
    chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
    liftover(tss_v2_coords_fn, chain_2to3, tss_v3_coords_fn, tss_v3_coords_fn + '.unmapped')
#     !head {tss_v3_coords_fn}
#     !head {tss_v3_coords_fn}.unmapped
tbl_tss_v3 = (etl
    .fromtsv(tss_v3_coords_fn)
    .cut(0, 1, 3, 4)
    .pushheader(['chrom', 'pos', 'strand', 'gene'])
    .convert('pos', int)
    .sort(key=('chrom', 'pos'))
)
tbl_tss_v3
    
    Out[3]:
In [42]:
    
tbl_tss_v3.nrows()
    
    Out[42]:
In [4]:
    
# build interval trees from the transcription start sites so we can find nearest
tss_trees = (
    tbl_tss_v3
    # need start and end fields, make them the same
    .rename('pos', 'start')
    .addfield('end', lambda v: v.start)
    .recordtrees('chrom', 'start', 'end')
)
tss_trees
    
    Out[4]:
In [5]:
    
# example of how to find nearest TSS
tss_trees['Pf3D7_01_v3'].after(60000, max_dist=10000)
    
    Out[5]:
In [6]:
    
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
    
    
In [11]:
    
def tabulate_intergenic_variants(cross, max_tss_dist=2000):
    V, _, _ = unpack_callset(callsets[cross])
    tbl_variants = (etl
        .fromarray(V[['CHROM', 'POS', 'is_snp', 'svlen', 'STR', 'RU']])
        .annex(etl
            .fromarray(V['EFF'][['Effect']])
        )
        .eq('Effect', 'INTERGENIC')
        .addfield('tss_before', lambda v: tss_trees[v.CHROM].before(v.POS, max_dist=max_tss_dist))
        .addfield('tss_after', lambda v: tss_trees[v.CHROM].after(v.POS, max_dist=max_tss_dist))
        .convert(('tss_before', 'tss_after'), lambda v: v[0] if v else tuple())
        .addfield('tss_before_dist', lambda v: np.abs(v.tss_before.start - v.POS) if v.tss_before else max_tss_dist)
        .addfield('tss_after_dist', lambda v: np.abs(v.tss_after.start - v.POS) if v.tss_after else max_tss_dist)
        .addfield('tss', lambda v: v.tss_before if v.tss_before_dist < v.tss_after_dist else v.tss_after)
        .cutout('tss_after', 'tss_before', 'tss_after_dist', 'tss_before_dist')
        .unpack('tss', ['tss_chrom', 'tss_pos', 'tss_strand', 'tss_gene'])
        .cutout('tss_chrom')
        .addfield('tss_dist', lambda v: v.POS - v.tss_pos if v.tss_strand == '+' else v.tss_pos - v.POS if v.tss_strand == '-' else None)
    )
    return tbl_variants
    
In [13]:
    
tabulate_intergenic_variants('3d7_hb3').display(20)
    
    
Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest TSS for each position.
In [14]:
    
tbl_regions_0b = (etl
    .fromtsv('../../../meta/regions-20130225.txt')
    .pushheader(['chrom', 'start', 'stop', 'region'])
    .convertnumbers()
)
tbl_regions_0b
    
    Out[14]:
In [17]:
    
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
           for chrom in CHROMOSOMES}
for region in tbl_regions_0b.eq('region', 'Core').records():
    genome_is_core[region.chrom][region.start:region.stop] = True
    
In [18]:
    
tbl_genes = (etl
    .fromgff3(GFF_FN)
    .selectin('type', {'gene', 'pseudogene'})
)
tbl_genes
    
    Out[18]:
In [20]:
    
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool) 
                   for chrom in CHROMOSOMES}
for gene in tbl_genes.records():
    genome_is_genic[gene.seqid][gene.start-1:gene.end] = True
    
In [22]:
    
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
                             for chrom in CHROMOSOMES}
genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
                              for chrom in CHROMOSOMES}
genome_pos_core_intergenic
    
    Out[22]:
In [23]:
    
max_tss_dist = 2000
genome_tss_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
                   for chrom in sorted(genome.keys())}
for chrom in CHROMOSOMES:
    log(chrom)
    pos = genome_pos_core_intergenic[chrom]
    for i, p in enumerate(pos):
        tss_before = tss_trees[chrom].before(p, max_dist=max_tss_dist)
        tss_before = tss_before[0] if tss_before else tuple()
        tss_after = tss_trees[chrom].after(p, max_dist=max_tss_dist)
        tss_after = tss_after[0] if tss_after else tuple()
        tss_before_dist = np.abs(tss_before.start - p) if tss_before else max_tss_dist 
        tss_after_dist = np.abs(tss_after.start - p) if tss_after else max_tss_dist
        tss = tss_before if tss_before_dist < tss_after_dist else tss_after
        dist = max_tss_dist
        if tss:
            dist = p - tss.start if tss.strand == '+' else tss.start - p if tss.strand == '-' else max_tss_dist
        genome_tss_dist[chrom][i] = dist
    
    
In [29]:
    
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]
    
    Out[29]:
In [30]:
    
genome_tss_dist['Pf3D7_01_v3'][2000:]
    
    Out[30]:
In [28]:
    
tbl_tss_v3
    
    Out[28]:
In [38]:
    
def plot_tss_variant_density(cross, bins=np.arange(-1025, 1026, 50), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.set_title(LABELS[cross])
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic tss distances
    a1 = np.concatenate(genome_tss_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df_variants = tabulate_intergenic_variants(cross).todataframe()
    
    # SNPs
    a2 = df_variants[df_variants.is_snp].tss_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='SNPs', lw=2)
    
    # indels
    a2 = df_variants[~df_variants.is_snp].tss_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels', lw=2)
    
    # tidy up
    ax.legend()
    ax.set_xlabel('distance to transcription start site')
    ax.set_ylabel('variant density')
    ax.set_ylim(0, 0.003)
    return ax
    
In [39]:
    
plot_tss_variant_density('3d7_hb3');
    
    
In [40]:
    
plot_tss_variant_density('hb3_dd2');
    
    
In [41]:
    
plot_tss_variant_density('7g8_gb4');
    
    
In [ ]: