In [1]:
    
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb
    
In [2]:
    
tbl_cp = (etl
    .fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS4.xls')
    .skip(2)
    .setheader(['egasp_score', 'chrom', 'strand', 'pos', 'gene'])
    .convert('chrom', lambda v: 'Pf3D7_%02d' % v)
    .convert('pos', int)
    .convert('strand', lambda v: '+' if v > 0 else '-')
)
tbl_cp.display(5)
    
    
In [3]:
    
# need to liftover to v3 coords
cp_v2_coords_fn = '../../../data/reference/ponts_2011/cp_v2_coords.bed'
cp_v3_coords_fn = '../../../data/reference/ponts_2011/cp_v3_coords.bed'
if not os.path.exists(cp_v3_coords_fn):
    tbl_cp.cut(1, 3, 3, 2, 4).convert(2, lambda v: v+1).data().totsv(cp_v2_coords_fn)
    !head {cp_v2_coords_fn}
    liftover = sh.Command('../../../opt/liftover/liftOver')
    chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
    liftover(cp_v2_coords_fn, chain_2to3, cp_v3_coords_fn, cp_v3_coords_fn + '.unmapped')
    !echo ------------------
    !head {cp_v3_coords_fn}
    !echo ------------------
    !head {cp_v3_coords_fn}.unmapped
tbl_cp_v3 = (etl
    .fromtsv(cp_v3_coords_fn)
    .cut(0, 1, 3, 4)
    .pushheader(['chrom', 'pos', 'strand', 'gene'])
    .convert('pos', int)
    .sort(key=('chrom', 'pos'))
)
tbl_cp_v3
    
    Out[3]:
In [4]:
    
tbl_cp_v3.nrows()
    
    Out[4]:
In [5]:
    
# build interval trees from the core promoter sites so we can find nearest
cp_trees = (
    tbl_cp_v3
    # need start and end fields, make them the same
    .rename('pos', 'start')
    .addfield('end', lambda v: v.start)
    .recordtrees('chrom', 'start', 'end')
)
cp_trees
    
    Out[5]:
In [6]:
    
# example of how to find nearest CP
cp_trees['Pf3D7_01_v3'].after(20000, max_dist=10000)
    
    Out[6]:
In [7]:
    
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')
    
    
In [91]:
    
def tabulate_intergenic_variants(cross, max_cp_dist=2000):
    V, _, _ = unpack_callset(callsets[cross])
    tbl_variants = (etl
        .fromarray(V[['CHROM', 'POS', 'is_snp', 'num_alleles', 'svlen', 'STR', 'RU', 'REF', 'ALT']])
        .convert(('svlen', 'ALT'), operator.itemgetter(0))
        .annex(etl
            .fromarray(V['EFF'][['Effect']])
        )
        .eq('Effect', 'INTERGENIC')
        .addfield('cp_before', lambda v: cp_trees[v.CHROM].before(v.POS, max_dist=max_cp_dist))
        .addfield('cp_after', lambda v: cp_trees[v.CHROM].after(v.POS, max_dist=max_cp_dist))
        .convert(('cp_before', 'cp_after'), lambda v: v[0] if v else tuple())
        .addfield('cp_before_dist', lambda v: np.abs(v.cp_before.start - v.POS) if v.cp_before else max_cp_dist)
        .addfield('cp_after_dist', lambda v: np.abs(v.cp_after.start - v.POS) if v.cp_after else max_cp_dist)
        .addfield('cp', lambda v: v.cp_before if v.cp_before_dist < v.cp_after_dist else v.cp_after)
        .cutout('cp_after', 'cp_before', 'cp_after_dist', 'cp_before_dist')
        .unpack('cp', ['cp_chrom', 'cp_pos', 'cp_strand', 'cp_gene'])
        .cutout('cp_chrom')
        .addfield('cp_dist', lambda v: v.POS - v.cp_pos if v.cp_strand == '+' else v.cp_pos - v.POS if v.cp_strand == '-' else None)
        .notnone('cp_dist')
        .addfield('cross', cross, index=0)
    )
    return tbl_variants
    
In [93]:
    
tabulate_intergenic_variants('3d7_hb3').display(5)
    
    
In [94]:
    
tbl_variants = etl.cat(*[tabulate_intergenic_variants(cross) for cross in CROSSES])
tbl_variants
    
    Out[94]:
In [95]:
    
df_variants = tbl_variants.todataframe()
df_variants
    
    Out[95]:
In [98]:
    
df_variants.describe()
    
    Out[98]:
In [100]:
    
df_variants.cross.value_counts()
    
    Out[100]:
Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest CP for each position.
In [19]:
    
tbl_regions_0b = (etl
    .fromtsv('../../../meta/regions-20130225.txt')
    .pushheader(['chrom', 'start', 'stop', 'region'])
    .convertnumbers()
)
tbl_regions_0b
    
    Out[19]:
In [20]:
    
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
           for chrom in CHROMOSOMES}
for region in tbl_regions_0b.eq('region', 'Core').records():
    genome_is_core[region.chrom][region.start:region.stop] = True
    
In [21]:
    
tbl_genes = (etl
    .fromgff3(GFF_FN)
    .selectin('type', {'gene', 'pseudogene'})
)
tbl_genes
    
    Out[21]:
In [22]:
    
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool) 
                   for chrom in CHROMOSOMES}
for gene in tbl_genes.records():
    genome_is_genic[gene.seqid][gene.start-1:gene.end] = True
    
In [23]:
    
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
                             for chrom in CHROMOSOMES}
genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
                              for chrom in CHROMOSOMES}
genome_pos_core_intergenic
    
    Out[23]:
In [24]:
    
max_cp_dist = 2000
genome_cp_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
                   for chrom in sorted(genome.keys())}
for chrom in CHROMOSOMES:
    log(chrom)
    pos = genome_pos_core_intergenic[chrom]
    for i, p in enumerate(pos):
        cp_before = cp_trees[chrom].before(p, max_dist=max_cp_dist)
        cp_before = cp_before[0] if cp_before else tuple()
        cp_after = cp_trees[chrom].after(p, max_dist=max_cp_dist)
        cp_after = cp_after[0] if cp_after else tuple()
        cp_before_dist = np.abs(cp_before.start - p) if cp_before else max_cp_dist 
        cp_after_dist = np.abs(cp_after.start - p) if cp_after else max_cp_dist
        cp = cp_before if cp_before_dist < cp_after_dist else cp_after
        dist = max_cp_dist
        if cp:
            dist = p - cp.start if cp.strand == '+' else cp.start - p if cp.strand == '-' else max_cp_dist
        genome_cp_dist[chrom][i] = dist
    
    
In [25]:
    
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]
    
    Out[25]:
In [26]:
    
genome_cp_dist['Pf3D7_01_v3'][2000:]
    
    Out[26]:
In [30]:
    
tbl_cp_v3.display(20)
    
    
In [116]:
    
def plot_cp_variant_density(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 3))
        if cross:
            ax.set_title(LABELS[cross])
        else:
            ax.set_title('all crosses')
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic cp distances
    a1 = np.concatenate(genome_cp_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df = df_variants
    if cross:
        df = df[df.cross == cross]
    
    # SNPs
    a2 = df[df.is_snp].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='SNPs', lw=2)
    
    # indels (STR)
    a2 = df[~df.is_snp & df.STR].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels (STR)', lw=2)
    
    # indels (non-STR)
    a2 = df[~df.is_snp & ~df.STR].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels (non-STR)', lw=2)
    
    # tidy up
    ax.legend()
    ax.set_xlabel('distance to core promoter')
    ax.set_ylabel('relative variant density')
#    ax.set_ylim(0, 0.006)
    return ax
    
In [125]:
    
plot_cp_variant_density(bins=np.arange(-600, 600, 20));
    
    
In [132]:
    
def plot_cp_variant_density_hist(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 3))
        if cross:
            ax.set_title(LABELS[cross])
        else:
            ax.set_title('all crosses')
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic cp distances
    a1 = np.concatenate(genome_cp_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df = df_variants
    if cross:
        df = df[df.cross == cross]
    
    bottom = 0
    width = bins[1] - bins[0]
    
    # SNPs
    a2 = df[df.is_snp].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y2 = h2 / h1
    ax.bar(x, y2, width=width, bottom=bottom, label='SNPs', color='b')
    bottom += y2
    
    # indels (STR)
    a3 = df[~df.is_snp & df.STR].cp_dist
    h3, _ = np.histogram(a3, bins=bins)
    y3 = h3 / h1
    ax.bar(x, y3, width=width, bottom=bottom, label='indels (STR)', color='g')
    bottom += y3
    
    # indels (non-STR)
    a4 = df[~df.is_snp & ~df.STR].cp_dist
    h4, _ = np.histogram(a4, bins=bins)
    y4 = h4 / h1
    ax.bar(x, y4, width=width, bottom=bottom, label='indels (non-STR)', color='r')
    bottom += y4
    # tidy up
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    ax.set_xlabel('distance to core promoter')
    ax.set_ylabel('relative variant density')
#    ax.set_ylim(0, 0.006)
    return ax
    
In [135]:
    
plot_cp_variant_density_hist(bins=np.arange(-600, 600, 30));
    
    
In [119]:
    
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-600, 600, 20))
    plt.show()
    
    
    
    
In [55]:
    
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 20))
    plt.show()
    
    
    
    
In [56]:
    
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 100))
    plt.show()
    
    
    
    
In [88]:
    
def plot_cp_svlen(cross, ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.set_title(LABELS[cross])
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # obtain variants
    variants = tabulate_intergenic_variants(cross).cut('is_snp', 'num_alleles', 'svlen', 'cp_dist').todataframe()
    
    # coords
    flt = ~variants.is_snp & (variants.num_alleles == 2)
    x = variants[flt].cp_dist.values
    y = variants[flt].svlen.values
#    ax.plot(x, y, 'bo', alpha=.1)
    ax.hexbin(x, y, extent=(-1000, 1000, -20, 20), gridsize=20)
#    sns.kdeplot(x, y, shade=True, ax=ax)
    
    return ax
    
plot_cp_svlen('3d7_hb3');
    
    
In [ ]:
    
tbl_variants = tabulate_intergenic_variants('3d7_hb3')
tbl_variants
    
In [63]:
    
tbl_variants.selectrangeopen('cp_dist', -100, 100).display(100)
    
    
In [ ]: