Exploration of variation in relation to transcription start sites (TSSs)


In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb

Prepare data on TSSs


In [2]:
tbl_tss = (etl
    .fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS2.xls')
    .skip(2)
    .setheader(['gene', 'description', 'chrom', 'pos', 'strand'])
    .convert('chrom', lambda v: 'Pf3D7_%02d' % v)
    .convert('pos', int)
    .convert('strand', lambda v: '+' if v > 0 else '-')
    .cut('chrom', 'pos', 'strand', 'gene')
)
tbl_tss.display(caption='TSS (Ponts et al. 2011, supplementary table S2)')


TSS (Ponts et al. 2011, supplementary table S2)
0|chrom 1|pos 2|strand 3|gene
Pf3D7_01 67555 - PFA0050c
Pf3D7_01 98636 + PFA0110w
Pf3D7_01 104567 + PFA0115w
Pf3D7_01 109992 - PFA0120c
Pf3D7_01 117026 - PFA0125c

...

N.B., need to lift over to Pf3D7 version 3 coordinates.


In [3]:
tss_v2_coords_fn = '../../../data/reference/ponts_2011/tss_v2_coords.bed'
tss_v3_coords_fn = '../../../data/reference/ponts_2011/tss_v3_coords.bed'

if not os.path.exists(tss_v3_coords_fn):

    tbl_tss.cut(0, 1, 1, 2, 3).convert(2, lambda v: v+1).data().totsv(tss_v2_coords_fn)
#     !head {tss_v2_coords_fn}

    liftover = sh.Command('../../../opt/liftover/liftOver')
    chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
    liftover(tss_v2_coords_fn, chain_2to3, tss_v3_coords_fn, tss_v3_coords_fn + '.unmapped')
#     !head {tss_v3_coords_fn}
#     !head {tss_v3_coords_fn}.unmapped

tbl_tss_v3 = (etl
    .fromtsv(tss_v3_coords_fn)
    .cut(0, 1, 3, 4)
    .pushheader(['chrom', 'pos', 'strand', 'gene'])
    .convert('pos', int)
    .sort(key=('chrom', 'pos'))
)
tbl_tss_v3


Out[3]:
0|chrom 1|pos 2|strand 3|gene
Pf3D7_01_v3 67322 - PFA0050c
Pf3D7_01_v3 98403 + PFA0110w
Pf3D7_01_v3 104334 + PFA0115w
Pf3D7_01_v3 109759 - PFA0120c
Pf3D7_01_v3 116792 - PFA0125c

...


In [42]:
tbl_tss_v3.nrows()


Out[42]:
2854

In [4]:
# build interval trees from the transcription start sites so we can find nearest
tss_trees = (
    tbl_tss_v3
    # need start and end fields, make them the same
    .rename('pos', 'start')
    .addfield('end', lambda v: v.start)
    .recordtrees('chrom', 'start', 'end')
)
tss_trees


Out[4]:
{'Pf3D7_01_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455398>,
 'Pf3D7_02_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d84553d0>,
 'Pf3D7_03_v3': <bx.intervals.intersection.IntervalTree at 0x7f92dc64ff30>,
 'Pf3D7_04_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455408>,
 'Pf3D7_05_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455440>,
 'Pf3D7_06_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455478>,
 'Pf3D7_07_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d84554e8>,
 'Pf3D7_08_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d84554b0>,
 'Pf3D7_09_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455520>,
 'Pf3D7_10_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455558>,
 'Pf3D7_11_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455590>,
 'Pf3D7_12_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d84555c8>,
 'Pf3D7_13_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455600>,
 'Pf3D7_14_v3': <bx.intervals.intersection.IntervalTree at 0x7f92d8455638>}

In [5]:
# example of how to find nearest TSS
tss_trees['Pf3D7_01_v3'].after(60000, max_dist=10000)


Out[5]:
[('Pf3D7_01_v3', 67322, '-', 'PFA0050c', 67322)]

Prepare variation data relative to TSSs


In [6]:
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')


2014-11-04 11:41:04.616084 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2014-11-04 11:41:04.617103 :: filter variants
2014-11-04 11:41:05.198335 :: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2014-11-04 11:41:05.221690 :: filter samples
2014-11-04 11:41:05.222285 :: filter calls
2014-11-04 11:41:05.222642 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2014-11-04 11:41:05.274288 :: filter variants
2014-11-04 11:41:05.710799 :: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2014-11-04 11:41:05.729494 :: filter samples
2014-11-04 11:41:05.729947 :: filter calls
2014-11-04 11:41:05.730251 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2014-11-04 11:41:05.741166 :: filter variants
2014-11-04 11:41:06.132988 :: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants
2014-11-04 11:41:06.153449 :: filter samples
2014-11-04 11:41:06.153956 :: filter calls

In [11]:
def tabulate_intergenic_variants(cross, max_tss_dist=2000):
    V, _, _ = unpack_callset(callsets[cross])
    tbl_variants = (etl
        .fromarray(V[['CHROM', 'POS', 'is_snp', 'svlen', 'STR', 'RU']])
        .annex(etl
            .fromarray(V['EFF'][['Effect']])
        )
        .eq('Effect', 'INTERGENIC')
        .addfield('tss_before', lambda v: tss_trees[v.CHROM].before(v.POS, max_dist=max_tss_dist))
        .addfield('tss_after', lambda v: tss_trees[v.CHROM].after(v.POS, max_dist=max_tss_dist))
        .convert(('tss_before', 'tss_after'), lambda v: v[0] if v else tuple())
        .addfield('tss_before_dist', lambda v: np.abs(v.tss_before.start - v.POS) if v.tss_before else max_tss_dist)
        .addfield('tss_after_dist', lambda v: np.abs(v.tss_after.start - v.POS) if v.tss_after else max_tss_dist)
        .addfield('tss', lambda v: v.tss_before if v.tss_before_dist < v.tss_after_dist else v.tss_after)
        .cutout('tss_after', 'tss_before', 'tss_after_dist', 'tss_before_dist')
        .unpack('tss', ['tss_chrom', 'tss_pos', 'tss_strand', 'tss_gene'])
        .cutout('tss_chrom')
        .addfield('tss_dist', lambda v: v.POS - v.tss_pos if v.tss_strand == '+' else v.tss_pos - v.POS if v.tss_strand == '-' else None)
    )
    return tbl_variants

In [13]:
tabulate_intergenic_variants('3d7_hb3').display(20)


0|CHROM 1|POS 2|is_snp 3|svlen 4|STR 5|RU 6|Effect 7|tss_pos 8|tss_strand 9|tss_gene 10|tss_dist
Pf3D7_01_v3 93901 False [-10 0] True AT INTERGENIC None None None None
Pf3D7_01_v3 94590 False [4 0] False . INTERGENIC None None None None
Pf3D7_01_v3 94993 False [-6 0] True AT INTERGENIC None None None None
Pf3D7_01_v3 97296 False [-2 0] True TA INTERGENIC 98403 + PFA0110w -1107
Pf3D7_01_v3 97825 False [-1 0] True T INTERGENIC 98403 + PFA0110w -578
Pf3D7_01_v3 103196 True [0 0] False . INTERGENIC 104334 + PFA0115w -1138
Pf3D7_01_v3 103413 False [2 0] True AT INTERGENIC 104334 + PFA0115w -921
Pf3D7_01_v3 104116 False [4 0] True AT INTERGENIC 104334 + PFA0115w -218
Pf3D7_01_v3 104408 False [2 0] True AT INTERGENIC 104334 + PFA0115w 74
Pf3D7_01_v3 104659 False [-8 0] True AT INTERGENIC 104334 + PFA0115w 325
Pf3D7_01_v3 105268 False [2 0] True TA INTERGENIC 104334 + PFA0115w 934
Pf3D7_01_v3 105501 False [4 0] True TATT INTERGENIC 104334 + PFA0115w 1167
Pf3D7_01_v3 105643 False [8 0] True TA INTERGENIC 104334 + PFA0115w 1309
Pf3D7_01_v3 106113 False [-12 0] True AT INTERGENIC 104334 + PFA0115w 1779
Pf3D7_01_v3 106975 False [-8 0] True AT INTERGENIC None None None None
Pf3D7_01_v3 108363 False [-4 0] True AT INTERGENIC 109759 - PFA0120c 1396
Pf3D7_01_v3 108675 False [-12 0] True TA INTERGENIC 109759 - PFA0120c 1084
Pf3D7_01_v3 108943 False [-8 0] True A INTERGENIC 109759 - PFA0120c 816
Pf3D7_01_v3 109393 False [18 0] True AT INTERGENIC 109759 - PFA0120c 366
Pf3D7_01_v3 109984 False [-2 0] True TA INTERGENIC 109759 - PFA0120c -225

...

Prepare genomic data relative to TSSs

Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest TSS for each position.


In [14]:
tbl_regions_0b = (etl
    .fromtsv('../../../meta/regions-20130225.txt')
    .pushheader(['chrom', 'start', 'stop', 'region'])
    .convertnumbers()
)
tbl_regions_0b


Out[14]:
0|chrom 1|start 2|stop 3|region
Pf3D7_01_v3 0 27336 SubtelomericRepeat
Pf3D7_01_v3 27336 92900 SubtelomericHypervariable
Pf3D7_01_v3 92900 457931 Core
Pf3D7_01_v3 457931 460311 Centromere
Pf3D7_01_v3 460311 575900 Core

...


In [17]:
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
           for chrom in CHROMOSOMES}

for region in tbl_regions_0b.eq('region', 'Core').records():
    genome_is_core[region.chrom][region.start:region.stop] = True

In [18]:
tbl_genes = (etl
    .fromgff3(GFF_FN)
    .selectin('type', {'gene', 'pseudogene'})
)
tbl_genes


Out[18]:
0|seqid 1|source 2|type 3|start 4|end 5|score 6|strand 7|phase 8|attributes
Pf3D7_01_v3 chado gene 29510 37126 . + . {'Name': 'VAR', 'feature_id': '20734001', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0005w', 'ID': 'PF3D7_0100100', 'timelastmodified': '07.09.2011 06:34:15 BST'}
Pf3D7_01_v3 chado gene 38982 40207 . - . {'Name': 'RIF', 'feature_id': '20725865', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0010c', 'ID': 'PF3D7_0100200', 'timelastmodified': '07.09.2011 05:28:48 BST'}
Pf3D7_01_v3 chado gene 42367 46507 . - . {'feature_id': '20725861', 'previous_systematic_id': 'PFA0015c', 'ID': 'PF3D7_0100300', 'timelastmodified': '07.09.2011 05:28:46 BST', 'isObsolete': 'false'}
Pf3D7_01_v3 chado gene 50363 51636 . + . {'Name': 'RIF', 'feature_id': '20725850', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0020w', 'ID': 'PF3D7_0100400', 'timelastmodified': '07.09.2011 05:28:40 BST'}
Pf3D7_01_v3 chado pseudogene 53169 53280 . - . {'Name': 'var pseudogene, exon 1', 'feature_id': '20725852', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0025c', 'ID': 'PF3D7_0100500', 'timelastmodified': '07.09.2011 05:28:40 BST'}

...


In [20]:
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool) 
                   for chrom in CHROMOSOMES}

for gene in tbl_genes.records():
    genome_is_genic[gene.seqid][gene.start-1:gene.end] = True

In [22]:
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
                             for chrom in CHROMOSOMES}

genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
                              for chrom in CHROMOSOMES}
genome_pos_core_intergenic


Out[22]:
{'Pf3D7_01_v3': array([ 92900,  92901,  92902, ..., 575897, 575898, 575899]),
 'Pf3D7_02_v3': array([105800, 105801, 105802, ..., 862497, 862498, 862499]),
 'Pf3D7_03_v3': array([  71919,   71920,   71921, ..., 1002868, 1002869, 1002870]),
 'Pf3D7_04_v3': array([  91420,   91421,   91422, ..., 1143987, 1143988, 1143989]),
 'Pf3D7_05_v3': array([  37900,   37901,   37902, ..., 1321387, 1321388, 1321389]),
 'Pf3D7_06_v3': array([  72350,   72351,   72352, ..., 1294827, 1294828, 1294829]),
 'Pf3D7_07_v3': array([  77100,   77101,   77102, ..., 1381597, 1381598, 1381599]),
 'Pf3D7_08_v3': array([  73560,   73561,   73562, ..., 1365463, 1365464, 1365465]),
 'Pf3D7_09_v3': array([  79100,   79101,   79102, ..., 1473557, 1473558, 1473559]),
 'Pf3D7_10_v3': array([  68970,   68971,   68972, ..., 1568705, 1568706, 1568707]),
 'Pf3D7_11_v3': array([ 110000,  110001,  110002, ..., 2003317, 2003318, 2003319]),
 'Pf3D7_12_v3': array([  60300,   60301,   60302, ..., 2163697, 2163698, 2163699]),
 'Pf3D7_13_v3': array([  74413,   74414,   74415, ..., 2791897, 2791898, 2791899]),
 'Pf3D7_14_v3': array([  35774,   35775,   35776, ..., 3254088, 3254089, 3254090])}

In [23]:
max_tss_dist = 2000

genome_tss_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
                   for chrom in sorted(genome.keys())}

for chrom in CHROMOSOMES:
    log(chrom)
    pos = genome_pos_core_intergenic[chrom]
    for i, p in enumerate(pos):
        tss_before = tss_trees[chrom].before(p, max_dist=max_tss_dist)
        tss_before = tss_before[0] if tss_before else tuple()
        tss_after = tss_trees[chrom].after(p, max_dist=max_tss_dist)
        tss_after = tss_after[0] if tss_after else tuple()
        tss_before_dist = np.abs(tss_before.start - p) if tss_before else max_tss_dist 
        tss_after_dist = np.abs(tss_after.start - p) if tss_after else max_tss_dist
        tss = tss_before if tss_before_dist < tss_after_dist else tss_after
        dist = max_tss_dist
        if tss:
            dist = p - tss.start if tss.strand == '+' else tss.start - p if tss.strand == '-' else max_tss_dist
        genome_tss_dist[chrom][i] = dist


2014-11-04 12:03:21.520227 :: Pf3D7_01_v3
2014-11-04 12:03:23.615262 :: Pf3D7_02_v3
2014-11-04 12:03:26.825294 :: Pf3D7_03_v3
2014-11-04 12:03:30.989030 :: Pf3D7_04_v3
2014-11-04 12:03:34.953315 :: Pf3D7_05_v3
2014-11-04 12:03:40.810692 :: Pf3D7_06_v3
2014-11-04 12:03:45.401742 :: Pf3D7_07_v3
2014-11-04 12:03:48.973156 :: Pf3D7_08_v3
2014-11-04 12:03:53.689960 :: Pf3D7_09_v3
2014-11-04 12:04:00.455264 :: Pf3D7_10_v3
2014-11-04 12:04:07.577698 :: Pf3D7_11_v3
2014-11-04 12:04:15.156139 :: Pf3D7_12_v3
2014-11-04 12:04:23.829108 :: Pf3D7_13_v3
2014-11-04 12:04:34.285063 :: Pf3D7_14_v3

In [29]:
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]


Out[29]:
array([ 96513,  96514,  96515, ..., 575897, 575898, 575899])

In [30]:
genome_tss_dist['Pf3D7_01_v3'][2000:]


Out[30]:
array([-1890, -1889, -1888, ...,  2000,  2000,  2000])

In [28]:
tbl_tss_v3


Out[28]:
0|chrom 1|pos 2|strand 3|gene
Pf3D7_01_v3 67322 - PFA0050c
Pf3D7_01_v3 98403 + PFA0110w
Pf3D7_01_v3 104334 + PFA0115w
Pf3D7_01_v3 109759 - PFA0120c
Pf3D7_01_v3 116792 - PFA0125c

...

Analyse variants density relative to TSSs


In [38]:
def plot_tss_variant_density(cross, bins=np.arange(-1025, 1026, 50), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.set_title(LABELS[cross])
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic tss distances
    a1 = np.concatenate(genome_tss_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df_variants = tabulate_intergenic_variants(cross).todataframe()
    
    # SNPs
    a2 = df_variants[df_variants.is_snp].tss_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='SNPs', lw=2)
    
    # indels
    a2 = df_variants[~df_variants.is_snp].tss_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels', lw=2)
    
    # tidy up
    ax.legend()
    ax.set_xlabel('distance to transcription start site')
    ax.set_ylabel('variant density')
    ax.set_ylim(0, 0.003)

    return ax

In [39]:
plot_tss_variant_density('3d7_hb3');



In [40]:
plot_tss_variant_density('hb3_dd2');



In [41]:
plot_tss_variant_density('7g8_gb4');



In [ ]: