Exploration of variation in relation to core promoters (CPs)


In [1]:
from __future__ import division, print_function
%run _shared_setup.ipynb
%run _plotting_setup.ipynb

Prepare data on CPs


In [2]:
tbl_cp = (etl
    .fromxls('../../../data/reference/ponts_2011/NIHMS400242-supplement-SuppTS4.xls')
    .skip(2)
    .setheader(['egasp_score', 'chrom', 'strand', 'pos', 'gene'])
    .convert('chrom', lambda v: 'Pf3D7_%02d' % v)
    .convert('pos', int)
    .convert('strand', lambda v: '+' if v > 0 else '-')
)
tbl_cp.display(5)


0|egasp_score 1|chrom 2|strand 3|pos 4|gene
1.0 Pf3D7_01 + 29297 PFA0005w
1.0 Pf3D7_01 + 35558 PFA0020w
1.0 Pf3D7_01 + 37985 PFA0020w
1.0 Pf3D7_01 - 50291 PFA0015c
1.0 Pf3D7_01 + 50348 PFA0020w

...


In [3]:
# need to liftover to v3 coords
cp_v2_coords_fn = '../../../data/reference/ponts_2011/cp_v2_coords.bed'
cp_v3_coords_fn = '../../../data/reference/ponts_2011/cp_v3_coords.bed'

if not os.path.exists(cp_v3_coords_fn):

    tbl_cp.cut(1, 3, 3, 2, 4).convert(2, lambda v: v+1).data().totsv(cp_v2_coords_fn)
    !head {cp_v2_coords_fn}

    liftover = sh.Command('../../../opt/liftover/liftOver')
    chain_2to3 = '/data/plasmodium/pfalciparum/pf-crosses/opt/liftover/2to3.liftOver'
    liftover(cp_v2_coords_fn, chain_2to3, cp_v3_coords_fn, cp_v3_coords_fn + '.unmapped')
    !echo ------------------
    !head {cp_v3_coords_fn}
    !echo ------------------
    !head {cp_v3_coords_fn}.unmapped

tbl_cp_v3 = (etl
    .fromtsv(cp_v3_coords_fn)
    .cut(0, 1, 3, 4)
    .pushheader(['chrom', 'pos', 'strand', 'gene'])
    .convert('pos', int)
    .sort(key=('chrom', 'pos'))
)
tbl_cp_v3


Out[3]:
0|chrom 1|pos 2|strand 3|gene
Pf3D7_01_v3 29074 + PFA0005w
Pf3D7_01_v3 35335 + PFA0020w
Pf3D7_01_v3 37762 + PFA0020w
Pf3D7_01_v3 50068 - PFA0015c
Pf3D7_01_v3 50125 + PFA0020w

...


In [4]:
tbl_cp_v3.nrows()


Out[4]:
3382

In [5]:
# build interval trees from the core promoter sites so we can find nearest
cp_trees = (
    tbl_cp_v3
    # need start and end fields, make them the same
    .rename('pos', 'start')
    .addfield('end', lambda v: v.start)
    .recordtrees('chrom', 'start', 'end')
)
cp_trees


Out[5]:
{'Pf3D7_01_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e565c8>,
 'Pf3D7_02_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56600>,
 'Pf3D7_03_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a39fd3f30>,
 'Pf3D7_04_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56638>,
 'Pf3D7_05_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56670>,
 'Pf3D7_06_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e566a8>,
 'Pf3D7_07_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e566e0>,
 'Pf3D7_08_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56718>,
 'Pf3D7_09_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56750>,
 'Pf3D7_10_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56788>,
 'Pf3D7_11_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e567c0>,
 'Pf3D7_12_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e567f8>,
 'Pf3D7_13_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56868>,
 'Pf3D7_14_v3': <bx.intervals.intersection.IntervalTree at 0x7f8a35e56830>}

In [6]:
# example of how to find nearest CP
cp_trees['Pf3D7_01_v3'].after(20000, max_dist=10000)


Out[6]:
[('Pf3D7_01_v3', 29074, '+', 'PFA0005w', 29074)]

Prepare variation data relative to CPs


In [7]:
# load callsets
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, variant_filter='FILTER_PASS')


2014-11-04 13:45:05.917059 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2014-11-04 13:45:05.917818 :: filter variants
2014-11-04 13:45:06.206606 :: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2014-11-04 13:45:06.229973 :: filter samples
2014-11-04 13:45:06.230350 :: noop
2014-11-04 13:45:06.230614 :: filter calls
2014-11-04 13:45:06.230898 :: noop
2014-11-04 13:45:06.231195 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2014-11-04 13:45:06.231697 :: filter variants
2014-11-04 13:45:06.611508 :: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2014-11-04 13:45:06.634304 :: filter samples
2014-11-04 13:45:06.634692 :: noop
2014-11-04 13:45:06.634906 :: filter calls
2014-11-04 13:45:06.635102 :: noop
2014-11-04 13:45:06.635300 :: loading /home/aliman/src/github/malariagen/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2014-11-04 13:45:06.635631 :: filter variants
2014-11-04 13:45:06.988186 :: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants
2014-11-04 13:45:07.009416 :: filter samples
2014-11-04 13:45:07.009884 :: noop
2014-11-04 13:45:07.010118 :: filter calls
2014-11-04 13:45:07.010316 :: noop

In [91]:
def tabulate_intergenic_variants(cross, max_cp_dist=2000):
    V, _, _ = unpack_callset(callsets[cross])
    tbl_variants = (etl
        .fromarray(V[['CHROM', 'POS', 'is_snp', 'num_alleles', 'svlen', 'STR', 'RU', 'REF', 'ALT']])
        .convert(('svlen', 'ALT'), operator.itemgetter(0))
        .annex(etl
            .fromarray(V['EFF'][['Effect']])
        )
        .eq('Effect', 'INTERGENIC')
        .addfield('cp_before', lambda v: cp_trees[v.CHROM].before(v.POS, max_dist=max_cp_dist))
        .addfield('cp_after', lambda v: cp_trees[v.CHROM].after(v.POS, max_dist=max_cp_dist))
        .convert(('cp_before', 'cp_after'), lambda v: v[0] if v else tuple())
        .addfield('cp_before_dist', lambda v: np.abs(v.cp_before.start - v.POS) if v.cp_before else max_cp_dist)
        .addfield('cp_after_dist', lambda v: np.abs(v.cp_after.start - v.POS) if v.cp_after else max_cp_dist)
        .addfield('cp', lambda v: v.cp_before if v.cp_before_dist < v.cp_after_dist else v.cp_after)
        .cutout('cp_after', 'cp_before', 'cp_after_dist', 'cp_before_dist')
        .unpack('cp', ['cp_chrom', 'cp_pos', 'cp_strand', 'cp_gene'])
        .cutout('cp_chrom')
        .addfield('cp_dist', lambda v: v.POS - v.cp_pos if v.cp_strand == '+' else v.cp_pos - v.POS if v.cp_strand == '-' else None)
        .notnone('cp_dist')
        .addfield('cross', cross, index=0)
    )
    return tbl_variants

In [93]:
tabulate_intergenic_variants('3d7_hb3').display(5)


0|cross 1|CHROM 2|POS 3|is_snp 4|num_alleles 5|svlen 6|STR 7|RU 8|REF 9|ALT 10|Effect 11|cp_pos 12|cp_strand 13|cp_gene 14|cp_dist
3d7_hb3 Pf3D7_01_v3 97296 False 2 -2 True TA TTA T INTERGENIC 98398 - PFA0100c 1102
3d7_hb3 Pf3D7_01_v3 97825 False 2 -1 True T AT A INTERGENIC 98398 - PFA0100c 573
3d7_hb3 Pf3D7_01_v3 117080 False 2 -2 True TA CTA C INTERGENIC 116896 - PFA0125c -184
3d7_hb3 Pf3D7_01_v3 117326 False 2 -18 False . AATAAATAAATAAATAAAT A INTERGENIC 116896 - PFA0125c -430
3d7_hb3 Pf3D7_01_v3 117342 False 2 -2 True AT AAT A INTERGENIC 116896 - PFA0125c -446

...


In [94]:
tbl_variants = etl.cat(*[tabulate_intergenic_variants(cross) for cross in CROSSES])
tbl_variants


Out[94]:
0|cross 1|CHROM 2|POS 3|is_snp 4|num_alleles 5|svlen 6|STR 7|RU 8|REF 9|ALT 10|Effect 11|cp_pos 12|cp_strand 13|cp_gene 14|cp_dist
3d7_hb3 Pf3D7_01_v3 97296 False 2 -2 True TA TTA T INTERGENIC 98398 - PFA0100c 1102
3d7_hb3 Pf3D7_01_v3 97825 False 2 -1 True T AT A INTERGENIC 98398 - PFA0100c 573
3d7_hb3 Pf3D7_01_v3 117080 False 2 -2 True TA CTA C INTERGENIC 116896 - PFA0125c -184
3d7_hb3 Pf3D7_01_v3 117326 False 2 -18 False . AATAAATAAATAAATAAAT A INTERGENIC 116896 - PFA0125c -430
3d7_hb3 Pf3D7_01_v3 117342 False 2 -2 True AT AAT A INTERGENIC 116896 - PFA0125c -446

...


In [95]:
df_variants = tbl_variants.todataframe()
df_variants


Out[95]:
cross CHROM POS is_snp num_alleles svlen STR RU REF ALT Effect cp_pos cp_strand cp_gene cp_dist
0 3d7_hb3 Pf3D7_01_v3 97296 False 2 -2 True TA TTA T INTERGENIC 98398 - PFA0100c 1102
1 3d7_hb3 Pf3D7_01_v3 97825 False 2 -1 True T AT A INTERGENIC 98398 - PFA0100c 573
2 3d7_hb3 Pf3D7_01_v3 117080 False 2 -2 True TA CTA C INTERGENIC 116896 - PFA0125c -184
3 3d7_hb3 Pf3D7_01_v3 117326 False 2 -18 False . AATAAATAAATAAATAAAT A INTERGENIC 116896 - PFA0125c -430
4 3d7_hb3 Pf3D7_01_v3 117342 False 2 -2 True AT AAT A INTERGENIC 116896 - PFA0125c -446
5 3d7_hb3 Pf3D7_01_v3 117427 False 2 -5 True TTAAT CTTAAT C INTERGENIC 116896 - PFA0125c -531
6 3d7_hb3 Pf3D7_01_v3 117524 False 2 15 True AAAAGAAAAAAAGAA C CAAAAGAAAAAAAGAA INTERGENIC 116896 - PFA0125c -628
7 3d7_hb3 Pf3D7_01_v3 118029 False 2 2 True TA G GTA INTERGENIC 116896 - PFA0125c -1133
8 3d7_hb3 Pf3D7_01_v3 118550 False 2 2 True AT A AAT INTERGENIC 116896 - PFA0125c -1654
9 3d7_hb3 Pf3D7_01_v3 122104 False 2 6 True AT A AATATAT INTERGENIC 124035 - PFA0130c 1931
10 3d7_hb3 Pf3D7_01_v3 123313 False 2 2 True AT C CAT INTERGENIC 124035 - PFA0130c 722
11 3d7_hb3 Pf3D7_01_v3 123589 False 2 4 True TA T TTATA INTERGENIC 124035 - PFA0130c 446
12 3d7_hb3 Pf3D7_01_v3 123770 False 2 -10 True AAAATAAAAA CAAAATAAAAA C INTERGENIC 124035 - PFA0130c 265
13 3d7_hb3 Pf3D7_01_v3 125757 True 2 0 False . T A INTERGENIC 124100 + PFA0135w 1657
14 3d7_hb3 Pf3D7_01_v3 125809 True 2 0 False . T G INTERGENIC 124100 + PFA0135w 1709
15 3d7_hb3 Pf3D7_01_v3 125815 False 2 -6 True AT AATATAT A INTERGENIC 124100 + PFA0135w 1715
16 3d7_hb3 Pf3D7_01_v3 125983 False 2 4 True AT A AATAT INTERGENIC 124100 + PFA0135w 1883
17 3d7_hb3 Pf3D7_01_v3 128164 True 2 0 False . T A INTERGENIC 128798 - PFA0140c 634
18 3d7_hb3 Pf3D7_01_v3 128186 True 2 0 False . T A INTERGENIC 128798 - PFA0140c 612
19 3d7_hb3 Pf3D7_01_v3 128543 False 2 12 True TA T TTATATATATATA INTERGENIC 128798 - PFA0140c 255
20 3d7_hb3 Pf3D7_01_v3 128805 False 2 -5 True A TAAAAA T INTERGENIC 128798 - PFA0140c -7
21 3d7_hb3 Pf3D7_01_v3 128831 False 2 -10 True ATAAA TATAAAATAAA T INTERGENIC 128798 - PFA0140c -33
22 3d7_hb3 Pf3D7_01_v3 128832 False 2 -6 False . ATAAAAT A INTERGENIC 128798 - PFA0140c -34
23 3d7_hb3 Pf3D7_01_v3 128841 True 2 0 False . A T INTERGENIC 128798 - PFA0140c -43
24 3d7_hb3 Pf3D7_01_v3 128883 False 2 7 True ATATATA C CATATATA INTERGENIC 128798 - PFA0140c -85
25 3d7_hb3 Pf3D7_01_v3 128930 False 2 -1 True T AT A INTERGENIC 128798 - PFA0140c -132
26 3d7_hb3 Pf3D7_01_v3 131672 False 2 1 True A C CA INTERGENIC 133220 + PFA0175w -1548
27 3d7_hb3 Pf3D7_01_v3 133996 False 2 -1 True T AT A INTERGENIC 134146 - PFA0150c 150
28 3d7_hb3 Pf3D7_01_v3 139335 False 2 -1 True A TA T INTERGENIC 139306 - PFA0155c -29
29 3d7_hb3 Pf3D7_01_v3 139488 False 2 -2 True A CAA C INTERGENIC 139306 - PFA0155c -182
30 3d7_hb3 Pf3D7_01_v3 139572 False 2 12 True AAAT A AAAATAAATAAAT INTERGENIC 139306 - PFA0155c -266
31 3d7_hb3 Pf3D7_01_v3 139801 False 2 1 False . A AT INTERGENIC 139306 - PFA0155c -495
32 3d7_hb3 Pf3D7_01_v3 139802 True 2 0 False . A T INTERGENIC 139306 - PFA0155c -496
33 3d7_hb3 Pf3D7_01_v3 139885 False 2 -2 True TA TTA T INTERGENIC 139306 - PFA0155c -579
34 3d7_hb3 Pf3D7_01_v3 140192 False 2 -16 True TA TTATATATATATATATA T INTERGENIC 140472 - PFA0155c 280
35 3d7_hb3 Pf3D7_01_v3 142370 False 2 -5 True AAAAT AAAAAT A INTERGENIC 143305 - PFA0160c 935
36 3d7_hb3 Pf3D7_01_v3 152823 False 2 7 False . A ATATATAT INTERGENIC 153525 - PFA0170c 702
37 3d7_hb3 Pf3D7_01_v3 153251 False 2 -4 True AT AATAT A INTERGENIC 153525 - PFA0170c 274
38 3d7_hb3 Pf3D7_01_v3 153597 False 2 -10 True TA TTATATATATA T INTERGENIC 153562 + PFA0175w 35
39 3d7_hb3 Pf3D7_01_v3 153961 False 2 -4 True TA TTATA T INTERGENIC 153562 + PFA0175w 399
40 3d7_hb3 Pf3D7_01_v3 160556 True 2 0 False . C T INTERGENIC 160830 + PFA0180w -274
41 3d7_hb3 Pf3D7_01_v3 160988 False 2 4 True AT A AATAT INTERGENIC 160830 + PFA0180w 158
42 3d7_hb3 Pf3D7_01_v3 161016 False 2 5 True TTTAT A ATTTAT INTERGENIC 160830 + PFA0180w 186
43 3d7_hb3 Pf3D7_01_v3 166413 False 2 -10 False . TTTTTTTTTTA T INTERGENIC 166435 + PFA0185w -22
44 3d7_hb3 Pf3D7_01_v3 175087 False 2 8 True AT C CATATATAT INTERGENIC 174934 + PFA0200w 153
45 3d7_hb3 Pf3D7_01_v3 175414 False 3 6 False . A ATATATT INTERGENIC 175803 + PFA0205w -389
46 3d7_hb3 Pf3D7_01_v3 175468 False 2 -1 True T AT A INTERGENIC 175803 + PFA0205w -335
47 3d7_hb3 Pf3D7_01_v3 176391 False 2 4 True AT A AATAT INTERGENIC 175803 + PFA0205w 588
48 3d7_hb3 Pf3D7_01_v3 176899 False 2 -2 True AT CAT C INTERGENIC 175803 + PFA0205w 1096
49 3d7_hb3 Pf3D7_01_v3 177040 True 2 0 False . A C INTERGENIC 175803 + PFA0205w 1237
50 3d7_hb3 Pf3D7_01_v3 177355 False 2 4 True TA T TTATA INTERGENIC 175803 + PFA0205w 1552
51 3d7_hb3 Pf3D7_01_v3 177554 True 2 0 False . T G INTERGENIC 175803 + PFA0205w 1751
52 3d7_hb3 Pf3D7_01_v3 182293 False 2 6 True AT C CATATAT INTERGENIC 184226 - PFA0210c 1933
53 3d7_hb3 Pf3D7_01_v3 182472 False 2 6 True TA G GTATATA INTERGENIC 184226 - PFA0210c 1754
54 3d7_hb3 Pf3D7_01_v3 184273 False 2 2 True AT A AAT INTERGENIC 184309 + PFA0220w -36
55 3d7_hb3 Pf3D7_01_v3 184617 False 2 -7 True TGAAAAG ATGAAAAG A INTERGENIC 184309 + PFA0220w 308
56 3d7_hb3 Pf3D7_01_v3 184700 False 2 10 True TA T TTATATATATA INTERGENIC 184309 + PFA0220w 391
57 3d7_hb3 Pf3D7_01_v3 185343 False 2 -1 False . AG A INTERGENIC 184309 + PFA0220w 1034
58 3d7_hb3 Pf3D7_01_v3 185353 False 3 2 True AAAAAATG A ATG INTERGENIC 184309 + PFA0220w 1044
59 3d7_hb3 Pf3D7_01_v3 185436 False 2 -4 True AT AATAT A INTERGENIC 184309 + PFA0220w 1127
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...

24874 rows × 15 columns


In [98]:
df_variants.describe()


Out[98]:
POS is_snp num_alleles svlen STR cp_pos cp_dist
count 24874.000000 24874 24874.000000 24874.000000 24874 24874.000000 24874.000000
mean 980142.372316 0.2406529 2.061751 -0.947737 0.6347592 980155.090657 110.956742
std 680303.850524 0.4274885 0.240708 7.863508 0.4815073 680327.119997 935.610310
min 35796.000000 False 2.000000 -176.000000 False 35892.000000 -1999.000000
25% 459425.750000 0 2.000000 -4.000000 0 459285.000000 -488.000000
50% 831140.000000 0 2.000000 0.000000 1 830873.000000 128.000000
75% 1324667.250000 0 2.000000 2.000000 1 1324851.000000 715.000000
max 3246795.000000 True 3.000000 401.000000 True 3246603.000000 2000.000000

8 rows × 7 columns


In [100]:
df_variants.cross.value_counts()


Out[100]:
3d7_hb3    9794
hb3_dd2    7939
7g8_gb4    7141
dtype: int64

Prepare genomic data relative to CPs

Need to locate genome positions that are (1) core and (2) intergenic, then calculate distance to nearest CP for each position.


In [19]:
tbl_regions_0b = (etl
    .fromtsv('../../../meta/regions-20130225.txt')
    .pushheader(['chrom', 'start', 'stop', 'region'])
    .convertnumbers()
)
tbl_regions_0b


Out[19]:
0|chrom 1|start 2|stop 3|region
Pf3D7_01_v3 0 27336 SubtelomericRepeat
Pf3D7_01_v3 27336 92900 SubtelomericHypervariable
Pf3D7_01_v3 92900 457931 Core
Pf3D7_01_v3 457931 460311 Centromere
Pf3D7_01_v3 460311 575900 Core

...


In [20]:
genome_is_core = {chrom: np.zeros_like(genome[chrom], dtype=np.bool)
           for chrom in CHROMOSOMES}

for region in tbl_regions_0b.eq('region', 'Core').records():
    genome_is_core[region.chrom][region.start:region.stop] = True

In [21]:
tbl_genes = (etl
    .fromgff3(GFF_FN)
    .selectin('type', {'gene', 'pseudogene'})
)
tbl_genes


Out[21]:
0|seqid 1|source 2|type 3|start 4|end 5|score 6|strand 7|phase 8|attributes
Pf3D7_01_v3 chado gene 29510 37126 . + . {'Name': 'VAR', 'feature_id': '20734001', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0005w', 'ID': 'PF3D7_0100100', 'timelastmodified': '07.09.2011 06:34:15 BST'}
Pf3D7_01_v3 chado gene 38982 40207 . - . {'Name': 'RIF', 'feature_id': '20725865', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0010c', 'ID': 'PF3D7_0100200', 'timelastmodified': '07.09.2011 05:28:48 BST'}
Pf3D7_01_v3 chado gene 42367 46507 . - . {'feature_id': '20725861', 'previous_systematic_id': 'PFA0015c', 'ID': 'PF3D7_0100300', 'timelastmodified': '07.09.2011 05:28:46 BST', 'isObsolete': 'false'}
Pf3D7_01_v3 chado gene 50363 51636 . + . {'Name': 'RIF', 'feature_id': '20725850', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0020w', 'ID': 'PF3D7_0100400', 'timelastmodified': '07.09.2011 05:28:40 BST'}
Pf3D7_01_v3 chado pseudogene 53169 53280 . - . {'Name': 'var pseudogene, exon 1', 'feature_id': '20725852', 'isObsolete': 'false', 'previous_systematic_id': 'PFA0025c', 'ID': 'PF3D7_0100500', 'timelastmodified': '07.09.2011 05:28:40 BST'}

...


In [22]:
genome_is_genic = {chrom: np.zeros_like(genome[chrom], dtype=np.bool) 
                   for chrom in CHROMOSOMES}

for gene in tbl_genes.records():
    genome_is_genic[gene.seqid][gene.start-1:gene.end] = True

In [23]:
genome_is_core_intergenic = {chrom: genome_is_core[chrom] & ~genome_is_genic[chrom]
                             for chrom in CHROMOSOMES}

genome_pos_core_intergenic = {chrom: np.nonzero(genome_is_core_intergenic[chrom])[0]
                              for chrom in CHROMOSOMES}
genome_pos_core_intergenic


Out[23]:
{'Pf3D7_01_v3': array([ 92900,  92901,  92902, ..., 575897, 575898, 575899]),
 'Pf3D7_02_v3': array([105800, 105801, 105802, ..., 862497, 862498, 862499]),
 'Pf3D7_03_v3': array([  71919,   71920,   71921, ..., 1002868, 1002869, 1002870]),
 'Pf3D7_04_v3': array([  91420,   91421,   91422, ..., 1143987, 1143988, 1143989]),
 'Pf3D7_05_v3': array([  37900,   37901,   37902, ..., 1321387, 1321388, 1321389]),
 'Pf3D7_06_v3': array([  72350,   72351,   72352, ..., 1294827, 1294828, 1294829]),
 'Pf3D7_07_v3': array([  77100,   77101,   77102, ..., 1381597, 1381598, 1381599]),
 'Pf3D7_08_v3': array([  73560,   73561,   73562, ..., 1365463, 1365464, 1365465]),
 'Pf3D7_09_v3': array([  79100,   79101,   79102, ..., 1473557, 1473558, 1473559]),
 'Pf3D7_10_v3': array([  68970,   68971,   68972, ..., 1568705, 1568706, 1568707]),
 'Pf3D7_11_v3': array([ 110000,  110001,  110002, ..., 2003317, 2003318, 2003319]),
 'Pf3D7_12_v3': array([  60300,   60301,   60302, ..., 2163697, 2163698, 2163699]),
 'Pf3D7_13_v3': array([  74413,   74414,   74415, ..., 2791897, 2791898, 2791899]),
 'Pf3D7_14_v3': array([  35774,   35775,   35776, ..., 3254088, 3254089, 3254090])}

In [24]:
max_cp_dist = 2000

genome_cp_dist = {chrom: np.zeros_like(genome_pos_core_intergenic[chrom])
                   for chrom in sorted(genome.keys())}

for chrom in CHROMOSOMES:
    log(chrom)
    pos = genome_pos_core_intergenic[chrom]
    for i, p in enumerate(pos):
        cp_before = cp_trees[chrom].before(p, max_dist=max_cp_dist)
        cp_before = cp_before[0] if cp_before else tuple()
        cp_after = cp_trees[chrom].after(p, max_dist=max_cp_dist)
        cp_after = cp_after[0] if cp_after else tuple()
        cp_before_dist = np.abs(cp_before.start - p) if cp_before else max_cp_dist 
        cp_after_dist = np.abs(cp_after.start - p) if cp_after else max_cp_dist
        cp = cp_before if cp_before_dist < cp_after_dist else cp_after
        dist = max_cp_dist
        if cp:
            dist = p - cp.start if cp.strand == '+' else cp.start - p if cp.strand == '-' else max_cp_dist
        genome_cp_dist[chrom][i] = dist


2014-11-04 13:52:35.672030 :: Pf3D7_01_v3
2014-11-04 13:52:37.630505 :: Pf3D7_02_v3
2014-11-04 13:52:40.718799 :: Pf3D7_03_v3
2014-11-04 13:52:44.498020 :: Pf3D7_04_v3
2014-11-04 13:52:49.007844 :: Pf3D7_05_v3
2014-11-04 13:52:54.671499 :: Pf3D7_06_v3
2014-11-04 13:52:59.795122 :: Pf3D7_07_v3
2014-11-04 13:53:04.423435 :: Pf3D7_08_v3
2014-11-04 13:53:10.273788 :: Pf3D7_09_v3
2014-11-04 13:53:16.860611 :: Pf3D7_10_v3
2014-11-04 13:53:23.555634 :: Pf3D7_11_v3
2014-11-04 13:53:32.342878 :: Pf3D7_12_v3
2014-11-04 13:53:39.862025 :: Pf3D7_13_v3
2014-11-04 13:53:50.900521 :: Pf3D7_14_v3

In [25]:
genome_pos_core_intergenic['Pf3D7_01_v3'][2000:]


Out[25]:
array([ 96513,  96514,  96515, ..., 575897, 575898, 575899])

In [26]:
genome_cp_dist['Pf3D7_01_v3'][2000:]


Out[26]:
array([1885, 1884, 1883, ..., 2000, 2000, 2000])

In [30]:
tbl_cp_v3.display(20)


0|chrom 1|pos 2|strand 3|gene
Pf3D7_01_v3 29074 + PFA0005w
Pf3D7_01_v3 35335 + PFA0020w
Pf3D7_01_v3 37762 + PFA0020w
Pf3D7_01_v3 50068 - PFA0015c
Pf3D7_01_v3 50125 + PFA0020w
Pf3D7_01_v3 52772 + PFA0040w
Pf3D7_01_v3 52854 - PFA0015c
Pf3D7_01_v3 53049 + PFA0040w
Pf3D7_01_v3 55814 + PFA0040w
Pf3D7_01_v3 74205 - PFA0055c
Pf3D7_01_v3 74244 + PFA0065w
Pf3D7_01_v3 80503 - PFA0055c
Pf3D7_01_v3 80541 + PFA0110w
Pf3D7_01_v3 83351 - PFA0080c
Pf3D7_01_v3 83429 + PFA0110w
Pf3D7_01_v3 98398 - PFA0100c
Pf3D7_01_v3 98541 + PFA0110w
Pf3D7_01_v3 116896 - PFA0125c
Pf3D7_01_v3 124035 - PFA0130c
Pf3D7_01_v3 124100 + PFA0135w

...

Analyse variants density relative to CPs


In [116]:
def plot_cp_variant_density(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 3))
        if cross:
            ax.set_title(LABELS[cross])
        else:
            ax.set_title('all crosses')
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic cp distances
    a1 = np.concatenate(genome_cp_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df = df_variants
    if cross:
        df = df[df.cross == cross]
    
    # SNPs
    a2 = df[df.is_snp].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='SNPs', lw=2)
    
    # indels (STR)
    a2 = df[~df.is_snp & df.STR].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels (STR)', lw=2)
    
    # indels (non-STR)
    a2 = df[~df.is_snp & ~df.STR].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y = h2 / h1
    ax.plot(x, y, label='indels (non-STR)', lw=2)
    
    # tidy up
    ax.legend()
    ax.set_xlabel('distance to core promoter')
    ax.set_ylabel('relative variant density')
#    ax.set_ylim(0, 0.006)

    return ax

In [125]:
plot_cp_variant_density(bins=np.arange(-600, 600, 20));



In [132]:
def plot_cp_variant_density_hist(cross=None, bins=np.arange(-1000, 1000, 100), ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 3))
        if cross:
            ax.set_title(LABELS[cross])
        else:
            ax.set_title('all crosses')
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # x coords
    x = (bins[:-1] + bins[1:]) / 2
    
    # genomic cp distances
    a1 = np.concatenate(genome_cp_dist.values())
    h1, _ = np.histogram(a1, bins=bins)
    
    # obtain variants
    df = df_variants
    if cross:
        df = df[df.cross == cross]
    
    bottom = 0
    width = bins[1] - bins[0]
    
    # SNPs
    a2 = df[df.is_snp].cp_dist
    h2, _ = np.histogram(a2, bins=bins)
    y2 = h2 / h1
    ax.bar(x, y2, width=width, bottom=bottom, label='SNPs', color='b')
    bottom += y2
    
    # indels (STR)
    a3 = df[~df.is_snp & df.STR].cp_dist
    h3, _ = np.histogram(a3, bins=bins)
    y3 = h3 / h1
    ax.bar(x, y3, width=width, bottom=bottom, label='indels (STR)', color='g')
    bottom += y3
    
    # indels (non-STR)
    a4 = df[~df.is_snp & ~df.STR].cp_dist
    h4, _ = np.histogram(a4, bins=bins)
    y4 = h4 / h1
    ax.bar(x, y4, width=width, bottom=bottom, label='indels (non-STR)', color='r')
    bottom += y4

    # tidy up
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    ax.set_xlabel('distance to core promoter')
    ax.set_ylabel('relative variant density')
#    ax.set_ylim(0, 0.006)

    return ax

In [135]:
plot_cp_variant_density_hist(bins=np.arange(-600, 600, 30));



In [119]:
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-600, 600, 20))
    plt.show()



In [55]:
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 20))
    plt.show()



In [56]:
for cross in CROSSES:
    plot_cp_variant_density(cross, bins=np.arange(-1000, 1000, 100))
    plt.show()


Sandbox


In [88]:
def plot_cp_svlen(cross, ax=None):
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.set_title(LABELS[cross])
    sns.despine(ax=ax)
    sns.offset_spines(ax=ax)
    
    # obtain variants
    variants = tabulate_intergenic_variants(cross).cut('is_snp', 'num_alleles', 'svlen', 'cp_dist').todataframe()
    
    # coords
    flt = ~variants.is_snp & (variants.num_alleles == 2)
    x = variants[flt].cp_dist.values
    y = variants[flt].svlen.values
#    ax.plot(x, y, 'bo', alpha=.1)
    ax.hexbin(x, y, extent=(-1000, 1000, -20, 20), gridsize=20)
#    sns.kdeplot(x, y, shade=True, ax=ax)
    
    return ax
    
plot_cp_svlen('3d7_hb3');



In [ ]:
tbl_variants = tabulate_intergenic_variants('3d7_hb3')
tbl_variants

In [63]:
tbl_variants.selectrangeopen('cp_dist', -100, 100).display(100)


0|CHROM 1|POS 2|is_snp 3|svlen 4|STR 5|RU 6|REF 7|ALT 8|Effect 9|cp_pos 10|cp_strand 11|cp_gene 12|cp_dist
Pf3D7_01_v3 128805 False [-5 0] True A TAAAAA ['T' ''] INTERGENIC 128798 - PFA0140c -7
Pf3D7_01_v3 128831 False [-10 0] True ATAAA TATAAAATAAA ['T' ''] INTERGENIC 128798 - PFA0140c -33
Pf3D7_01_v3 128832 False [-6 0] False . ATAAAAT ['A' ''] INTERGENIC 128798 - PFA0140c -34
Pf3D7_01_v3 128841 True [0 0] False . A ['T' ''] INTERGENIC 128798 - PFA0140c -43
Pf3D7_01_v3 128883 False [7 0] True ATATATA C ['CATATATA' ''] INTERGENIC 128798 - PFA0140c -85
Pf3D7_01_v3 139335 False [-1 0] True A TA ['T' ''] INTERGENIC 139306 - PFA0155c -29
Pf3D7_01_v3 153597 False [-10 0] True TA TTATATATATA ['T' ''] INTERGENIC 153562 + PFA0175w 35
Pf3D7_01_v3 166413 False [-10 0] False . TTTTTTTTTTA ['T' ''] INTERGENIC 166435 + PFA0185w -22
Pf3D7_01_v3 184273 False [2 0] True AT A ['AAT' ''] INTERGENIC 184309 + PFA0220w -36
Pf3D7_01_v3 204164 False [-5 0] False . ATATAT ['A' ''] INTERGENIC 204238 + PFA0235w -74
Pf3D7_01_v3 204214 True [0 0] False . G ['C' ''] INTERGENIC 204238 + PFA0235w -24
Pf3D7_01_v3 259082 False [-24 0] True AAAAAAAG AAAAAAAAGAAAAAAAGAAAAAAAG ['A' ''] INTERGENIC 259099 + PFA0315w -17
Pf3D7_01_v3 259097 False [-9 0] False . AGAAAAAAAG ['A' ''] INTERGENIC 259099 + PFA0315w -2
Pf3D7_01_v3 295213 False [-6 0] False . ATTTATT ['A' ''] INTERGENIC 295127 - PFA0310c -86
Pf3D7_01_v3 319673 True [0 0] False . G ['A' ''] INTERGENIC 319708 - PFA0375c 35
Pf3D7_01_v3 319791 False [-1 0] True T AT ['A' ''] INTERGENIC 319708 - PFA0375c -83
Pf3D7_01_v3 337063 False [-8 0] False . ACAATAAAT ['A' ''] INTERGENIC 337051 - PFA0400c -12
Pf3D7_01_v3 337064 False [-8 0] True AATA CAATAAATA ['C' ''] INTERGENIC 337051 - PFA0400c -13
Pf3D7_01_v3 348366 False [-2 0] True TA TTA ['T' ''] INTERGENIC 348459 - PFA0415c 93
Pf3D7_01_v3 349914 False [10 0] True AT A ['AATATATATAT' ''] INTERGENIC 349988 + PFA0435w -74
Pf3D7_01_v3 388167 True [0 0] False . A ['G' ''] INTERGENIC 388165 + PFA0490w 2
Pf3D7_01_v3 392178 False [-1 0] True A TA ['T' ''] INTERGENIC 392252 + PFA0500w -74
Pf3D7_01_v3 413662 False [9 0] True ATATATATA T ['TATATATATA' ''] INTERGENIC 413714 + PFA0525w -52
Pf3D7_01_v3 413664 False [7 0] True ATATATA T ['TATATATA' ''] INTERGENIC 413714 + PFA0525w -50
Pf3D7_01_v3 413666 False [5 0] True ATATA T ['TATATA' ''] INTERGENIC 413714 + PFA0525w -48
Pf3D7_01_v3 431258 False [-22 0] True AT AATATATATATATATATATATAT ['A' ''] INTERGENIC 431325 + PFA0550w -67
Pf3D7_01_v3 538686 False [-30 0] False . TAAAAACAAAAATAAAAACAAAAATAAAAAC ['T' ''] INTERGENIC 538693 + PFA0675w -7
Pf3D7_02_v3 114668 False [2 0] True TA T ['TTA' ''] INTERGENIC 114629 - PFB0106c -39
Pf3D7_02_v3 141395 True [0 0] False . T ['A' ''] INTERGENIC 141363 + PFB0160w 32
Pf3D7_02_v3 148453 False [-12 0] True AT CATATATATATAT ['C' ''] INTERGENIC 148553 - PFB0145c 100
Pf3D7_02_v3 178721 False [4 0] True ATAA T ['TATAA' ''] INTERGENIC 178803 + PFB0185w -82
Pf3D7_02_v3 181582 False [-3 0] False . AAAT ['A' ''] INTERGENIC 181553 - PFB0177c -29
Pf3D7_02_v3 181583 False [-2 0] True AT AAT ['A' ''] INTERGENIC 181553 - PFB0177c -30
Pf3D7_02_v3 181584 False [-1 0] False . AT ['A' ''] INTERGENIC 181553 - PFB0177c -31
Pf3D7_02_v3 192076 False [4 0] True TA T ['TTATA' ''] INTERGENIC 192154 + PFB0194w -78
Pf3D7_02_v3 207566 False [2 0] True AT A ['AAT' ''] INTERGENIC 207643 + PFB0220w -77
Pf3D7_02_v3 211150 False [14 0] True AT A ['AATATATATATATAT' ''] INTERGENIC 211073 + PFB0220w 77
Pf3D7_02_v3 211165 False [15 0] False . A ['ATATATATATATATAT' ''] INTERGENIC 211073 + PFB0220w 92
Pf3D7_02_v3 211166 False [13 0] False . T ['TATATATATATATA' ''] INTERGENIC 211073 + PFB0220w 93
Pf3D7_02_v3 222932 False [2 0] True AT A ['AAT' ''] INTERGENIC 223031 + PFB0240w -99
Pf3D7_02_v3 223017 False [-5 0] True TTTAT ATTTAT ['A' ''] INTERGENIC 223031 + PFB0240w -14
Pf3D7_02_v3 244228 False [10 0] True TA T ['TTATATATATA' ''] INTERGENIC 244178 - PFB0265c -50
Pf3D7_02_v3 271487 False [8 0] True TA T ['TTATATATA' ''] INTERGENIC 271390 + PFB0295w 97
Pf3D7_02_v3 370213 False [-2 0] True AT AAT ['A' ''] INTERGENIC 370289 - PFB0391c-b 76
Pf3D7_02_v3 370357 False [-8 0] True TATTTTTA TTATTTTTA ['T' ''] INTERGENIC 370338 + PFB0405w 19
Pf3D7_02_v3 391188 False [2 0] True AT C ['CAT' ''] INTERGENIC 391156 - PFB0423c -32
Pf3D7_02_v3 406578 False [2 0] True T A ['ATT' ''] INTERGENIC 406561 - PFB0445c -17
Pf3D7_02_v3 409874 False [-2 0] True TA GTA ['G' ''] INTERGENIC 409879 - PFB0445c 5
Pf3D7_02_v3 435851 False [-14 0] True AT AATATATATATATAT ['A' ''] INTERGENIC 435946 - PFB0475c 95
Pf3D7_02_v3 437407 False [11 0] True AATAATAAAAT A ['AAATAATAAAAT' ''] INTERGENIC 437370 + PFB0495w 37
Pf3D7_02_v3 468575 False [-2 0] True T ATT ['A' ''] INTERGENIC 468574 + PFB0520w 1
Pf3D7_02_v3 468603 False [5 0] True TTATT A ['ATTATT' ''] INTERGENIC 468574 + PFB0520w 29
Pf3D7_02_v3 526277 False [-7 0] True TTTTATT ATTTTATT ['A' ''] INTERGENIC 526305 - PFB0575c 28
Pf3D7_02_v3 526289 False [-1 0] True T AT ['A' ''] INTERGENIC 526305 - PFB0575c 16
Pf3D7_02_v3 526291 False [-9 0] False . TTTTTATTTA ['T' ''] INTERGENIC 526305 - PFB0575c 14
Pf3D7_02_v3 558405 False [-4 0] True ATAA TATAA ['T' ''] INTERGENIC 558482 - PFB0615c 77
Pf3D7_02_v3 592548 False [-4 0] False . TTTTA ['T' ''] INTERGENIC 592595 + PFB0665w -47
Pf3D7_02_v3 592550 False [-4 0] False . TTATT ['A' ''] INTERGENIC 592595 + PFB0665w -45
Pf3D7_02_v3 592554 False [3 0] False . T ['TTTA' ''] INTERGENIC 592595 + PFB0665w -41
Pf3D7_02_v3 616654 False [-9 0] True TTA TTTATTATTA ['T' ''] INTERGENIC 616671 - PFB0670c 17
Pf3D7_02_v3 699837 False [2 0] True AT A ['AAT' ''] INTERGENIC 699893 - PFB0770c 56
Pf3D7_02_v3 709674 False [-4 0] False . ATTTA ['T' ''] INTERGENIC 709771 + PFB0810w -97
Pf3D7_02_v3 709676 False [-2 0] True TA TTA ['T' ''] INTERGENIC 709771 + PFB0810w -95
Pf3D7_02_v3 719348 False [-2 0] True AT AAT ['A' ''] INTERGENIC 719283 + PFB0815w 65
Pf3D7_02_v3 747153 False [-2 0] True AT AAT ['A' ''] INTERGENIC 747223 - PFB0850c 70
Pf3D7_02_v3 783036 False [2 0] True TA T ['TTA' ''] INTERGENIC 783099 - PFB0895c 63
Pf3D7_02_v3 835066 False [-14 0] True TTTTTTA TTTTTTTATTTTTTA ['T' ''] INTERGENIC 835064 - PFB0926c -2
Pf3D7_03_v3 112542 False [-1 0] False . AT ['A' ''] INTERGENIC 112495 + PFC0105w 47
Pf3D7_03_v3 118650 True [0 0] False . A ['G' ''] INTERGENIC 118712 - PFC0100c 62
Pf3D7_03_v3 118713 True [0 0] False . A ['T' ''] INTERGENIC 118712 - PFC0100c -1
Pf3D7_03_v3 118714 False [-11 0] False . TGAGAAAAAAAA ['T' ''] INTERGENIC 118712 - PFC0100c -2
Pf3D7_03_v3 168897 True [0 0] False . A ['T' ''] INTERGENIC 168799 + PFC0150w 98
Pf3D7_03_v3 181909 False [-12 0] True AT CATATATATATAT ['C' ''] INTERGENIC 181980 + PFC0165w -71
Pf3D7_03_v3 235429 False [5 0] True AAAGG A ['AAAAGG' ''] INTERGENIC 235393 - PFC0225c -36
Pf3D7_03_v3 250684 False [-12 0] True AT AATATATATATAT ['A' ''] INTERGENIC 250584 + PFC0235w 100
Pf3D7_03_v3 250988 True [0 0] False . T ['A' ''] INTERGENIC 250923 + PFC0235w 65
Pf3D7_03_v3 368436 False [-4 0] True AT AATAT ['A' ''] INTERGENIC 368395 - PFC0355c -41
Pf3D7_03_v3 420488 False [18 0] True AT C ['CATATATATATATATATAT' ''] INTERGENIC 420568 + PFC0416w -80
Pf3D7_03_v3 425772 True [0 0] False . A ['T' ''] INTERGENIC 425802 - PFC0415c 30
Pf3D7_03_v3 425873 True [0 0] False . C ['T' ''] INTERGENIC 425802 - PFC0415c -71
Pf3D7_03_v3 441156 True [0 0] False . A ['T' ''] INTERGENIC 441179 - PFC0415c 23
Pf3D7_03_v3 441207 False [-2 0] True AT CAT ['C' ''] INTERGENIC 441179 - PFC0415c -28
Pf3D7_03_v3 473908 False [9 0] True TATTATTAA T ['TTATTATTAA' ''] INTERGENIC 473863 + PFC0470w 45
Pf3D7_03_v3 508877 False [-6 0] True TA TTATATA ['T' ''] INTERGENIC 508958 + PFC0506w -81
Pf3D7_03_v3 508932 True [0 0] False . T ['A' ''] INTERGENIC 508958 + PFC0506w -26
Pf3D7_03_v3 513011 False [-5 0] False . AAATAT ['A' ''] INTERGENIC 513085 + PFC0510w -74
Pf3D7_03_v3 516356 True [0 0] False . A ['T' ''] INTERGENIC 516410 + PFC0520w -54
Pf3D7_03_v3 516378 True [0 0] False . T ['A' ''] INTERGENIC 516410 + PFC0520w -32
Pf3D7_03_v3 527083 False [-11 0] False . ATATATATTTTT ['A' ''] INTERGENIC 527128 + PFC0530w -45
Pf3D7_03_v3 535456 False [4 0] True ATTT C ['CATTT' ''] INTERGENIC 535530 + PFC0540w -74
Pf3D7_03_v3 538837 False [-49 0] False . AAAAAAAAAAAAAAAATTTTTTTAAATGTTATATATACAAAAACAGATAT ['A' ''] INTERGENIC 538867 + PFC0550w -30
Pf3D7_03_v3 543082 False [-8 0] True TATTCATT ATATTCATT ['A' ''] INTERGENIC 543011 + PFC0565w 71
Pf3D7_03_v3 548445 False [-8 0] True AT AATATATAT ['A' ''] INTERGENIC 548534 + PFC0565w -89
Pf3D7_03_v3 593458 False [18 0] True TTTATA T ['TTTTATATTTATATTTATA' ''] INTERGENIC 593456 + PFC0615w 2
Pf3D7_03_v3 644096 False [-14 0] False . TTTTTTATTTATTTA ['T' ''] INTERGENIC 644076 - PFC0690c -20
Pf3D7_03_v3 649724 False [-10 0] True T ATTTTTTTTTT ['A' ''] INTERGENIC 649725 - PFC0700c 1
Pf3D7_03_v3 649812 False [-1 0] False . TA ['T' ''] INTERGENIC 649785 + PFC0710w-a 27
Pf3D7_03_v3 673446 False [-4 0] True TA TTATA ['T' ''] INTERGENIC 673510 + PFC0730w -64
Pf3D7_03_v3 673558 False [-2 0] True TA TTA ['T' ''] INTERGENIC 673510 + PFC0730w 48
Pf3D7_03_v3 686222 False [6 0] True TTTTTA T ['TTTTTTA' ''] INTERGENIC 686229 + PFC0750w -7

...


In [ ]: