In [1]:
%run ../../shared_setup.ipynb


docker image cggh/biipy:v1.6.0

In [2]:
def tabulate_tr(chrom):
    fn = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/%s.tr.pickle' % chrom
    tbl_tr = etl.frompickle(fn)
    return tbl_tr
    

tbl_tr = (
    etl
    .cat(*[tabulate_tr(chrom) for chrom in sorted(fasta.keys())])
    .sort(key=('chrom', 'start'))
)
tbl_tr

Compute fraction of core genome within a TR


In [4]:
def select_tr_threshold(row):
    unit_length = row.unit_length
    tract_length = row.tract_length
    return (
        ((unit_length == 1) and (tract_length >= 6)) or
        ((unit_length == 2) and (tract_length >= 9)) or
        ((unit_length == 3) and (tract_length >= 11)) or
        ((unit_length == 4) and (tract_length >= 13)) or
        ((unit_length == 5) and (tract_length >= 14)) or
        ((unit_length == 6) and (tract_length >= 16)) or
        ((unit_length >= 7) and (tract_length >= 18))
    )


tbl_tr_proper = tbl_tr.select(select_tr_threshold).cache()
tbl_tr_proper.display(20)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|region_type
Pf3D7_01_v3 13 37 cctaaac 3 7 3 24 SubtelomericRepeat
Pf3D7_01_v3 24 65 aaccctaaaccctg 2 14 13 41 SubtelomericRepeat
Pf3D7_01_v3 38 107 aaccctaaaccctgaacccta 3 21 6 69 SubtelomericRepeat
Pf3D7_01_v3 52 72 aacccta 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 73 93 aacccta 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 80 110 aaccctaaaccctg 2 14 2 30 SubtelomericRepeat
Pf3D7_01_v3 111 142 cctaaaccctgaac 2 14 3 31 SubtelomericRepeat
Pf3D7_01_v3 129 149 aaccctg 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 149 167 aacccta 2 7 4 18 SubtelomericRepeat
Pf3D7_01_v3 169 197 aaaccctgaaccct 2 14 0 28 SubtelomericRepeat
Pf3D7_01_v3 170 218 aaccctgaaccctaaaccctg 2 21 6 48 SubtelomericRepeat
Pf3D7_01_v3 184 204 aaccctg 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 191 225 aaccctgaacccta 2 14 6 34 SubtelomericRepeat
Pf3D7_01_v3 198 246 aaccctaaaccctgaacccta 2 21 6 48 SubtelomericRepeat
Pf3D7_01_v3 212 232 aacccta 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 219 260 aaccctaaaccctg 2 14 13 41 SubtelomericRepeat
Pf3D7_01_v3 247 267 aacccta 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 261 281 aaccctg 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 285 316 cctaaac 4 7 3 31 SubtelomericRepeat
Pf3D7_01_v3 322 348 aaaccta 3 7 5 26 SubtelomericRepeat

...


In [5]:
tbl_tr_proper.valuecounts('chrom').displayall()


0|chrom 1|count 2|frequency
Pf3D7_14_v3 52976 0.14296200345423143
Pf3D7_13_v3 46201 0.12467886442141624
Pf3D7_12_v3 35998 0.09714486183074265
Pf3D7_11_v3 32946 0.08890867875647668
Pf3D7_10_v3 26423 0.07130559153713299
Pf3D7_09_v3 25141 0.06784596286701208
Pf3D7_08_v3 23661 0.06385200777202073
Pf3D7_07_v3 22298 0.06017379101899827
Pf3D7_06_v3 22161 0.05980408031088083
Pf3D7_05_v3 21589 0.058260470639032814
Pf3D7_04_v3 18843 0.05085006476683938
Pf3D7_03_v3 17356 0.04683721934369603
Pf3D7_02_v3 14970 0.040398316062176164
Pf3D7_01_v3 9997 0.026978087219343697

In [6]:
is_tr = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_tr[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_tr_proper.eq('chrom', chrom).records():
        start = rec.start - 1
        stop = rec.stop
        is_tr[chrom][start:stop] = True
is_tr


2016-03-14 22:24:12.622972 :: Pf3D7_01_v3
2016-03-14 22:24:12.910301 :: Pf3D7_02_v3
2016-03-14 22:24:13.251836 :: Pf3D7_03_v3
2016-03-14 22:24:13.732777 :: Pf3D7_04_v3
2016-03-14 22:24:14.154009 :: Pf3D7_05_v3
2016-03-14 22:24:14.620608 :: Pf3D7_06_v3
2016-03-14 22:24:15.166738 :: Pf3D7_07_v3
2016-03-14 22:24:15.584229 :: Pf3D7_08_v3
2016-03-14 22:24:16.012541 :: Pf3D7_09_v3
2016-03-14 22:24:16.463575 :: Pf3D7_10_v3
2016-03-14 22:24:16.929692 :: Pf3D7_11_v3
2016-03-14 22:24:17.466149 :: Pf3D7_12_v3
2016-03-14 22:24:18.035457 :: Pf3D7_13_v3
2016-03-14 22:24:18.708444 :: Pf3D7_14_v3
Out[6]:
{'Pf3D7_01_v3': array([False, False, False, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_02_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_03_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_04_v3': array([False, False, False, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_05_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_06_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_07_v3': array([False, False, False, ...,  True,  True, False], dtype=bool),
 'Pf3D7_08_v3': array([False, False,  True, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_09_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_10_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_11_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_12_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_13_v3': array([ True,  True,  True, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_14_v3': array([False, False,  True, ..., False, False, False], dtype=bool)}

In [7]:
tbl_regions_1b


Out[7]:
0|region_chrom 1|region_start 2|region_stop 3|region_type 4|region_size
Pf3D7_01_v3 1 27336 SubtelomericRepeat 27336
Pf3D7_01_v3 27337 92900 SubtelomericHypervariable 65564
Pf3D7_01_v3 92901 457931 Core 365031
Pf3D7_01_v3 457932 460311 Centromere 2380
Pf3D7_01_v3 460312 575900 Core 115589

...


In [8]:
is_core = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_core[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').records():
        start = rec.region_start - 1
        stop = rec.region_stop
        is_core[chrom][start:stop] = True


2016-03-14 22:24:19.471532 :: Pf3D7_01_v3
2016-03-14 22:24:19.474153 :: Pf3D7_02_v3
2016-03-14 22:24:19.477062 :: Pf3D7_03_v3
2016-03-14 22:24:19.479959 :: Pf3D7_04_v3
2016-03-14 22:24:19.482921 :: Pf3D7_05_v3
2016-03-14 22:24:19.485278 :: Pf3D7_06_v3
2016-03-14 22:24:19.488650 :: Pf3D7_07_v3
2016-03-14 22:24:19.490666 :: Pf3D7_08_v3
2016-03-14 22:24:19.493293 :: Pf3D7_09_v3
2016-03-14 22:24:19.495400 :: Pf3D7_10_v3
2016-03-14 22:24:19.497461 :: Pf3D7_11_v3
2016-03-14 22:24:19.499799 :: Pf3D7_12_v3
2016-03-14 22:24:19.502057 :: Pf3D7_13_v3
2016-03-14 22:24:19.504180 :: Pf3D7_14_v3

In [9]:
tbl_exons


Out[9]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_type 11|feature_region_size
Pf3D7_01_v3 CDS 29510 34762 5252 + PF3D7_0100100.1:exon:1 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 35888 37126 1238 + PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 38982 39923 941 - PF3D7_0100200.1:exon:1 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 40154 40207 53 - PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None SubtelomericHypervariable 65564
Pf3D7_01_v3 CDS 42367 43617 1250 - PF3D7_0100300.1:exon:1 PF3D7_0100300.1 None None SubtelomericHypervariable 65564

...


In [10]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_exons.eq('feature_chrom', chrom).records():
        start = rec.feature_start - 1
        stop = rec.feature_stop
        is_exon[chrom][start:stop] = True


2016-03-14 22:24:19.522300 :: Pf3D7_01_v3
2016-03-14 22:24:19.620318 :: Pf3D7_02_v3
2016-03-14 22:24:19.684707 :: Pf3D7_03_v3
2016-03-14 22:24:19.748798 :: Pf3D7_04_v3
2016-03-14 22:24:19.812738 :: Pf3D7_05_v3
2016-03-14 22:24:19.879187 :: Pf3D7_06_v3
2016-03-14 22:24:19.997908 :: Pf3D7_07_v3
2016-03-14 22:24:20.075195 :: Pf3D7_08_v3
2016-03-14 22:24:20.143756 :: Pf3D7_09_v3
2016-03-14 22:24:20.232170 :: Pf3D7_10_v3
2016-03-14 22:24:20.360729 :: Pf3D7_11_v3
2016-03-14 22:24:20.495027 :: Pf3D7_12_v3
2016-03-14 22:24:20.607378 :: Pf3D7_13_v3
2016-03-14 22:24:20.689853 :: Pf3D7_14_v3

In [11]:
tbl_introns


Out[11]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_strand 6|feature_id 7|feature_parent_id 8|feature_name 9|feature_previous_id 10|feature_region_size 11|feature_region_type 12|region_size
Pf3D7_01_v3 intron 34762 35888 1126 + PF3D7_0100100.1:exon:1_PF3D7_0100100.1:exon:2 PF3D7_0100100.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 39923 40154 231 - PF3D7_0100200.1:exon:1_PF3D7_0100200.1:exon:2 PF3D7_0100200.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 43617 43775 158 - PF3D7_0100300.1:exon:1_PF3D7_0100300.1:exon:2 PF3D7_0100300.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 50416 50572 156 + PF3D7_0100400.1:exon:1_PF3D7_0100400.1:exon:2 PF3D7_0100400.1 None None 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intron 54788 54938 150 - PF3D7_0100600.1:exon:1_PF3D7_0100600.1:exon:2 PF3D7_0100600.1 None None 65564 SubtelomericHypervariable 65564

...


In [12]:
is_intron = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_intron[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_introns.eq('feature_chrom', chrom).records():
        start = rec.feature_start - 1
        stop = rec.feature_stop
        is_intron[chrom][start:stop] = True


2016-03-14 22:24:21.120420 :: Pf3D7_01_v3
2016-03-14 22:24:21.162565 :: Pf3D7_02_v3
2016-03-14 22:24:21.197099 :: Pf3D7_03_v3
2016-03-14 22:24:21.234275 :: Pf3D7_04_v3
2016-03-14 22:24:21.279180 :: Pf3D7_05_v3
2016-03-14 22:24:21.316237 :: Pf3D7_06_v3
2016-03-14 22:24:21.356404 :: Pf3D7_07_v3
2016-03-14 22:24:21.394350 :: Pf3D7_08_v3
2016-03-14 22:24:21.433994 :: Pf3D7_09_v3
2016-03-14 22:24:21.473997 :: Pf3D7_10_v3
2016-03-14 22:24:21.520066 :: Pf3D7_11_v3
2016-03-14 22:24:21.562448 :: Pf3D7_12_v3
2016-03-14 22:24:21.619398 :: Pf3D7_13_v3
2016-03-14 22:24:21.664117 :: Pf3D7_14_v3

In [13]:
tbl_intergenic


Out[13]:
0|feature_chrom 1|feature_type 2|feature_start 3|feature_stop 4|feature_length 5|feature_id 6|feature_region_size 7|feature_region_type 8|region_size
Pf3D7_01_v3 intergenic_0 37126 38982 1856 PF3D7_0100100_PF3D7_0100200 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 40207 42367 2160 PF3D7_0100200_PF3D7_0100300 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_2 46507 50363 3856 PF3D7_0100300_PF3D7_0100400 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_0 51636 53169 1533 PF3D7_0100400_PF3D7_0100500 65564 SubtelomericHypervariable 65564
Pf3D7_01_v3 intergenic_1 53280 53778 498 PF3D7_0100500_PF3D7_0100600 65564 SubtelomericHypervariable 65564

...


In [14]:
is_intergenic = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_intergenic[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_intergenic.eq('feature_chrom', chrom).records():
        start = rec.feature_start - 1
        stop = rec.feature_stop
        is_intergenic[chrom][start:stop] = True


2016-03-14 22:24:21.758949 :: Pf3D7_01_v3
2016-03-14 22:24:21.789995 :: Pf3D7_02_v3
2016-03-14 22:24:21.814395 :: Pf3D7_03_v3
2016-03-14 22:24:21.839100 :: Pf3D7_04_v3
2016-03-14 22:24:21.864438 :: Pf3D7_05_v3
2016-03-14 22:24:21.889469 :: Pf3D7_06_v3
2016-03-14 22:24:21.916646 :: Pf3D7_07_v3
2016-03-14 22:24:21.943388 :: Pf3D7_08_v3
2016-03-14 22:24:21.970589 :: Pf3D7_09_v3
2016-03-14 22:24:21.998652 :: Pf3D7_10_v3
2016-03-14 22:24:22.029882 :: Pf3D7_11_v3
2016-03-14 22:24:22.057625 :: Pf3D7_12_v3
2016-03-14 22:24:22.089588 :: Pf3D7_13_v3
2016-03-14 22:24:22.122849 :: Pf3D7_14_v3

In [15]:
def tr_composition():
    tbl = [['chrom', 'n_core', 'n_core_tr', 'n_core_coding', 'n_core_coding_tr', 'n_core_noncoding', 'n_core_noncoding_tr']]
    all_n_core = all_n_core_tr = all_n_core_coding = all_n_core_coding_tr = all_n_core_noncoding = all_n_core_noncoding_tr = 0
    for chrom in sorted(fasta.keys()):
        
        n_core = nnz(is_core[chrom])
        all_n_core += n_core
        is_core_tr = is_core[chrom] & is_tr[chrom]
        n_core_tr = nnz(is_core_tr)
        all_n_core_tr += n_core_tr
        
        is_core_coding = is_core[chrom] & is_exon[chrom]
        n_core_coding = nnz(is_core_coding)
        all_n_core_coding += n_core_coding
        is_core_coding_tr = is_core_coding & is_tr[chrom]
        n_core_coding_tr = nnz(is_core_coding_tr)
        all_n_core_coding_tr += n_core_coding_tr

        is_core_noncoding = is_core[chrom] & (is_intron[chrom] | is_intergenic[chrom])
        n_core_noncoding = nnz(is_core_noncoding)
        all_n_core_noncoding += n_core_noncoding
        is_core_noncoding_tr = is_core_noncoding & is_tr[chrom]
        n_core_noncoding_tr = nnz(is_core_noncoding_tr)
        all_n_core_noncoding_tr += n_core_noncoding_tr
        
        row = [chrom, n_core, n_core_tr, n_core_coding, n_core_coding_tr, n_core_noncoding, n_core_noncoding_tr]
        tbl += [row]
        
    # final row
    row = ['all', all_n_core, all_n_core_tr, all_n_core_coding, all_n_core_coding_tr, all_n_core_noncoding, all_n_core_noncoding_tr]
    tbl += [row]
    
    return (etl
        .wrap(tbl)
        .addfield('pc_core_tr', lambda row: '%.1f%%' % (row.n_core_tr * 100 / row.n_core))
        .addfield('pc_core_coding_tr', lambda row: '%.1f%%' % (row.n_core_coding_tr * 100 / row.n_core_coding))
        .addfield('pc_core_noncoding_tr', lambda row: '%.1f%%' % (row.n_core_noncoding_tr * 100 / row.n_core_noncoding))
    )
        
tr_composition().displayall()


0|chrom 1|n_core 2|n_core_tr 3|n_core_coding 4|n_core_coding_tr 5|n_core_noncoding 6|n_core_noncoding_tr 7|pc_core_tr 8|pc_core_coding_tr 9|pc_core_noncoding_tr
Pf3D7_01_v3 480620 109274 241195 28615 224105 77976 22.7% 11.9% 34.8%
Pf3D7_02_v3 753550 163990 407460 47753 326664 111767 21.8% 11.7% 34.2%
Pf3D7_03_v3 929971 201179 530310 64056 394018 135852 21.6% 12.1% 34.5%
Pf3D7_04_v3 932894 206598 511913 65693 397236 138495 22.1% 12.8% 34.9%
Pf3D7_05_v3 1281978 272890 719041 83527 560358 187975 21.3% 11.6% 33.5%
Pf3D7_06_v3 1200478 254757 693110 81555 504785 171445 21.2% 11.8% 34.0%
Pf3D7_07_v3 1204739 248704 723557 86073 477273 162022 20.6% 11.9% 33.9%
Pf3D7_08_v3 1249936 271319 691215 81595 560202 189499 21.7% 11.8% 33.8%
Pf3D7_09_v3 1392114 303228 740164 79260 645169 221593 21.8% 10.7% 34.3%
Pf3D7_10_v3 1502845 316407 815945 87421 676628 224962 21.1% 10.7% 33.2%
Pf3D7_11_v3 1891043 398788 1035402 113162 847908 282760 21.1% 10.9% 33.3%
Pf3D7_12_v3 2030380 426433 1124737 122546 879971 298763 21.0% 10.9% 34.0%
Pf3D7_13_v3 2715189 551789 1534463 162999 1168244 386124 20.3% 10.6% 33.1%
Pf3D7_14_v3 3216370 653678 1782554 188237 1407166 459367 20.3% 10.6% 32.6%
all 20782107 4379034 11551066 1292492 9069727 3048600 21.1% 11.2% 33.6%

Compute fraction polymorphic


In [16]:
tbl_tr_core = tbl_tr.eq('region_type', 'Core').cutout('region_type').cache()
tbl_tr_core


Out[16]:
0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length
Pf3D7_01_v3 92899 92907 taa 2 3 2 8
Pf3D7_01_v3 92900 92902 a 2 1 0 2
Pf3D7_01_v3 92903 92905 a 2 1 0 2
Pf3D7_01_v3 92912 92918 ata 2 3 0 6
Pf3D7_01_v3 92914 92916 a 2 1 0 2

...


In [17]:
def tabulate_indels(cross):
    
    # VCF file name
    vcf_fn = COMBINED_VCF_FN_TEMPLATE.format(cross=cross)
    
    # build the table
    tbl = (etl
        .fromvcf(vcf_fn, samples=None)
        # only keep PASS variants
        .false('FILTER')
        # exclude fields not needed in this analysis
        .cutout('ID', 'QUAL', 'FILTER')
        # unpack INFO
        .unpackdict('INFO', ['set', 'STR', 'RU', 'RPA'])
        # only keep INDELs
        .select(lambda row: len(row.REF) > 1 or any(len(a) > 1 for a in row.ALT))
        .addfield('cross', cross)        
    )
    
    return tbl

In [18]:
tbl_indels = (
    etl
    .cat(*[tabulate_indels(cross) for cross in CROSSES])
    .sort(key=('CHROM', 'POS'))
)
tbl_indels


Out[18]:
0|CHROM 1|POS 2|REF 3|ALT 4|set 5|STR 6|RU 7|RPA 8|cross
Pf3D7_01_v3 93901 AATATATATAT [A] GATK True AT [19, 14] 3d7_hb3
Pf3D7_01_v3 93901 AATATATATATATAT [A] GATK True AT [19, 12] 7g8_gb4
Pf3D7_01_v3 94588 C [CATATAT] GATK True AT [13, 16] 7g8_gb4
Pf3D7_01_v3 94590 T [TATAC] Intersection None None None 3d7_hb3
Pf3D7_01_v3 94590 T [TATAT, TATAC] Intersection None None None hb3_dd2

...


In [19]:
tbl_tr_indel = (
    tbl_tr_core
    .leftjoin(tbl_indels.addfield('pos_indel', lambda row: row.POS+1), 
              lkey=('chrom', 'start'),
              rkey=('CHROM', 'pos_indel'))
#     .cutout(13)
)
tbl_tr_indel.display(10)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|POS 9|REF 10|ALT 11|set 12|STR 13|RU 14|RPA 15|cross
Pf3D7_01_v3 92899 92907 taa 2 3 2 8 None None None None None None None None
Pf3D7_01_v3 92900 92902 a 2 1 0 2 None None None None None None None None
Pf3D7_01_v3 92903 92905 a 2 1 0 2 None None None None None None None None
Pf3D7_01_v3 92912 92918 ata 2 3 0 6 None None None None None None None None
Pf3D7_01_v3 92914 92916 a 2 1 0 2 None None None None None None None None
Pf3D7_01_v3 92915 92919 at 2 2 0 4 None None None None None None None None
Pf3D7_01_v3 92918 92920 t 2 1 0 2 None None None None None None None None
Pf3D7_01_v3 92920 92924 a 4 1 0 4 None None None None None None None None
Pf3D7_01_v3 92928 92932 t 4 1 0 4 None None None None None None None None
Pf3D7_01_v3 92932 92934 c 2 1 0 2 None None None None None None None None

...


In [20]:
tbl_tr_indel.gt('tract_length', 10).display(50)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|POS 9|REF 10|ALT 11|set 12|STR 13|RU 14|RPA 15|cross
Pf3D7_01_v3 93247 93259 atttc 2 5 2 12 None None None None None None None None
Pf3D7_01_v3 93902 93941 at 19 2 1 39 93901 AATATATATAT [A] GATK True AT [19, 14] 3d7_hb3
Pf3D7_01_v3 93902 93941 at 19 2 1 39 93901 AATATATATATATAT [A] GATK True AT [19, 12] 7g8_gb4
Pf3D7_01_v3 94126 94146 tatttta 2 7 6 20 None None None None None None None None
Pf3D7_01_v3 94258 94283 at 12 2 1 25 None None None None None None None None
Pf3D7_01_v3 94298 94312 ttataat 2 7 0 14 None None None None None None None None
Pf3D7_01_v3 94315 94329 tatttat 2 7 0 14 None None None None None None None None
Pf3D7_01_v3 94364 94376 atattt 2 6 0 12 None None None None None None None None
Pf3D7_01_v3 94433 94447 taa 4 3 2 14 None None None None None None None None
Pf3D7_01_v3 94497 94509 aatta 2 5 2 12 None None None None None None None None
Pf3D7_01_v3 94589 94615 at 13 2 0 26 94588 C [CATATAT] GATK True AT [13, 16] 7g8_gb4
Pf3D7_01_v3 94748 94762 tataat 2 6 2 14 None None None None None None None None
Pf3D7_01_v3 94994 95022 at 14 2 0 28 94993 CATATAT [C] Intersection True AT [14, 11] 3d7_hb3
Pf3D7_01_v3 94994 95022 at 14 2 0 28 94993 CATATATAT [C, CAT] Intersection True AT None hb3_dd2
Pf3D7_01_v3 94994 95022 at 14 2 0 28 94993 CATATATATAT [C] GATK True AT [14, 9] 7g8_gb4
Pf3D7_01_v3 95579 95591 ggcaaa 2 6 0 12 None None None None None None None None
Pf3D7_01_v3 95950 95961 tatat 2 5 1 11 None None None None None None None None
Pf3D7_01_v3 96047 96068 a 21 1 0 21 None None None None None None None None
Pf3D7_01_v3 96091 96105 tataatt 2 7 0 14 None None None None None None None None
Pf3D7_01_v3 96265 96280 at 7 2 1 15 None None None None None None None None
Pf3D7_01_v3 96317 96340 t 23 1 0 23 None None None None None None None None
Pf3D7_01_v3 96390 96401 atcat 2 5 1 11 None None None None None None None None
Pf3D7_01_v3 96419 96437 a 18 1 0 18 None None None None None None None None
Pf3D7_01_v3 96438 96490 ta 26 2 0 52 None None None None None None None None
Pf3D7_01_v3 96617 96628 tattt 2 5 1 11 None None None None None None None None
Pf3D7_01_v3 96677 96689 at 6 2 0 12 96676 A [AAT] Intersection True AT [6, 7] hb3_dd2
Pf3D7_01_v3 96744 96757 tatca 2 5 3 13 None None None None None None None None
Pf3D7_01_v3 96813 96831 aaaaaat 2 7 4 18 None None None None None None None None
Pf3D7_01_v3 96822 96837 aaaat 3 5 0 15 None None None None None None None None
Pf3D7_01_v3 96837 96865 tataaa 4 6 4 28 96836 TTATAAA [T] Cortex True TATAAA [4, 3] 7g8_gb4
Pf3D7_01_v3 96905 96916 aag 3 3 2 11 None None None None None None None None
Pf3D7_01_v3 96988 97002 t 14 1 0 14 None None None None None None None None
Pf3D7_01_v3 96998 97012 ttttc 2 5 4 14 None None None None None None None None
Pf3D7_01_v3 97040 97060 a 20 1 0 20 None None None None None None None None
Pf3D7_01_v3 97058 97072 aat 4 3 2 14 None None None None None None None None
Pf3D7_01_v3 97176 97187 aaat 2 4 3 11 None None None None None None None None
Pf3D7_01_v3 97212 97223 ta 5 2 1 11 None None None None None None None None
Pf3D7_01_v3 97215 97235 atatatatta 2 10 0 20 None None None None None None None None
Pf3D7_01_v3 97295 97311 attatat 2 7 2 16 None None None None None None None None
Pf3D7_01_v3 97454 97467 ttta 3 4 1 13 None None None None None None None None
Pf3D7_01_v3 97475 97487 ta 6 2 0 12 None None None None None None None None
Pf3D7_01_v3 97564 97576 tataaa 2 6 0 12 None None None None None None None None
Pf3D7_01_v3 97726 97738 aaaag 2 5 2 12 None None None None None None None None
Pf3D7_01_v3 97871 97890 ataaaaaa 2 8 3 19 None None None None None None None None
Pf3D7_01_v3 98019 98034 tatcttt 2 7 1 15 None None None None None None None None
Pf3D7_01_v3 98042 98058 tatttttt 2 8 0 16 None None None None None None None None
Pf3D7_01_v3 98045 98062 tttttta 2 7 3 17 None None None None None None None None
Pf3D7_01_v3 98087 98100 a 13 1 0 13 None None None None None None None None
Pf3D7_01_v3 98098 98112 aat 4 3 2 14 None None None None None None None None
Pf3D7_01_v3 98107 98124 aataa 3 5 2 17 None None None None None None None None

...


In [21]:
tbl_analysis = (
    tbl_tr_indel
    .aggregate(('chrom', 'start', 'stop', 'repeat_unit', 'n_units', 'unit_length', 'last_unit_length', 'tract_length'), 
               set, 'cross', presorted=True)
    .addfield('polymorphic', lambda row: row.value != set([None]))
    .addfield('polymorphic_3d7_hb3', lambda row: '3d7_hb3' in row.value)
    .addfield('polymorphic_hb3_dd2', lambda row: 'hb3_dd2' in row.value)
    .addfield('polymorphic_7g8_gb4', lambda row: '7g8_gb4' in row.value)
    .cutout('value')
)
tbl_analysis.gt('tract_length', 10).display(20)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|polymorphic 9|polymorphic_3d7_hb3 10|polymorphic_hb3_dd2 11|polymorphic_7g8_gb4
Pf3D7_01_v3 93247 93259 atttc 2 5 2 12 False False False False
Pf3D7_01_v3 93902 93941 at 19 2 1 39 True True False True
Pf3D7_01_v3 94126 94146 tatttta 2 7 6 20 False False False False
Pf3D7_01_v3 94258 94283 at 12 2 1 25 False False False False
Pf3D7_01_v3 94298 94312 ttataat 2 7 0 14 False False False False
Pf3D7_01_v3 94315 94329 tatttat 2 7 0 14 False False False False
Pf3D7_01_v3 94364 94376 atattt 2 6 0 12 False False False False
Pf3D7_01_v3 94433 94447 taa 4 3 2 14 False False False False
Pf3D7_01_v3 94497 94509 aatta 2 5 2 12 False False False False
Pf3D7_01_v3 94589 94615 at 13 2 0 26 True False False True
Pf3D7_01_v3 94748 94762 tataat 2 6 2 14 False False False False
Pf3D7_01_v3 94994 95022 at 14 2 0 28 True True True True
Pf3D7_01_v3 95579 95591 ggcaaa 2 6 0 12 False False False False
Pf3D7_01_v3 95950 95961 tatat 2 5 1 11 False False False False
Pf3D7_01_v3 96047 96068 a 21 1 0 21 False False False False
Pf3D7_01_v3 96091 96105 tataatt 2 7 0 14 False False False False
Pf3D7_01_v3 96265 96280 at 7 2 1 15 False False False False
Pf3D7_01_v3 96317 96340 t 23 1 0 23 False False False False
Pf3D7_01_v3 96390 96401 atcat 2 5 1 11 False False False False
Pf3D7_01_v3 96419 96437 a 18 1 0 18 False False False False

...


In [22]:
df = tbl_analysis.todataframe()
df.head()


Out[22]:
chrom start stop repeat_unit n_units unit_length last_unit_length tract_length polymorphic polymorphic_3d7_hb3 polymorphic_hb3_dd2 polymorphic_7g8_gb4
0 Pf3D7_01_v3 92899 92907 taa 2 3 2 8 False False False False
1 Pf3D7_01_v3 92900 92902 a 2 1 0 2 False False False False
2 Pf3D7_01_v3 92903 92905 a 2 1 0 2 False False False False
3 Pf3D7_01_v3 92912 92918 ata 2 3 0 6 False False False False
4 Pf3D7_01_v3 92914 92916 a 2 1 0 2 False False False False

In [23]:
df.tail()


Out[23]:
chrom start stop repeat_unit n_units unit_length last_unit_length tract_length polymorphic polymorphic_3d7_hb3 polymorphic_hb3_dd2 polymorphic_7g8_gb4
5570119 Pf3D7_14_v3 3255686 3255689 a 3 1 0 3 False False False False
5570120 Pf3D7_14_v3 3255691 3255695 ta 2 2 0 4 False False False False
5570121 Pf3D7_14_v3 3255694 3255697 a 3 1 0 3 False False False False
5570122 Pf3D7_14_v3 3255702 3255704 g 2 1 0 2 False False False False
5570123 Pf3D7_14_v3 3255709 3255711 t 2 1 0 2 False False False False

In [32]:
palette = sns.hls_palette(13)
sns.palplot(palette);



In [44]:
def fig_str_indel():
    fig = plt.figure()
    ax = fig.add_subplot(111)
    sns.despine(ax=ax, offset=5)

    unit_lengths = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
    palette = sns.hls_palette(len(unit_lengths))
    for unit_length, color in zip(unit_lengths, palette):
        s1 = df[df.unit_length == unit_length].tract_length
        s2 = df[(df.unit_length == unit_length) & df.polymorphic].tract_length
        y1 = np.bincount(s1, minlength=70)[:70]
        y2 = np.bincount(s2, minlength=70)[:70]
        x = np.arange(0, 70, 1)
        y3 = y2 / y1
        if unit_length == 1:
            x = x[:12]
            y3 = y3[:12]
        ax.plot(x, y3, linestyle='-', marker=' ', lw=2, color=color, label=unit_length)

    ax.set_xlim(0, 30)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Repeat unit length')
    ax.set_yscale('log')
    ax.set_ylim(0.0005, 1)
    ax.set_xlabel('Tract length')
    ax.set_ylabel('Fraction polymorphic')
    
    fig.tight_layout()
    fn = '../../artwork/supp/indel_str.{dpi}.{fmt}'
    for fmt in 'jpeg', 'png':
        for dpi in 120, 300:
            fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100, bbox_inches='tight')

fig_str_indel()


Sandbox


In [26]:
tbl_tr_indel_2 = (
    tbl_tr_core
    .intervalleftjoin(tbl_indels.true('STR').addfield('pos_indel_stop', lambda row: row.POS+len(row.REF)), 
                      lkey='chrom', lstart='start', lstop='stop',
                      rkey='CHROM', rstart='POS', rstop='pos_indel_stop',
                      include_stop=True)
)
tbl_tr_indel_2.display(10)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|CHROM 9|POS 10|REF 11|ALT 12|set 13|STR 14|RU 15|RPA 16|cross 17|pos_indel_stop
Pf3D7_01_v3 92899 92907 taa 2 3 2 8 None None None None None None None None None None
Pf3D7_01_v3 92900 92902 a 2 1 0 2 None None None None None None None None None None
Pf3D7_01_v3 92903 92905 a 2 1 0 2 None None None None None None None None None None
Pf3D7_01_v3 92912 92918 ata 2 3 0 6 None None None None None None None None None None
Pf3D7_01_v3 92914 92916 a 2 1 0 2 None None None None None None None None None None
Pf3D7_01_v3 92915 92919 at 2 2 0 4 None None None None None None None None None None
Pf3D7_01_v3 92918 92920 t 2 1 0 2 None None None None None None None None None None
Pf3D7_01_v3 92920 92924 a 4 1 0 4 None None None None None None None None None None
Pf3D7_01_v3 92928 92932 t 4 1 0 4 None None None None None None None None None None
Pf3D7_01_v3 92932 92934 c 2 1 0 2 None None None None None None None None None None

...


In [27]:
tbl_tr_indel_2.gt('tract_length', 10).display(50)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|CHROM 9|POS 10|REF 11|ALT 12|set 13|STR 14|RU 15|RPA 16|cross 17|pos_indel_stop
Pf3D7_01_v3 93247 93259 atttc 2 5 2 12 None None None None None None None None None None
Pf3D7_01_v3 93902 93941 at 19 2 1 39 Pf3D7_01_v3 93901 AATATATATAT [A] GATK True AT [19, 14] 3d7_hb3 93912
Pf3D7_01_v3 93902 93941 at 19 2 1 39 Pf3D7_01_v3 93901 AATATATATATATAT [A] GATK True AT [19, 12] 7g8_gb4 93916
Pf3D7_01_v3 94126 94146 tatttta 2 7 6 20 None None None None None None None None None None
Pf3D7_01_v3 94258 94283 at 12 2 1 25 None None None None None None None None None None
Pf3D7_01_v3 94298 94312 ttataat 2 7 0 14 None None None None None None None None None None
Pf3D7_01_v3 94315 94329 tatttat 2 7 0 14 None None None None None None None None None None
Pf3D7_01_v3 94364 94376 atattt 2 6 0 12 None None None None None None None None None None
Pf3D7_01_v3 94433 94447 taa 4 3 2 14 None None None None None None None None None None
Pf3D7_01_v3 94497 94509 aatta 2 5 2 12 None None None None None None None None None None
Pf3D7_01_v3 94589 94615 at 13 2 0 26 Pf3D7_01_v3 94588 C [CATATAT] GATK True AT [13, 16] 7g8_gb4 94589
Pf3D7_01_v3 94748 94762 tataat 2 6 2 14 None None None None None None None None None None
Pf3D7_01_v3 94994 95022 at 14 2 0 28 Pf3D7_01_v3 94993 CATATAT [C] Intersection True AT [14, 11] 3d7_hb3 95000
Pf3D7_01_v3 94994 95022 at 14 2 0 28 Pf3D7_01_v3 94993 CATATATAT [C, CAT] Intersection True AT None hb3_dd2 95002
Pf3D7_01_v3 94994 95022 at 14 2 0 28 Pf3D7_01_v3 94993 CATATATATAT [C] GATK True AT [14, 9] 7g8_gb4 95004
Pf3D7_01_v3 95579 95591 ggcaaa 2 6 0 12 None None None None None None None None None None
Pf3D7_01_v3 95950 95961 tatat 2 5 1 11 None None None None None None None None None None
Pf3D7_01_v3 96047 96068 a 21 1 0 21 None None None None None None None None None None
Pf3D7_01_v3 96091 96105 tataatt 2 7 0 14 None None None None None None None None None None
Pf3D7_01_v3 96265 96280 at 7 2 1 15 None None None None None None None None None None
Pf3D7_01_v3 96317 96340 t 23 1 0 23 None None None None None None None None None None
Pf3D7_01_v3 96390 96401 atcat 2 5 1 11 None None None None None None None None None None
Pf3D7_01_v3 96419 96437 a 18 1 0 18 None None None None None None None None None None
Pf3D7_01_v3 96438 96490 ta 26 2 0 52 None None None None None None None None None None
Pf3D7_01_v3 96617 96628 tattt 2 5 1 11 None None None None None None None None None None
Pf3D7_01_v3 96677 96689 at 6 2 0 12 Pf3D7_01_v3 96676 A [AAT] Intersection True AT [6, 7] hb3_dd2 96677
Pf3D7_01_v3 96744 96757 tatca 2 5 3 13 None None None None None None None None None None
Pf3D7_01_v3 96813 96831 aaaaaat 2 7 4 18 None None None None None None None None None None
Pf3D7_01_v3 96822 96837 aaaat 3 5 0 15 Pf3D7_01_v3 96836 TTATAAA [T] Cortex True TATAAA [4, 3] 7g8_gb4 96843
Pf3D7_01_v3 96837 96865 tataaa 4 6 4 28 Pf3D7_01_v3 96836 TTATAAA [T] Cortex True TATAAA [4, 3] 7g8_gb4 96843
Pf3D7_01_v3 96905 96916 aag 3 3 2 11 None None None None None None None None None None
Pf3D7_01_v3 96988 97002 t 14 1 0 14 None None None None None None None None None None
Pf3D7_01_v3 96998 97012 ttttc 2 5 4 14 None None None None None None None None None None
Pf3D7_01_v3 97040 97060 a 20 1 0 20 None None None None None None None None None None
Pf3D7_01_v3 97058 97072 aat 4 3 2 14 None None None None None None None None None None
Pf3D7_01_v3 97176 97187 aaat 2 4 3 11 None None None None None None None None None None
Pf3D7_01_v3 97212 97223 ta 5 2 1 11 None None None None None None None None None None
Pf3D7_01_v3 97215 97235 atatatatta 2 10 0 20 None None None None None None None None None None
Pf3D7_01_v3 97295 97311 attatat 2 7 2 16 Pf3D7_01_v3 97296 TTA [T] Intersection True TA [3, 2] 3d7_hb3 97299
Pf3D7_01_v3 97454 97467 ttta 3 4 1 13 None None None None None None None None None None
Pf3D7_01_v3 97475 97487 ta 6 2 0 12 None None None None None None None None None None
Pf3D7_01_v3 97564 97576 tataaa 2 6 0 12 None None None None None None None None None None
Pf3D7_01_v3 97726 97738 aaaag 2 5 2 12 None None None None None None None None None None
Pf3D7_01_v3 97871 97890 ataaaaaa 2 8 3 19 None None None None None None None None None None
Pf3D7_01_v3 98019 98034 tatcttt 2 7 1 15 None None None None None None None None None None
Pf3D7_01_v3 98042 98058 tatttttt 2 8 0 16 None None None None None None None None None None
Pf3D7_01_v3 98045 98062 tttttta 2 7 3 17 None None None None None None None None None None
Pf3D7_01_v3 98087 98100 a 13 1 0 13 None None None None None None None None None None
Pf3D7_01_v3 98098 98112 aat 4 3 2 14 None None None None None None None None None None
Pf3D7_01_v3 98107 98124 aataa 3 5 2 17 None None None None None None None None None None

...


In [28]:
fasta['Pf3D7_01_v3'][94980:94993+30]


Out[28]:
'atatataattcacatatatatatatatatatatatatatattt'

In [29]:
tbl_tr_core.eq('chrom', 'Pf3D7_01_v3').gt('start', 94990)


Out[29]:
0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length
Pf3D7_01_v3 94991 94995 ca 2 2 0 4
Pf3D7_01_v3 94994 95022 at 14 2 0 28
Pf3D7_01_v3 95021 95024 t 3 1 0 3
Pf3D7_01_v3 95027 95032 ta 2 2 1 5
Pf3D7_01_v3 95031 95033 t 2 1 0 2

...


In [30]:
tbl_analysis_2 = (
    tbl_tr_indel_2
    .aggregate(('chrom', 'start', 'stop', 'repeat_unit', 'n_units', 'unit_length', 'last_unit_length', 'tract_length'), 
               set, 'cross', presorted=True)
    .addfield('polymorphic', lambda row: row.value != set([None]))
    .addfield('polymorphic_3d7_hb3', lambda row: '3d7_hb3' in row.value)
    .addfield('polymorphic_hb3_dd2', lambda row: 'hb3_dd2' in row.value)
    .addfield('polymorphic_7g8_gb4', lambda row: '7g8_gb4' in row.value)
    .cutout('value')
)
tbl_analysis_2.gt('tract_length', 10).display(20)


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|polymorphic 9|polymorphic_3d7_hb3 10|polymorphic_hb3_dd2 11|polymorphic_7g8_gb4
Pf3D7_01_v3 93247 93259 atttc 2 5 2 12 False False False False
Pf3D7_01_v3 93902 93941 at 19 2 1 39 True True False True
Pf3D7_01_v3 94126 94146 tatttta 2 7 6 20 False False False False
Pf3D7_01_v3 94258 94283 at 12 2 1 25 False False False False
Pf3D7_01_v3 94298 94312 ttataat 2 7 0 14 False False False False
Pf3D7_01_v3 94315 94329 tatttat 2 7 0 14 False False False False
Pf3D7_01_v3 94364 94376 atattt 2 6 0 12 False False False False
Pf3D7_01_v3 94433 94447 taa 4 3 2 14 False False False False
Pf3D7_01_v3 94497 94509 aatta 2 5 2 12 False False False False
Pf3D7_01_v3 94589 94615 at 13 2 0 26 True False False True
Pf3D7_01_v3 94748 94762 tataat 2 6 2 14 False False False False
Pf3D7_01_v3 94994 95022 at 14 2 0 28 True True True True
Pf3D7_01_v3 95579 95591 ggcaaa 2 6 0 12 False False False False
Pf3D7_01_v3 95950 95961 tatat 2 5 1 11 False False False False
Pf3D7_01_v3 96047 96068 a 21 1 0 21 False False False False
Pf3D7_01_v3 96091 96105 tataatt 2 7 0 14 False False False False
Pf3D7_01_v3 96265 96280 at 7 2 1 15 False False False False
Pf3D7_01_v3 96317 96340 t 23 1 0 23 False False False False
Pf3D7_01_v3 96390 96401 atcat 2 5 1 11 False False False False
Pf3D7_01_v3 96419 96437 a 18 1 0 18 False False False False

...


In [31]:
df2 = tbl_analysis_2.todataframe()
df2.head()


Out[31]:
chrom start stop repeat_unit n_units unit_length last_unit_length tract_length polymorphic polymorphic_3d7_hb3 polymorphic_hb3_dd2 polymorphic_7g8_gb4
0 Pf3D7_01_v3 92899 92907 taa 2 3 2 8 False False False False
1 Pf3D7_01_v3 92900 92902 a 2 1 0 2 False False False False
2 Pf3D7_01_v3 92903 92905 a 2 1 0 2 False False False False
3 Pf3D7_01_v3 92912 92918 ata 2 3 0 6 False False False False
4 Pf3D7_01_v3 92914 92916 a 2 1 0 2 False False False False

In [32]:
fig = plt.figure()
ax = fig.add_subplot(111)

for unit_length in 1, 2, 3, 4, 5, 6:
    s1 = df2[df2.unit_length == unit_length].tract_length
    s2 = df2[(df2.unit_length == unit_length) & df2.polymorphic].tract_length
    y1 = np.bincount(s1, minlength=70)[:70]
    y2 = np.bincount(s2, minlength=70)[:70]
    x = np.arange(0, 70, 1)
    y3 = y2 / y1
    ax.plot(x, y3, linestyle='-', marker=' ', lw=2, label=unit_length)

ax.set_xlim(1, 30)
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
ax.set_yscale('log');



In [ ]: