Supplementary data: recombination events

Setup


In [1]:
%run ../../shared_setup.ipynb


docker image cggh/biipy:v1.6.0

In [2]:
# load variation data

# Build the per-cross exclusion lists from fresh list objects. A shallow
# dup_samples.copy() followed by `+=` would extend the lists owned by
# dup_samples in place (when they are lists), so re-running this cell would
# keep appending the same samples to the shared setup state.
sample_exclusions = {cross: list(excluded) for cross, excluded in dup_samples.items()}
for cross, excluded in excessive_recomb_samples.items():
    sample_exclusions[cross] = sample_exclusions[cross] + list(excluded)

callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
                         sample_exclusions=sample_exclusions,
                         variant_filter='FILTER_PASS',
                         call_filter=combined_conf_calls)

# sample names per cross, taken from the calldata field names
samples = {cross: callsets[cross]['calldata'].dtype.names
           for cross in CROSSES}
# the first two samples of each cross are skipped here
# (presumably the two parents - confirm against the callset layout)
progeny = {cross: samples[cross][2:] for cross in CROSSES}
n_progeny = {cross: len(progeny[cross]) for cross in CROSSES}
print(n_progeny)
print(np.sum(list(n_progeny.values())))


2016-03-08 18:00:14.144437 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2016-03-08 18:00:14.837650 :: filter variants: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2016-03-08 18:00:14.855152 :: filter samples: excluding ['C01/PG0062-C/ERR019070', 'C02/PG0053-C/ERR019067', 'C02/PG0055-C/ERR019066', 'C02/PG0056-C/ERR019068'] including ['3D7/PG0051-C/ERR019061', 'HB3/PG0052-C/ERR019054', 'C01/PG0065-C/ERR019064', 'C02/PG0067-C/ERR019073', 'C03/PG0066-C/ERR019072', 'C04/PG0061-C/ERR019059', 'C05/PG0068-C/ERR019065', 'C06/PG0069-C/ERR019055', 'C07/PG0070-C/ERR019056', 'C08/PG0071-C/ERR019074', 'C09/PG0072-C/ERR019057', 'C10/PG0063-C/ERR019060', 'C11/PG0064-C/ERR019071', 'C12/PG0058-C/ERR019063', 'C13/PG0054-C/ERR019062', 'C14/PG0060-C/ERR019058', 'C15/PG0057-C/ERR019069']
2016-03-08 18:00:14.901409 :: filter calls: excluding 2216 (0.3%) retaining 713263 (99.7%) of 715479 calls
2016-03-08 18:00:14.902387 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2016-03-08 18:00:15.332268 :: filter variants: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2016-03-08 18:00:15.350733 :: filter samples: excluding ['SC01/PG0025-C/ERR019045'] including ['HB3/PG0004-CW/ERR012788', 'DD2/PG0008-CW/ERR012840', '1BB5/PG0023-C/ERR015449', '3BA6/PG0022-Cx/ERR126027', '3BD5/PG0024-C/ERR019053', '7C101/PG0074-C/ERR019048', '7C111/PG0038-C/ERR015457', '7C12/PG0035-Cx/ERR037704', '7C126/PG0047-C/ERR015452', '7C140/PG0039-C/ERR015454', '7C159/PG0040-Cx/ERR107475', '7C16/PG0036-C/ERR015455', '7C170/PG0041-C/ERR015446', '7C183/PG0042-C/ERR015448', '7C188/PG0030-C/ERR019046', '7C20/PG0037-C/ERR015451', '7C3/PG0034-C/ERR019047', '7C408/PG0031-C/ERR015458', '7C421/PG0043-C/ERR015459', '7C424/PG0044-C/ERR019043', '7C46/PG0046-Cx/ERR107476', '7C7/PG0048-C/ERR019049', 'B1SD/PG0015-C/ERR019044', 'B4R3/PG0018-C/ERR019042', 'CH3_116/PG0032-Cx/ERR037703', 'CH3_61/PG0033-Cx/ERR175544', 'D43/PG0029-Cx/ERR107474', 'GC03/PG0021-C/ERR015447', 'GC06/PG0028-C/ERR015456', 'QC01/PG0017-C/ERR019050', 'QC13/PG0016-C/ERR012895', 'QC23/PG0045-C/ERR012892', 'QC34/PG0026-C/ERR015453', 'SC05/PG0019-C/ERR019051', 'TC05/PG0027-C/ERR015450', 'TC08/PG0020-C/ERR019052']
2016-03-08 18:00:15.431000 :: filter calls: excluding 28934 (2.2%) retaining 1283662 (97.8%) of 1312596 calls
2016-03-08 18:00:15.433186 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2016-03-08 18:00:15.855605 :: filter variants: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants
2016-03-08 18:00:15.874058 :: filter samples: excluding ['AUD/PG0112-CW/ERR045639', 'JC9/PG0111-C/ERR029409', 'JE11/PG0100-CW/ERR045630', 'JF6/PG0079-CW/ERR045637', 'KB8/PG0104-CW/ERR045642', 'LA10/PG0086-CW/ERR045629', 'NIC/PG0095-C/ERR027107', 'QF5/PG0078-CW/ERR045638', 'XD8/PG0105-CW/ERR045628', 'XF12/PG0102-CW/ERR045635', 'D2/PG0094-CW/ERR045632'] including ['7G8/PG0083-C/ERR027099', 'GB4/PG0084-C/ERR027100', 'AL2/PG0103-CW/ERR045627', 'AUD/PG0112-C/ERR029406', 'DAN/PG0098-C/ERR027110', 'DEV/PG0081-CW/ERR045633', 'JB12/PG0099-C/ERR029146', 'JB8/PG0087-C/ERR029091', 'JC3/PG0077-CW/ERR045636', 'JC9/PG0111-CW/ERR045634', 'JE11/PG0100-C/ERR029404', 'JF6/PG0079-C/ERR027102', 'JON/PG0107-C/ERR029408', 'KA6/PG0091-C/ERR027117', 'KB8/PG0104-C/ERR029148', 'KH7/PG0088-C/ERR027111', 'LA10/PG0086-C/ERR029090', 'NF10/PG0096-C/ERR027108', 'NIC/PG0095-CW/ERR045631', 'QF5/PG0078-C/ERR029092', 'TF1/PG0080-C/ERR027103', 'WC4/PG0082-C/ERR029093', 'WE2/PG0085-C/ERR027101', 'WF12/PG0097-C/ERR027109', 'XB3/PG0093-C/ERR029105', 'XD8/PG0105-C/ERR029144', 'XE7/PG0106-C/ERR029407', 'XF12/PG0102-C/ERR029143', 'XG10/PG0109-C/ERR029405']
2016-03-08 18:00:15.940467 :: filter calls: excluding 12209 (1.2%) retaining 987450 (98.8%) of 999659 calls
{'7g8_gb4': 27, 'hb3_dd2': 34, '3d7_hb3': 15}
76

CO recombination events

To infer crossover (CO) events we first mask out genotype calls on inheritance blocks shorter than 10 kb.


In [3]:
# minimal block length used when masking calls prior to CO inference
min_co_block_size = 10000
# per-cross callsets with calls on shorter blocks filtered out
callsets_co = dict(
    (cross, filter_calls(callsets[cross], min_haplen_calls(min_co_block_size)))
    for cross in CROSSES
)


2016-03-08 18:00:21.507979 :: filter calls: excluding 786 (0.1%) retaining 714693 (99.9%) of 715479 calls
2016-03-08 18:00:21.690570 :: filter calls: excluding 2121 (0.2%) retaining 1310475 (99.8%) of 1312596 calls
2016-03-08 18:00:21.826622 :: filter calls: excluding 1683 (0.2%) retaining 997976 (99.8%) of 999659 calls

In [4]:
def tabulate_crossovers(cross):
    """Tabulate inheritance switches for one cross as CO events.

    Reads the masked callset for `cross` from `callsets_co`, tabulates the
    switches, tags every row with the cross name, renames the position fields
    to `co_pos_*` and replaces the numeric `from`/`to` fields with the
    corresponding parent names.

    Parameters
    ----------
    cross : str
        Cross identifier of the form '<parent>_<parent>'; used as the key
        into `callsets_co`.

    Returns
    -------
    The resulting table, wrapped with `etl.wrap`.
    """
    variants, calldata, _ = unpack_callset(callsets_co[cross])

    def parent_name(field):
        # The value of `field` minus 7 indexes into the two parent names
        # obtained by splitting the cross identifier on '_'
        # (presumably 7 and 8 are the two parental inheritance codes - confirm).
        return lambda r: r.cross.split('_')[r[field] - 7]

    co_field_names = {
        'pos': 'co_pos_mid',
        'lpos': 'co_pos_min',
        'rpos': 'co_pos_max',
        'range': 'co_pos_range',
    }

    switches = tabulate_switches(variants, calldata)
    switches = switches.addfield('cross', cross)
    switches = switches.rename(co_field_names)
    switches = switches.addfield('co_from_parent', parent_name('from'))
    switches = switches.addfield('co_to_parent', parent_name('to'))
    # the numeric codes are no longer needed once the parent names are present
    switches = switches.cutout('from', 'to')
    return etl.wrap(switches)


# concatenate the CO tables of all crosses and order them by genomic position
tbl_co = etl.cat(*map(tabulate_crossovers, CROSSES)).sort(key=('chrom', 'co_pos_mid'))

# export as tab-delimited text and as a pickle
tbl_co.totsv(os.path.join(PUBLIC_DIR, 'tbl_co.txt'))
tbl_co.topickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))

display_with_nrows(tbl_co, caption='CO events')


CO events (1194 rows)
0|sample 1|chrom 2|co_pos_mid 3|co_pos_min 4|co_pos_max 5|co_pos_range 6|cross 7|co_from_parent 8|co_to_parent
B1SD/PG0015-C/ERR019044 b'Pf3D7_01_v3' 145052 144877 145227 350 hb3_dd2 hb3 dd2
GC03/PG0021-C/ERR015447 b'Pf3D7_01_v3' 163584 163145 164024 879 hb3_dd2 dd2 hb3
XF12/PG0102-C/ERR029143 b'Pf3D7_01_v3' 206769 205803 207736 1933 7g8_gb4 gb4 7g8
7C159/PG0040-Cx/ERR107475 b'Pf3D7_01_v3' 206905 206074 207736 1662 hb3_dd2 hb3 dd2
CH3_61/PG0033-Cx/ERR175544 b'Pf3D7_01_v3' 206905 206074 207736 1662 hb3_dd2 dd2 hb3

...

Conversion tracts

Identify short blocks

Start by tabulating all the short inheritance blocks individually.


In [5]:
def tabulate_short_inheritance_blocks(cross):
    """Tabulate the short inheritance blocks found in one cross.

    Uses the unmasked callset in `callsets[cross]`, takes the block table
    returned by `haplotypes` and keeps blocks whose `length_min` is below
    `min_co_block_size` and whose previous and next inheritance are both
    different from -1. Each kept row is tagged with the cross name and
    initialised with `is_complex=False` and `blocks=1`.

    Parameters
    ----------
    cross : str
        Key into `callsets`.

    Returns
    -------
    The filtered and annotated block table.
    """
    variants, calldata, _ = unpack_callset(callsets[cross])
    _, _, all_blocks = haplotypes(variants, calldata)

    def is_short_flanked_block(r):
        # shorter than the CO block size threshold ...
        if not r.length_min < min_co_block_size:
            return False
        # ... and flanked on both sides
        # (presumably -1 marks a missing neighbouring block - confirm)
        return r.nxt_inheritance != -1 and r.prv_inheritance != -1

    short_blocks = all_blocks.select(is_short_flanked_block)
    short_blocks = short_blocks.addfield('cross', cross)
    short_blocks = short_blocks.addfield('is_complex', False)
    short_blocks = short_blocks.addfield('blocks', 1)
    return short_blocks


# short blocks from all crosses, ordered so that blocks of the same sample and
# chromosome appear consecutively by start position
tbl_short_blocks = etl.cat(*map(tabulate_short_inheritance_blocks, CROSSES)).sort(
    key=('sample', 'chrom', 'start_min')
)
display_with_nrows(tbl_short_blocks, caption='short inheritance blocks')


short inheritance blocks (1183 rows)
0|sample 1|chrom 2|start_min 3|start_mid 4|start_max 5|stop_min 6|stop_mid 7|stop_max 8|length_min 9|length_mid 10|length_max 11|support 12|prv_inheritance 13|inheritance 14|nxt_inheritance 15|cross 16|is_complex 17|blocks
1BB5/PG0023-C/ERR015449 b'Pf3D7_02_v3' 551191 551710 552230 553769 554132 554496 1539 2422 3305 5 8 7 8 hb3_dd2 False 1
1BB5/PG0023-C/ERR015449 b'Pf3D7_03_v3' 810043 810060 810077 810077 811042 812008 0 982 1965 1 8 7 8 hb3_dd2 False 1
1BB5/PG0023-C/ERR015449 b'Pf3D7_05_v3' 973050 973180 973311 973311 974623 975935 0 1443 2885 1 7 8 7 hb3_dd2 False 1
1BB5/PG0023-C/ERR015449 b'Pf3D7_08_v3' 1293542 1294381 1295221 1296788 1297035 1297283 1567 2654 3741 4 7 8 7 hb3_dd2 False 1
1BB5/PG0023-C/ERR015449 b'Pf3D7_08_v3' 1314649 1314740 1314831 1314831 1315166 1315502 0 426 853 1 7 8 7 hb3_dd2 False 1

...


In [6]:
# number of short inheritance blocks per sample (first ten rows)
tbl_short_blocks.valuecounts('sample').head(10)


Out[6]:
0|sample 1|count 2|frequency
CH3_116/PG0032-Cx/ERR037703 48 0.04057480980557904
CH3_61/PG0033-Cx/ERR175544 47 0.039729501267962805
JF6/PG0079-C/ERR027102 38 0.032121724429416736
JB12/PG0099-C/ERR029146 36 0.030431107354184278
B1SD/PG0015-C/ERR019044 35 0.029585798816568046

...


In [7]:
# distribution of the number of short inheritance blocks per sample
df_sb = tbl_short_blocks.valuecounts('sample').todataframe()
plt.hist(df_sb['count'])
# label the axes so the figure can be read on its own
plt.xlabel('short inheritance blocks per sample')
plt.ylabel('number of samples');


Combine adjacent blocks into conversion tracts

Now merge adjacent short blocks (same sample and chromosome, with one block ending where the next begins) into single conversion tracts.


In [8]:
class MergeAdjacentBlocks(object):
    """Table container that merges runs of adjacent blocks into tracts.

    Iterating yields a header row followed by one row per tract. Consecutive
    source records are merged into one tract when they have the same sample
    and chromosome and the `stop_mid` of the tract so far equals the
    `start_mid` of the next record. Only consecutive records are compared, so
    the source must already be ordered such that adjacent blocks appear on
    consecutive rows.

    A merged tract keeps the start coordinates of its first block, takes the
    stop coordinates of its last block, sums `support`, recomputes the three
    lengths from the combined coordinates, has `is_complex` set to True and
    counts the merged blocks in `blocks`. Unmerged blocks are passed through
    with their original values.

    Parameters
    ----------
    source : table
        Anything accepted by `etl.wrap` whose records expose the fields named
        in the output header as attributes.
    """

    # output header; also the order of the values within every yielded row
    _out_fields = (
        'sample', 'cross', 'chrom',
        'start_min', 'start_mid', 'start_max',
        'stop_min', 'stop_mid', 'stop_max',
        'length_min', 'length_mid', 'length_max',
        'support', 'is_complex', 'blocks',
    )

    def __init__(self, source):
        self.source = source

    def _new_tract(self, rec):
        # start a tract from a single block, copying every output field
        return {name: getattr(rec, name) for name in self._out_fields}

    def _as_row(self, tract):
        # flatten the tract state into a row in header order
        return tuple(tract[name] for name in self._out_fields)

    def __iter__(self):
        records = iter(etl.wrap(self.source).records())
        yield list(self._out_fields)

        # An empty source yields the header only. An unguarded next() here
        # would raise StopIteration inside the generator, which Python 3.7+
        # turns into a RuntimeError.
        first = next(records, None)
        if first is None:
            return

        tract = self._new_tract(first)
        for cur in records:
            adjacent = (
                tract['sample'] == cur.sample
                and tract['chrom'] == cur.chrom
                and tract['stop_mid'] == cur.start_mid
            )
            if adjacent:
                # extend the current tract to the end of this block
                tract['stop_min'] = cur.stop_min
                tract['stop_mid'] = cur.stop_mid
                tract['stop_max'] = cur.stop_max
                tract['support'] += cur.support
                # shortest span: innermost start/stop bounds; longest: outermost
                tract['length_min'] = tract['stop_min'] - tract['start_max']
                tract['length_max'] = tract['stop_max'] - tract['start_min']
                tract['length_mid'] = tract['stop_mid'] - tract['start_mid']
                tract['is_complex'] = True
                tract['blocks'] += 1
            else:
                # the current tract is complete; emit it and start a new one
                yield self._as_row(tract)
                tract = self._new_tract(cur)

        # emit the tract still being accumulated when the input ran out
        yield self._as_row(tract)

In [9]:
# drop the flanking-inheritance fields, then merge adjacent short blocks into tracts
tbl_conversion_tracts = etl.wrap(
    MergeAdjacentBlocks(tbl_short_blocks.cutout('prv_inheritance', 'inheritance', 'nxt_inheritance'))
)
display_with_nrows(tbl_conversion_tracts, caption='conversion tracts')


conversion tracts (921 rows)
0|sample 1|cross 2|chrom 3|start_min 4|start_mid 5|start_max 6|stop_min 7|stop_mid 8|stop_max 9|length_min 10|length_mid 11|length_max 12|support 13|is_complex 14|blocks
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_02_v3' 551191 551710 552230 553769 554132 554496 1539 2422 3305 5 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_03_v3' 810043 810060 810077 810077 811042 812008 0 982 1965 1 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_05_v3' 973050 973180 973311 973311 974623 975935 0 1443 2885 1 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1293542 1294381 1295221 1296788 1297035 1297283 1567 2654 3741 4 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1314649 1314740 1314831 1314831 1315166 1315502 0 426 853 1 False 1

...


In [10]:
# number of conversion tracts per sample (first rows)
tbl_conversion_tracts.valuecounts('sample').head()


Out[10]:
0|sample 1|count 2|frequency
CH3_116/PG0032-Cx/ERR037703 44 0.04777415852334419
CH3_61/PG0033-Cx/ERR175544 38 0.04125950054288816
JB12/PG0099-C/ERR029146 32 0.03474484256243214
B1SD/PG0015-C/ERR019044 30 0.03257328990228013
7C183/PG0042-C/ERR015448 26 0.02823018458197611

In [11]:
# tracts built from a single block (False) versus merged from several blocks (True)
tbl_conversion_tracts.valuecounts('is_complex')


Out[11]:
0|is_complex 1|count 2|frequency
False 773 0.8393051031487514
True 148 0.16069489685124863

In [12]:
# distribution of the number of blocks merged into each tract
tbl_conversion_tracts.valuecounts('blocks')


Out[12]:
0|blocks 1|count 2|frequency
1 773 0.8393051031487514
2 86 0.09337676438653637
3 42 0.04560260586319218
4 10 0.010857763300760043
5 3 0.003257328990228013

...


In [13]:
# distribution of the number of conversion tracts per sample
X = tbl_conversion_tracts.valuecounts('sample').values('count').list()
plt.hist(X)
# label the axes so the figure can be read on its own
plt.xlabel('conversion tracts per sample')
plt.ylabel('number of samples');


Identify tracts with robust support


In [14]:
# keep only tracts with support greater than 1 and a minimal length greater than 100
# (NOTE(review): support is presumably the number of supporting variants - confirm)
tbl_tracts_robust = tbl_conversion_tracts.select(lambda r: r.support > 1 and r.length_min > 100)
display_with_nrows(tbl_tracts_robust, caption='conversion tracts with robust support')


conversion tracts with robust support (331 rows)
0|sample 1|cross 2|chrom 3|start_min 4|start_mid 5|start_max 6|stop_min 7|stop_mid 8|stop_max 9|length_min 10|length_mid 11|length_max 12|support 13|is_complex 14|blocks
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_02_v3' 551191 551710 552230 553769 554132 554496 1539 2422 3305 5 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1293542 1294381 1295221 1296788 1297035 1297283 1567 2654 3741 4 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1337134 1337350 1337567 1346702 1347132 1347563 9135 9782 10429 24 True 8
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_09_v3' 950476 951138 951801 954256 954735 955215 2455 3597 4739 4 True 3
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_12_v3' 331487 331820 332154 336886 337497 338108 4732 5677 6621 7 False 1

...


In [15]:
# robust tracts built from a single block (False) versus merged from several blocks (True)
tbl_tracts_robust.valuecounts('is_complex')


Out[15]:
0|is_complex 1|count 2|frequency
False 190 0.5740181268882175
True 141 0.4259818731117825

In [16]:
# distribution of the number of blocks merged into each robust tract
tbl_tracts_robust.valuecounts('blocks')


Out[16]:
0|blocks 1|count 2|frequency
1 190 0.5740181268882175
2 82 0.24773413897280966
3 39 0.11782477341389729
4 10 0.030211480362537766
5 3 0.00906344410876133

...


In [17]:
# first and last five rows of the per-sample counts of robust tracts
tbl_tracts_robust.valuecounts('sample').head(5).display()
tbl_tracts_robust.valuecounts('sample').tail(5).display()


0|sample 1|count 2|frequency
7C111/PG0038-C/ERR015457 10 0.030211480362537766
JF6/PG0079-C/ERR027102 10 0.030211480362537766
CH3_61/PG0033-Cx/ERR175544 9 0.027190332326283987
KA6/PG0091-C/ERR027117 8 0.02416918429003021
C09/PG0072-C/ERR019057 8 0.02416918429003021
0|sample 1|count 2|frequency
TF1/PG0080-C/ERR027103 2 0.006042296072507553
XF12/PG0102-C/ERR029143 1 0.0030211480362537764
7C126/PG0047-C/ERR015452 1 0.0030211480362537764
QC34/PG0026-C/ERR015453 1 0.0030211480362537764
7C408/PG0031-C/ERR015458 1 0.0030211480362537764

In [18]:
# distribution of the number of robustly supported conversion tracts per sample
X = tbl_tracts_robust.valuecounts('sample').values('count').list()
plt.hist(X, bins=10)
# label the axes so the figure can be read on its own
plt.xlabel('robustly supported conversion tracts per sample')
plt.ylabel('number of samples');



In [19]:
# distribution of the minimal length of robustly supported conversion tracts
X = tbl_tracts_robust.values('length_min').list()
plt.hist(X, bins=60)
# label the axes so the figure can be read on its own
plt.xlabel('minimal tract length (bp)')
plt.ylabel('number of tracts');


Differentiate CO and NCO conversion tracts

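Now determine which conversion tracts are associated with a crossover (CO) and which are non-crossovers (NCO): a tract is labelled CO if its interval overlaps the interval of a CO event in the same sample and chromosome, and NCO otherwise.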


In [20]:
# Left-join every robust tract against the CO events of the same sample and
# chromosome whose position interval overlaps the tract interval. Tracts
# without an overlapping CO get None in the co_pos_* fields and are labelled
# NCO; all others are labelled CO.
#
# The CO table is reduced to its join key and position fields *before* the
# join, and the two join keys are dropped by name afterwards. This avoids
# positional column indices, which would silently remove the wrong columns if
# the schema of either input table changed.
tbl_tracts_differentiated = (
    tbl_tracts_robust
    .addfield('facet', lambda r: '%s_%s' % (r.sample, r.chrom))
    .intervalleftjoin(
        tbl_co
        .addfield('co_facet', lambda r: '%s_%s' % (r.sample, r.chrom))
        .cut('co_facet', 'co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range'),
        lkey='facet',
        rkey='co_facet',
        lstart='start_min',
        lstop='stop_max',
        rstart='co_pos_min',
        rstop='co_pos_max')
    .cutout('facet', 'co_facet')
    .rename({
        'start_min': 'tract_start_min',
        'start_mid': 'tract_start_mid',
        'start_max': 'tract_start_max',
        'stop_min': 'tract_stop_min',
        'stop_mid': 'tract_stop_mid',
        'stop_max': 'tract_stop_max',
        'length_min': 'tract_length_min',
        'length_mid': 'tract_length_mid',
        'length_max': 'tract_length_max',
        'support': 'tract_support',
        'is_complex': 'tract_is_complex',
        'blocks': 'tract_blocks',
    })
    .addfield('tract_type', lambda row: 'NCO' if row.co_pos_min is None else 'CO')
)
# export as a pickle and as tab-delimited text
tbl_tracts_differentiated.topickle(os.path.join(PUBLIC_DIR, 'tbl_conversion_tracts.pickle'))
tbl_tracts_differentiated.totsv(os.path.join(PUBLIC_DIR, 'tbl_conversion_tracts.txt'))
display_with_nrows(tbl_tracts_differentiated, caption='differentiated conversion tracts')


differentiated conversion tracts (331 rows)
0|sample 1|cross 2|chrom 3|tract_start_min 4|tract_start_mid 5|tract_start_max 6|tract_stop_min 7|tract_stop_mid 8|tract_stop_max 9|tract_length_min 10|tract_length_mid 11|tract_length_max 12|tract_support 13|tract_is_complex 14|tract_blocks 15|co_pos_mid 16|co_pos_min 17|co_pos_max 18|co_pos_range 19|tract_type
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_02_v3' 551191 551710 552230 553769 554132 554496 1539 2422 3305 5 False 1 None None None None NCO
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1293542 1294381 1295221 1296788 1297035 1297283 1567 2654 3741 4 False 1 None None None None NCO
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1337134 1337350 1337567 1346702 1347132 1347563 9135 9782 10429 24 True 8 1342348 1337134 1347563 10429 CO
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_09_v3' 950476 951138 951801 954256 954735 955215 2455 3597 4739 4 True 3 None None None None NCO
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_12_v3' 331487 331820 332154 336886 337497 338108 4732 5677 6621 7 False 1 None None None None NCO

...


In [21]:
# number of tracts labelled CO versus NCO
tbl_tracts_differentiated.valuecounts('tract_type')


Out[21]:
0|tract_type 1|count 2|frequency
NCO 235 0.7099697885196374
CO 96 0.29003021148036257

In [22]:
# tracts labelled CO, i.e. those matched to a CO event by the interval join
tbl_tracts_co = tbl_tracts_differentiated.eq('tract_type', 'CO')
display_with_nrows(tbl_tracts_co, caption='CO conversion tracts')


CO conversion tracts (96 rows)
0|sample 1|cross 2|chrom 3|tract_start_min 4|tract_start_mid 5|tract_start_max 6|tract_stop_min 7|tract_stop_mid 8|tract_stop_max 9|tract_length_min 10|tract_length_mid 11|tract_length_max 12|tract_support 13|tract_is_complex 14|tract_blocks 15|co_pos_mid 16|co_pos_min 17|co_pos_max 18|co_pos_range 19|tract_type
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1337134 1337350 1337567 1346702 1347132 1347563 9135 9782 10429 24 True 8 1342348 1337134 1347563 10429 CO
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_03_v3' 912649 912727 912805 916518 916794 917070 3713 4067 4421 8 True 2 914859 912649 917070 4421 CO
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_04_v3' 729031 729218 729405 733911 734356 734801 4506 5138 5770 7 True 2 731916 729031 734801 5770 CO
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_10_v3' 225428 226716 228004 229437 229802 230167 1433 3086 4739 3 True 2 227797 225428 230167 4739 CO
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_10_v3' 1341344 1342132 1342921 1361560 1361995 1362430 18639 19863 21086 32 True 6 1351887 1341344 1362430 21086 CO

...


In [23]:
# Tracts labelled NCO. The CO position fields (all None for these rows) and
# the now-constant tract_type field are dropped by name rather than by column
# position, so the selection stays correct if the table schema changes.
tbl_tracts_nco = (
    tbl_tracts_differentiated
    .eq('tract_type', 'NCO')
    .cutout('co_pos_mid', 'co_pos_min', 'co_pos_max', 'co_pos_range', 'tract_type')
)
display_with_nrows(tbl_tracts_nco, caption='NCO conversion tracts')


NCO conversion tracts (235 rows)
0|sample 1|cross 2|chrom 3|tract_start_min 4|tract_start_mid 5|tract_start_max 6|tract_stop_min 7|tract_stop_mid 8|tract_stop_max 9|tract_length_min 10|tract_length_mid 11|tract_length_max 12|tract_support 13|tract_is_complex 14|tract_blocks
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_02_v3' 551191 551710 552230 553769 554132 554496 1539 2422 3305 5 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_08_v3' 1293542 1294381 1295221 1296788 1297035 1297283 1567 2654 3741 4 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_09_v3' 950476 951138 951801 954256 954735 955215 2455 3597 4739 4 True 3
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_12_v3' 331487 331820 332154 336886 337497 338108 4732 5677 6621 7 False 1
1BB5/PG0023-C/ERR015449 hb3_dd2 b'Pf3D7_13_v3' 756601 756957 757313 760183 761269 762355 2870 4312 5754 9 False 1

...


In [24]:
# NCO tracts built from a single block (False) versus merged from several blocks (True)
tbl_tracts_nco.valuecounts('tract_is_complex')


Out[24]:
0|tract_is_complex 1|count 2|frequency
False 190 0.8085106382978723
True 45 0.19148936170212766

In [25]:
# list every tract whose minimal length exceeds 15000
tbl_tracts_differentiated.gt('tract_length_min', 15000).displayall()


0|sample 1|cross 2|chrom 3|tract_start_min 4|tract_start_mid 5|tract_start_max 6|tract_stop_min 7|tract_stop_mid 8|tract_stop_max 9|tract_length_min 10|tract_length_mid 11|tract_length_max 12|tract_support 13|tract_is_complex 14|tract_blocks 15|co_pos_mid 16|co_pos_min 17|co_pos_max 18|co_pos_range 19|tract_type
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_10_v3' 1308754 1309392 1310031 1330511 1330807 1331104 20480 21415 22350 39 True 3 None None None None NCO
3BD5/PG0024-C/ERR019053 hb3_dd2 b'Pf3D7_10_v3' 1341344 1342132 1342921 1361560 1361995 1362430 18639 19863 21086 32 True 6 1351887 1341344 1362430 21086 CO
7C3/PG0034-C/ERR019047 hb3_dd2 b'Pf3D7_08_v3' 1202631 1203122 1203613 1227264 1227421 1227578 23651 24299 24947 44 True 5 None None None None NCO
C04/PG0061-C/ERR019059 3d7_hb3 b'Pf3D7_14_v3' 2891817 2892879 2893941 2922794 2923016 2923239 28853 30137 31422 77 True 9 None None None None NCO
JC3/PG0077-CW/ERR045636 7g8_gb4 b'Pf3D7_08_v3' 1207822 1208720 1209619 1228966 1229286 1229606 19347 20566 21784 42 True 10 1218714 1207822 1229606 21784 CO
JF6/PG0079-C/ERR027102 7g8_gb4 b'Pf3D7_11_v3' 1685640 1686586 1687533 1713449 1713727 1714005 25916 27141 28365 40 True 7 None None None None NCO
JF6/PG0079-C/ERR027102 7g8_gb4 b'Pf3D7_11_v3' 1726074 1726125 1726177 1746099 1747010 1747922 19922 20885 21848 42 True 11 None None None None NCO

In [ ]: