In [1]:
%run ../../shared_setup.ipynb


docker image cggh/biipy:v1.6.0

In [2]:
def tabulate(f):
    """Decorator turning a row-generating function into a lazy table class.

    The returned class derives from ``etl.Table``. Instantiating it only
    stores the call arguments; the wrapped function ``f`` is invoked each
    time the table is iterated, so the same table object can be traversed
    more than once.

    Parameters
    ----------
    f : callable
        Function (typically a generator function) returning an iterator of
        rows when called with the stored arguments.

    Returns
    -------
    type
        A subclass of ``etl.Table``; call it with the arguments intended
        for ``f`` to obtain a table.
    """

    class Tabulated(etl.Table):
        """Table whose rows are produced on demand by the wrapped function."""

        def __init__(self, *args, **kwargs):
            # Nothing is computed here: just remember how to call f later.
            self.args = args
            self.kwargs = kwargs

        def __iter__(self):
            # A fresh iterator per traversal keeps the table re-iterable.
            return f(*self.args, **self.kwargs)

    # Carry the wrapped function's identity over to the generated class so
    # that introspection shows something more useful than the generic
    # 'Tabulated' for every decorated function.
    Tabulated.__name__ = getattr(f, '__name__', Tabulated.__name__)
    Tabulated.__qualname__ = getattr(f, '__qualname__', Tabulated.__qualname__)
    Tabulated.__doc__ = getattr(f, '__doc__', None) or Tabulated.__doc__
    return Tabulated

In [3]:
@tabulate
def tabulate_core_windows(window_size):
    """Generate fixed-width windows tiling every 'Core' region.

    Yields a header row ``('chrom', 'start', 'stop')`` followed by one row
    per window. Windows within a region start at ``region_start`` and advance
    by ``window_size``; each ``stop`` is ``start + window_size - 1``, so
    consecutive windows do not overlap.

    Note that ``stop`` is not clipped to ``region_stop``: the final window of
    a region can extend past the end of that region.

    Parameters
    ----------
    window_size : int
        Step between window starts, and width of each window.
    """
    header = ('chrom', 'start', 'stop')
    yield header
    # Regions are read from the module-level table `tbl_regions_1b`.
    core_regions = tbl_regions_1b.eq('region_type', 'Core')
    for region in core_regions.records():
        window_starts = range(region.region_start, region.region_stop, window_size)
        for window_start in window_starts:
            window_stop = window_start + window_size - 1
            yield region.region_chrom, window_start, window_stop

In [4]:
# Load the table of crossover (CO) events from the shared public directory.
# NOTE(review): unpickling can execute arbitrary code; this is only safe if
# 'tbl_co.pickle' is a trusted, project-generated file - confirm.
tbl_co = (
    etl
    .frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
    # Decode byte-string chromosome names to text. Values that are already
    # str are passed through unchanged: str(v, 'ascii') would raise TypeError
    # on them, and a failed conversion may be silently replaced rather than
    # reported (depends on the table library's error settings).
    .convert('chrom', lambda v: v.decode('ascii') if isinstance(v, (bytes, bytearray)) else v)
)
display_with_nrows(tbl_co, caption='CO events')


CO events (1194 rows)
0|sample 1|chrom 2|co_pos_mid 3|co_pos_min 4|co_pos_max 5|co_pos_range 6|cross 7|co_from_parent 8|co_to_parent
B1SD/PG0015-C/ERR019044 Pf3D7_01_v3 145052 144877 145227 350 hb3_dd2 hb3 dd2
GC03/PG0021-C/ERR015447 Pf3D7_01_v3 163584 163145 164024 879 hb3_dd2 dd2 hb3
XF12/PG0102-C/ERR029143 Pf3D7_01_v3 206769 205803 207736 1933 7g8_gb4 gb4 7g8
7C159/PG0040-Cx/ERR107475 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 hb3 dd2
CH3_61/PG0033-Cx/ERR175544 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 dd2 hb3

...


In [5]:
# Tile the core regions with windows 5000 coordinate units wide.
tbl_windows = tabulate_core_windows(window_size=5000)
tbl_windows


Out[5]:
0|chrom 1|start 2|stop
Pf3D7_01_v3 92901 97900
Pf3D7_01_v3 97901 102900
Pf3D7_01_v3 102901 107900
Pf3D7_01_v3 107901 112900
Pf3D7_01_v3 112901 117900

...


In [10]:
# Count CO events per window, broken down by cross.
# NOTE(review): the join is on the [co_pos_min, co_pos_max] interval, so a CO
# whose interval overlaps two windows is presumably counted in both - confirm
# this is intended.
tbl_windows_co = (
    tbl_windows
    .intervalleftjoin(
        tbl_co,
        lkey='chrom', lstart='start', lstop='stop',
        rkey='chrom', rstart='co_pos_min', rstop='co_pos_max',
        include_stop=True,
    )
    # Drop the column at index 4 - presumably the duplicate 'chrom' brought in
    # from tbl_co (3 window columns + its position 1); confirm if the column
    # order of either table changes.
    .cutout(4)
    .aggregate(
        key=('chrom', 'start', 'stop'),
        # Windows without any CO carry a None 'cross'; skip those so such
        # windows end up with an empty Counter.
        aggregation=lambda crosses: collections.Counter(
            c for c in crosses if c is not None
        ),
        value='cross',
    )
    .rename('value', 'co_count')
    # A Counter returns 0 for a missing key, so absent crosses count as zero.
    .addfield('co_count_3d7_hb3', lambda r: r.co_count['3d7_hb3'])
    .addfield('co_count_hb3_dd2', lambda r: r.co_count['hb3_dd2'])
    .addfield('co_count_7g8_gb4', lambda r: r.co_count['7g8_gb4'])
)
tbl_windows_co


Out[10]:
0|chrom 1|start 2|stop 3|co_count 4|co_count_3d7_hb3 5|co_count_hb3_dd2 6|co_count_7g8_gb4
Pf3D7_01_v3 92901 97900 Counter() 0 0 0
Pf3D7_01_v3 97901 102900 Counter() 0 0 0
Pf3D7_01_v3 102901 107900 Counter() 0 0 0
Pf3D7_01_v3 107901 112900 Counter() 0 0 0
Pf3D7_01_v3 112901 117900 Counter() 0 0 0

...


In [11]:
# Distribution of per-window CO counts for cross '3d7_hb3'.
(
    tbl_windows_co
    .valuecounts('co_count_3d7_hb3')
    .displayall()
)


0|co_count_3d7_hb3 1|count 2|frequency
0 3811 0.912814371257485
1 314 0.07520958083832335
2 50 0.011976047904191617

In [12]:
# Distribution of per-window CO counts for cross 'hb3_dd2'.
(
    tbl_windows_co
    .valuecounts('co_count_hb3_dd2')
    .displayall()
)


0|co_count_hb3_dd2 1|count 2|frequency
0 3464 0.8297005988023952
1 603 0.1444311377245509
2 101 0.024191616766467066
3 7 0.0016766467065868263

In [13]:
# Distribution of per-window CO counts for cross '7g8_gb4'.
(
    tbl_windows_co
    .valuecounts('co_count_7g8_gb4')
    .displayall()
)


0|co_count_7g8_gb4 1|count 2|frequency
0 3688 0.8833532934131737
1 434 0.10395209580838323
2 48 0.011497005988023952
3 4 0.0009580838323353293
4 1 0.00023952095808383233

In [16]:
# Keep the windows in which at least one cross has two or more COs, and record
# in 'n_hot' how many of the three crosses reach that count.
tbl_hotspots = (
    tbl_windows_co
    .select(lambda r: any(
        n >= 2
        for n in (r.co_count_3d7_hb3, r.co_count_hb3_dd2, r.co_count_7g8_gb4)
    ))
    # Summing booleans counts the crosses that meet the threshold.
    .addfield('n_hot', lambda r: sum(
        n >= 2
        for n in (r.co_count_3d7_hb3, r.co_count_hb3_dd2, r.co_count_7g8_gb4)
    ))
)
tbl_hotspots


Out[16]:
0|chrom 1|start 2|stop 3|co_count 4|co_count_3d7_hb3 5|co_count_hb3_dd2 6|co_count_7g8_gb4 7|n_hot
Pf3D7_01_v3 202901 207900 Counter({'hb3_dd2': 2, '7g8_gb4': 1}) 0 2 1 1
Pf3D7_01_v3 322901 327900 Counter({'3d7_hb3': 2, 'hb3_dd2': 1}) 2 1 0 1
Pf3D7_01_v3 402901 407900 Counter({'7g8_gb4': 2, 'hb3_dd2': 1}) 0 1 2 1
Pf3D7_01_v3 550312 555311 Counter({'7g8_gb4': 2}) 0 0 2 1
Pf3D7_02_v3 205801 210800 Counter({'3d7_hb3': 2}) 2 0 0 1

...


In [17]:
# How many hotspot windows reach the threshold in one cross versus several.
(
    tbl_hotspots
    .valuecounts('n_hot')
)


Out[17]:
0|n_hot 1|count 2|frequency
1 197 0.9656862745098039
2 7 0.03431372549019608

In [18]:
# Total number of windows retained in tbl_hotspots.
tbl_hotspots.nrows()


Out[18]:
204

In [ ]: