In [1]:
%run ../../shared_setup.ipynb
In [2]:
# load variation data, excluding duplicate samples and samples showing
# excessive recombination
sample_exclusions = dup_samples.copy()
for cross in excessive_recomb_samples:
    sample_exclusions[cross] += excessive_recomb_samples[cross]
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
                         sample_exclusions=sample_exclusions,
                         variant_filter='FILTER_PASS',
                         call_filter=combined_conf_calls)
samples = {cross: callsets[cross]['calldata'].dtype.names
           for cross in CROSSES}
# the first two samples in each callset are the parents of the cross
progeny = {cross: samples[cross][2:] for cross in CROSSES}
n_progeny = {cross: len(progeny[cross]) for cross in CROSSES}
print(n_progeny)
print(np.sum(list(n_progeny.values())))
To infer CO events we first mask out genotypes on blocks shorter than 10 kb, so that short gene-conversion tracts do not interrupt the long parental haplotype blocks whose switches we count as crossovers.
In [3]:
min_co_block_size = 10000
callsets_co = {cross: filter_calls(callsets[cross],
                                   min_haplen_calls(min_co_block_size))
               for cross in CROSSES}
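For orientation, here is a minimal sketch of what a call filter like min_haplen_calls presumably returns, assuming (purely for illustration) that each call carries a haplotype-block length annotation named haplen; the real helper is defined in the shared setup and may work differently:

# hypothetical sketch only -- the real min_haplen_calls lives in the shared
# setup; 'haplen' is an assumed per-call annotation, not a confirmed field
def min_haplen_calls_sketch(min_len):
    def call_filter(calls):
        # keep only calls whose inheritance block spans at least min_len bp
        return calls['haplen'] >= min_len
    return call_filter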
In [4]:
def tabulate_crossovers(cross):
    variants, calldata, _ = unpack_callset(callsets_co[cross])
    tbl = (
        tabulate_switches(variants, calldata)
        .addfield('cross', cross)
        .rename({'pos': 'co_pos_mid', 'lpos': 'co_pos_min',
                 'rpos': 'co_pos_max', 'range': 'co_pos_range'})
        # map the 'from'/'to' inheritance codes onto parent names taken
        # from the cross identifier
        .addfield('co_from_parent', lambda r: r.cross.split('_')[r['from'] - 7])
        .addfield('co_to_parent', lambda r: r.cross.split('_')[r['to'] - 7])
        .cutout('from', 'to')
    )
    return etl.wrap(tbl)
tbl_co = (etl
    .cat(*[tabulate_crossovers(cross) for cross in CROSSES])
    .sort(key=('chrom', 'co_pos_mid'))
)
tbl_co.totsv(os.path.join(PUBLIC_DIR, 'tbl_co.txt'))
tbl_co.topickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
display_with_nrows(tbl_co, caption='CO events')
Start by tabulating all the short inheritance blocks individually; these are the candidate gene-conversion tracts.
In [5]:
def tabulate_short_inheritance_blocks(cross):
    variants, calldata, _ = unpack_callset(callsets[cross])
    _, _, tbl_blocks = haplotypes(variants, calldata)
    tbl = (
        tbl_blocks
        # keep short blocks that have called inheritance on both sides,
        # i.e., blocks not sitting at the edge of a haplotype
        .select(lambda r: r.length_min < min_co_block_size
                          and r.nxt_inheritance != -1
                          and r.prv_inheritance != -1)
        .addfield('cross', cross)
        .addfield('is_complex', False)
        .addfield('blocks', 1)
    )
    return tbl

tbl_short_blocks = (etl
    .cat(*[tabulate_short_inheritance_blocks(cross) for cross in CROSSES])
    .sort(key=('sample', 'chrom', 'start_min'))
)
display_with_nrows(tbl_short_blocks, caption='short inheritance blocks')
In [6]:
tbl_short_blocks.valuecounts('sample').head(10)
In [7]:
df_sb = tbl_short_blocks.valuecounts('sample').todataframe()
plt.hist(df_sb['count']);
Now combine adjacent blocks together: consecutive short blocks where the stop of one coincides with the start of the next (comparing midpoint coordinates) are merged into a single, possibly complex, conversion tract.
In [8]:
class MergeAdjacentBlocks(object):
    """Merge runs of adjacent blocks (same sample and chromosome, with the
    stop of one block coinciding with the start of the next) into single
    conversion tracts."""

    def __init__(self, source):
        self.source = source

    def __iter__(self):
        tbl = etl.wrap(self.source)
        it = iter(tbl.records())
        yield ['sample', 'cross', 'chrom',
               'start_min', 'start_mid', 'start_max',
               'stop_min', 'stop_mid', 'stop_max',
               'length_min', 'length_mid', 'length_max',
               'support', 'is_complex', 'blocks']
        # start accumulating from the first block
        cur = next(it)
        sample = cur.sample
        cross = cur.cross
        chrom = cur.chrom
        start_min = cur.start_min
        start_mid = cur.start_mid
        start_max = cur.start_max
        stop_min = cur.stop_min
        stop_mid = cur.stop_mid
        stop_max = cur.stop_max
        length_min = cur.length_min
        length_mid = cur.length_mid
        length_max = cur.length_max
        support = cur.support
        is_complex = cur.is_complex
        blocks = cur.blocks
        for cur in it:
            # is the current block adjacent to the one being accumulated?
            if sample == cur.sample and chrom == cur.chrom and stop_mid == cur.start_mid:
                # yes, merge: extend the stop coordinates, recompute lengths,
                # accumulate support, and flag the tract as complex
                stop_min = cur.stop_min
                stop_mid = cur.stop_mid
                stop_max = cur.stop_max
                support += cur.support
                length_min = stop_min - start_max
                length_mid = stop_mid - start_mid
                length_max = stop_max - start_min
                is_complex = True
                blocks += 1
            else:
                # no, yield the accumulated tract ...
                yield (sample, cross, chrom, start_min, start_mid, start_max,
                       stop_min, stop_mid, stop_max,
                       length_min, length_mid, length_max,
                       support, is_complex, blocks)
                # ... and start accumulating from the current block
                sample = cur.sample
                cross = cur.cross
                chrom = cur.chrom
                start_min = cur.start_min
                start_mid = cur.start_mid
                start_max = cur.start_max
                stop_min = cur.stop_min
                stop_mid = cur.stop_mid
                stop_max = cur.stop_max
                length_min = cur.length_min
                length_mid = cur.length_mid
                length_max = cur.length_max
                support = cur.support
                is_complex = cur.is_complex
                blocks = cur.blocks
        # yield the final accumulated tract
        yield (sample, cross, chrom, start_min, start_mid, start_max,
               stop_min, stop_mid, stop_max,
               length_min, length_mid, length_max,
               support, is_complex, blocks)
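As a quick sanity check on the merge logic, here it is run on a toy table (sample, cross and coordinate values are made up for illustration): the first two blocks abut on the same chromosome and should collapse into a single complex tract with summed support, while the third, on another chromosome, passes through unchanged.

toy_blocks = etl.wrap([
    ['sample', 'cross', 'chrom', 'start_min', 'start_mid', 'start_max',
     'stop_min', 'stop_mid', 'stop_max', 'length_min', 'length_mid',
     'length_max', 'support', 'is_complex', 'blocks'],
    ['s1', '3d7_hb3', 'chr1', 100, 150, 200, 800, 900, 1000, 600, 750, 900, 2, False, 1],
    ['s1', '3d7_hb3', 'chr1', 800, 900, 1000, 1500, 1600, 1700, 500, 700, 900, 3, False, 1],
    ['s1', '3d7_hb3', 'chr2', 100, 150, 200, 800, 900, 1000, 600, 750, 900, 1, False, 1],
])
# expect two rows out: the chr1 blocks merged (support=5, blocks=2,
# is_complex=True), the chr2 block unchanged
etl.wrap(MergeAdjacentBlocks(toy_blocks)).displayall()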
In [9]:
tbl_conversion_tracts = etl.wrap(
    MergeAdjacentBlocks(tbl_short_blocks.cutout('prv_inheritance', 'inheritance', 'nxt_inheritance'))
)
display_with_nrows(tbl_conversion_tracts, caption='conversion tracts')
In [10]:
tbl_conversion_tracts.valuecounts('sample').head()
In [11]:
tbl_conversion_tracts.valuecounts('is_complex')
In [12]:
tbl_conversion_tracts.valuecounts('blocks')
In [13]:
X = tbl_conversion_tracts.valuecounts('sample').values('count').list()
plt.hist(X);
In [14]:
# keep only tracts with robust support: more than one supporting site
# and a minimum length above 100 bp
tbl_tracts_robust = tbl_conversion_tracts.select(lambda r: r.support > 1 and r.length_min > 100)
display_with_nrows(tbl_tracts_robust, caption='conversion tracts with robust support')
In [15]:
tbl_tracts_robust.valuecounts('is_complex')
In [16]:
tbl_tracts_robust.valuecounts('blocks')
In [17]:
tbl_tracts_robust.valuecounts('sample').head(5).display()
tbl_tracts_robust.valuecounts('sample').tail(5).display()
In [18]:
X = tbl_tracts_robust.valuecounts('sample').values('count').list()
plt.hist(X, bins=10);
In [19]:
X = tbl_tracts_robust.values('length_min').list()
plt.hist(X, bins=60);
Now figure out which conversion tracts are associated with COs and which are NCOs, by interval-joining the tracts against the CO events for the same sample and chromosome: tracts overlapping a CO interval are CO-associated, the rest are NCOs.
In [20]:
tbl_tracts_differentiated = (
    tbl_tracts_robust
    .addfield('facet', lambda r: '%s_%s' % (r.sample, r.chrom))
    .intervalleftjoin(tbl_co.addfield('facet', lambda r: '%s_%s' % (r.sample, r.chrom)),
                      lkey='facet', rkey='facet',
                      lstart='start_min', lstop='stop_max',
                      rstart='co_pos_min', rstop='co_pos_max')
    # drop the facet columns and other redundant joined columns (by position)
    .cutout(15, 16, 17, 22, 23, 24, 25)
    .rename({
        'start_min': 'tract_start_min',
        'start_mid': 'tract_start_mid',
        'start_max': 'tract_start_max',
        'stop_min': 'tract_stop_min',
        'stop_mid': 'tract_stop_mid',
        'stop_max': 'tract_stop_max',
        'length_min': 'tract_length_min',
        'length_mid': 'tract_length_mid',
        'length_max': 'tract_length_max',
        'support': 'tract_support',
        'is_complex': 'tract_is_complex',
        'blocks': 'tract_blocks',
    })
    # tracts with no overlapping CO event have None in the joined columns
    .addfield('tract_type', lambda r: 'NCO' if r.co_pos_min is None else 'CO')
)
tbl_tracts_differentiated.topickle(os.path.join(PUBLIC_DIR, 'tbl_conversion_tracts.pickle'))
tbl_tracts_differentiated.totsv(os.path.join(PUBLIC_DIR, 'tbl_conversion_tracts.txt'))
display_with_nrows(tbl_tracts_differentiated, caption='differentiated conversion tracts')
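To make the classification rule concrete, here is the same join on a tiny made-up table (facets and coordinates are hypothetical): a tract overlapping a CO interval on its facet picks up the CO columns and is labelled CO, while a tract with no overlap gets None in the joined columns and is labelled NCO.

toy_tracts = etl.wrap([['facet', 'start_min', 'stop_max'],
                       ['s1_chr1', 100, 200],
                       ['s1_chr1', 5000, 5100]])
toy_cos = etl.wrap([['facet', 'co_pos_min', 'co_pos_max'],
                    ['s1_chr1', 150, 160]])
(toy_tracts
 # first tract overlaps the CO interval [150, 160] -> 'CO';
 # second tract has no overlap -> joined columns are None -> 'NCO'
 .intervalleftjoin(toy_cos, lkey='facet', rkey='facet',
                   lstart='start_min', lstop='stop_max',
                   rstart='co_pos_min', rstop='co_pos_max')
 .addfield('tract_type', lambda r: 'NCO' if r.co_pos_min is None else 'CO')
 .displayall())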
In [21]:
tbl_tracts_differentiated.valuecounts('tract_type')
In [22]:
tbl_tracts_co = tbl_tracts_differentiated.eq('tract_type', 'CO')
display_with_nrows(tbl_tracts_co, caption='CO conversion tracts')
In [23]:
# drop the CO coordinate columns, which are all None for NCO tracts
tbl_tracts_nco = tbl_tracts_differentiated.eq('tract_type', 'NCO').cutout(15, 16, 17, 18, 19)
display_with_nrows(tbl_tracts_nco, caption='NCO conversion tracts')
In [24]:
tbl_tracts_nco.valuecounts('tract_is_complex')
In [25]:
# take a closer look at unusually long conversion tracts
tbl_tracts_differentiated.gt('tract_length_min', 15000).displayall()