In [1]:
%run ../../shared_setup.ipynb
In [2]:
tbl_dups = tbl_samples.duplicates(key=('cross', 'clone'))
tbl_dups.displayall()
In [3]:
tbl_dup_pairs = (
tbl_dups
.cut('cross', 'clone', 'ID')
.join(
tbl_dups.cut('clone', 'ID'),
key='clone'
)
.rename({2: 'ID1', 3: 'ID2'})
.select(lambda row: row.ID1 != row.ID2)
.addfield('pair', lambda row: sorted({row.ID1, row.ID2}))
.cutout('ID1', 'ID2')
.mergeduplicates(key='pair')
.sort(key=('cross', 'clone', 'pair'))
)
tbl_dup_pairs.displayall()
In [4]:
gatk = load_callsets(GATK_CALLSET_FN_TEMPLATE,
crosses=('3d7_hb3', '7g8_gb4'),
variant_filter='FILTER_PASS',
call_filter=gatk_conf_calls)
In [5]:
cortex = load_callsets(CORTEX_CALLSET_FN_TEMPLATE,
crosses=('3d7_hb3', '7g8_gb4'),
variant_filter='FILTER_PASS',
call_filter=cortex_conf_calls)
In [6]:
combined = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
crosses=('3d7_hb3', '7g8_gb4'),
variant_filter='FILTER_PASS',
call_filter=combined_conf_calls)
In [7]:
def concordance(callsets, cross, pair):
callset = callsets[cross]
sample1, sample2 = pair
g1 = callset['calldata'][sample1]['genotype']
g2 = callset['calldata'][sample2]['genotype']
called = (g1 >= 0) & (g2 >= 0)
concord = called & (g1 == g2)
discord = called & (g1 != g2)
return nnz(concord), nnz(discord)
In [8]:
tbl_dup_pairs_discordance = (
tbl_dup_pairs
.addfield('gatk', lambda row: concordance(gatk, row.cross, row.pair))
.addfield('cortex', lambda row: concordance(cortex, row.cross, row.pair))
.addfield('combined', lambda row: concordance(combined, row.cross, row.pair))
.convert(('gatk', 'cortex', 'combined'), lambda v: '%s/%s' % (v[1], v[0] + v[1]))
.convert('cross', LABELS)
.movefield('pair', 2)
.convert('pair', lambda v: '%s vs %s' % tuple(v))
.rename({'cross': 'Cross',
'clone': 'Clone',
'pair': 'Replicate pair',
'gatk': 'Discordance (BWA/GATK callset)',
'cortex': 'Discordance (Cortex callset)',
'combined': 'Discordance (Combined callset)',})
)
tbl_dup_pairs_discordance.displayall(caption='duplicate discordance', index_header=False)