In [1]:
%run ../../shared_setup.ipynb
In [2]:
gatk_raw = {cross: load_callset(GATK_CALLSET_FN_TEMPLATE.format(cross=cross))
for cross in CROSSES}
In [3]:
cortex_raw = {cross: load_callset(CORTEX_CALLSET_FN_TEMPLATE.format(cross=cross))
for cross in CROSSES}
In [4]:
gatk_query = '~FILTER_NON_SEGREGATING & ~FILTER_NON_CORE & ~FILTER_MISSING_PARENT & ~FILTER_DUP_SITE'
gatk_base = {cross: filter_variants(gatk_raw[cross], gatk_query)
for cross in CROSSES}
cortex_query = """(
~FILTER_NON_SEGREGATING & ~FILTER_NON_CORE & ~FILTER_MISSING_PARENT & ~FILTER_DUP_SITE
& ~FILTER_MAPQ
& ~FILTER_MISMAPPED_UNPLACEABLE
& ~FILTER_MULTIALLELIC
& ~FILTER_OVERLAPPING_SITE
& ~FILTER_PF_FAIL_ERROR
& ~FILTER_PF_FAIL_REPEAT
& ~FILTER_DUP_ALLELE
)"""
cortex_base = {cross: filter_variants(cortex_raw[cross], cortex_query)
for cross in CROSSES}
In [5]:
gatk_snp = {cross: filter_variants(gatk_base[cross], 'is_snp')
for cross in CROSSES}
cortex_snp = {cross: filter_variants(cortex_base[cross], 'is_snp')
for cross in CROSSES}
In [6]:
gatk_indel = {cross: filter_variants(gatk_base[cross], '~is_snp')
for cross in CROSSES}
cortex_indel = {cross: filter_variants(cortex_base[cross], '~is_snp')
for cross in CROSSES}
In [7]:
GATK_MIN_VQSLOD_SNP = {
'3d7_hb3': 2.5, 'hb3_dd2': 3., '7g8_gb4': 4.
}
GATK_MIN_VQSLOD_INDEL = {
'3d7_hb3': 1., 'hb3_dd2': 1.5, '7g8_gb4': 1.8
}
CORTEX_MIN_SITE_CONF_SNP = {
'3d7_hb3': 50, 'hb3_dd2': 50, '7g8_gb4': 50
}
CORTEX_MIN_SITE_CONF_INDEL = {
'3d7_hb3': 200, 'hb3_dd2': 200, '7g8_gb4': 200
}
In [8]:
def plot_me_roc(ax, callset, variant_thresholds, call_thresholds, variant_field, call_field):
# iterate over call thresholds
for min_call_q, color in zip(call_thresholds, 'bgrcmyk'):
# first filter the genotype calls
callset_cflt = filter_calls(callset, '%s >= %s' % (call_field, min_call_q), verbose=False)
# now filter variants where the parent call is missing or filtered (so cannot determine Mendelian status)
pconf = lambda v, c, c2d: np.all(c2d.genotype[:, :2] >= 0, axis=1) & np.any(c2d.genotype > 0, axis=1)
callset_pconf = filter_variants(callset_cflt, pconf, verbose=False)
# setup output arrays
x = list()
y = list()
z = list()
# iterate over variant thresholds
for min_variant_q in variant_thresholds:
# filter the variants
vflt = lambda v, c, c2d: v[variant_field] >= min_variant_q
callset_vflt = filter_variants(callset_pconf, vflt, verbose=False)
# unpack the callset
v, c, c2d = unpack_callset(callset_vflt)
# compute inheritance
p = inheritance(c2d.genotype)
# count calls
n_variants, n_samples = p.shape
n_calls = np.count_nonzero(p != INHERITANCE_MISSING)
n_calls_err = np.count_nonzero(p == INHERITANCE_NONPARENTAL)
# use number of non-missing calls as denominator
pc_calls_err = n_calls_err * 100 / n_calls
n_variants_err = np.count_nonzero(np.any(p == INHERITANCE_NONPARENTAL, axis=1))
pc_variants_err = n_variants_err * 100 / n_variants
x.append(n_variants)
y.append(pc_variants_err)
z.append(pc_calls_err)
# plot line
ax.plot(x, z, marker='o', linestyle='-', lw=1, color=color, label='(%s >= %s)' % (call_field, min_call_q))
# annotate points
for _x, _z, min_variant_q in zip(x, z, variant_thresholds):
if _z < .2:
ax.annotate(str(min_variant_q), xy=(_x, _z), xytext=(0, 0), xycoords='data', textcoords='offset points',
ha='center', va='center', fontsize=5,
bbox=dict(facecolor='white', alpha=1, edgecolor=color, linewidth=.5),
arrowprops=dict(color='gray', arrowstyle='-', shrinkA=0, shrinkB=0))
ax.set_xlabel('number of segregating variants')
ax.set_ylabel('% non-mendelian calls')
ax.grid(axis='y')
In [9]:
fig = plt.figure(figsize=(7, 7))
snp_vqslod_thresholds = .5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 5, 6, 7, 8
indel_vqslod_thresholds = 0, .5, 1, 1.3, 1.5, 1.8, 2, 2.5, 3, 3.5
gq_thresholds = 0, 99
for i, cross in enumerate(CROSSES):
callset = gatk_snp[cross]
ax = fig.add_subplot(3, 2, 2*i+1)
sns.despine(ax=ax)
plot_me_roc(ax, callset,
variant_thresholds=snp_vqslod_thresholds, call_thresholds=gq_thresholds,
variant_field='VQSLOD', call_field='GQ')
ax.set_title('%s SNPs' % LABELS[cross])
ax.set_xlim(0, 16000)
ax.set_xticks(range(0, 16000, 5000))
ax.set_ylim(0, .2)
if i < 2:
ax.set_xlabel('')
if i == 0:
ax.legend(loc='upper left', frameon=True)
# ax.legend(fontsize=8)
callset = gatk_indel[cross]
ax = fig.add_subplot(3, 2, 2*i+2)
sns.despine(ax=ax)
plot_me_roc(ax, callset,
variant_thresholds=indel_vqslod_thresholds, call_thresholds=gq_thresholds,
variant_field='VQSLOD', call_field='GQ')
ax.set_title('%s indels' % LABELS[cross])
ax.set_xlim(0, 30000)
ax.set_xticks(range(0, 30000, 5000))
ax.set_ylim(0, .2)
if i < 2:
ax.set_xlabel('')
ax.set_ylabel('')
# ax.legend(fontsize=8)
fig.suptitle('Alignment-based calling method (BWA/GATK)', fontsize=10, fontweight='bold')
#fig.tight_layout()
fig.subplots_adjust(hspace=.4, wspace=.3, top=.91, left=.1, right=.98, bottom=.1)
fn = '../../artwork/supp/callset_calibration_gatk.{dpi}.{fmt}'
for fmt in 'jpeg', 'png':
for dpi in 120, 300:
fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100)
In [10]:
fig = plt.figure(figsize=(7, 7))
site_conf_thresholds = 0, 20, 50, 100, 200, 500, 800
gt_conf_thresholds = 20, 50
for i, cross in enumerate(CROSSES):
callset = cortex_snp[cross]
ax = fig.add_subplot(3, 2, 2*i+1)
sns.despine(ax=ax)
plot_me_roc(ax, callset,
variant_thresholds=site_conf_thresholds, call_thresholds=gt_conf_thresholds,
variant_field='SITE_CONF', call_field='GT_CONF')
ax.set_title('%s SNPs' % LABELS[cross])
ax.set_xlim(10000, 14000)
ax.set_xticks(range(10000, 14000, 1000))
ax.set_ylim(0, .2)
if i == 0:
ax.legend(loc='upper left', frameon=True)
if i < 2:
ax.set_xlabel('')
# ax.legend(fontsize=8)
callset = cortex_indel[cross]
ax = fig.add_subplot(3, 2, 2*i+2)
sns.despine(ax=ax)
plot_me_roc(ax, callset,
variant_thresholds=site_conf_thresholds, call_thresholds=gt_conf_thresholds,
variant_field='SITE_CONF', call_field='GT_CONF')
ax.set_title('%s indels' % LABELS[cross])
ax.set_xlim(5000, 18000)
ax.set_xticks(range(5000, 18000, 5000))
ax.set_ylim(0, .2)
if i < 2:
ax.set_xlabel('')
ax.set_ylabel('')
# ax.legend(fontsize=8)
fig.suptitle('Assembly-based calling method (Cortex)', fontsize=10, fontweight='bold')
#fig.tight_layout()
fig.subplots_adjust(hspace=.4, wspace=.3, top=.91, left=.1, right=.98, bottom=.1)
fn = '../../artwork/supp/callset_calibration_cortex.{dpi}.{fmt}'
for fmt in 'jpeg', 'png':
for dpi in 120, 300:
fig.savefig(fn.format(dpi=dpi, fmt=fmt), dpi=dpi, jpeg_quality=100)
In [ ]: