In [1]:
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
#annot_ = 'annot_ce10'
#annot_ = 'annot_ce10_eLife_full'
#annot_ = 'annot_mapq0_ce10'
annot_ = 'annot_ce10_tissues'
print('annotation target: %s' % (annot_,))
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_lcap', fp)
if annot_ == 'annot_ce10':
l_cond = config['stages']
fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'
elif annot_ == 'annot_ce10_eLife_full':
l_cond = config['stages']
fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'
elif annot_ == 'annot_ce10_tissues':
l_cond = config['stages'] + config['lcap823_tissues']
fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'
elif annot_ == 'annot_mapq0_ce10':
l_cond = config['stages']
fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv')
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_fwd'
step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_rev'
else:
assert False
In [2]:
# Jump test
# 2-condition, 2-replicate one-sided test
def deseq2x2(df_counts):
fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
df_counts.to_csv(fp_inp, sep='\t')
!cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
#!wc -l {fp_inp}
#!wc -l {fp_out}
#!tail -n 20 {fp_out}
df_out = pd.read_csv(fp_out, sep='\s+')
assert len(df_counts) == len(df_out)
!rm {fp_inp}
!rm {fp_out}
return df_out
# Count reads & call DESeq
def jump_test(fp_rep1, fp_rep2, chrom, u_start, u_end, d_start, d_end, f):
df_counts = pd.DataFrame()
df_counts['ucount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, u_start, u_end)))
df_counts['ucount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, u_start, u_end)))
df_counts['dcount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, d_start, d_end)))
df_counts['dcount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, d_start, d_end)))
# Sample pseudoreplicates
#np.random.seed(0)
#df_counts['ucount_sum'] = df_counts[ ['ucount_rep1', 'ucount_rep2'] ].sum(axis=1)
#df_counts['dcount_sum'] = df_counts[ ['dcount_rep1', 'dcount_rep2'] ].sum(axis=1)
#df_counts['ucount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['ucount_sum']))
#df_counts['dcount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['dcount_sum']))
#df_counts['ucount_rep2'] = df_counts['ucount_sum'] - df_counts['ucount_rep1']
#df_counts['dcount_rep2'] = df_counts['dcount_sum'] - df_counts['dcount_rep1']
df_deseq = deseq2x2(df_counts[['ucount_rep1', 'ucount_rep2', 'dcount_rep1', 'dcount_rep2']])
return pd.concat([df_counts, df_deseq], axis=1)
def itemRgb_(passed):
if passed:
return yp.RED
else:
return yp.BLUE
def jump_test_fwd(stage, step_startbp_fwd=step_startbp_fwd):
if stage in config['stages']:
fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap808')
fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap808')
elif stage in config['lcap823_tissues']:
fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap823')
fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap823')
else:
assert False
chrom = df_atac['chrom'].tolist()
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()
df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
#df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
& (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
& ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
df_jump.columns = ['lcap_%s_fwd_%s' % (stage, col) for col in df_jump.columns]
write_gffbed(mp('lcap_%s_fwd.bed' % (stage,)),
chrom = chrom,
start = u_start,
thickStart = np.array(u_end) - 1,
thickEnd = np.array(d_start) + 1,
end = d_end,
attr = df_jump,
itemRgb = list(map(itemRgb_, df_jump['lcap_%s_fwd_passed' % (stage,)]
)))
return df_jump
df_jump_fwd = pd.concat(pmap(jump_test_fwd, l_cond, n_jobs=20), axis=1)
df_jump_fwd.to_csv(mp('lcap_all_fwd.tsv'), sep='\t', index=False)
In [3]:
def jump_test_rev(stage, step_startbp_rev=step_startbp_rev):
if stage in config['stages']:
fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap808')
fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap808')
elif stage in config['lcap823_tissues']:
fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap823')
fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap823')
else:
assert False
chrom = df_atac['chrom'].tolist()
d_start = (l_atac_peak_pos - 250).tolist()
d_end = (l_atac_peak_pos - 75 + 1).tolist()
u_start = (l_atac_peak_pos + 75).tolist()
u_end = (l_atac_peak_pos + 250 + 1).tolist()
df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
#df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
& (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
& ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
df_jump.columns = ['lcap_%s_rev_%s' % (stage, col) for col in df_jump.columns]
write_gffbed(mp('lcap_%s_rev.bed' % (stage,)),
chrom = chrom,
start = d_start,
thickStart = np.array(d_end) - 1,
thickEnd = np.array(u_start) + 1,
end = u_end,
attr = df_jump,
itemRgb = list(map(itemRgb_, df_jump['lcap_%s_rev_passed' % (stage,)]
)))
return df_jump
df_jump_rev = pd.concat(pmap(jump_test_rev, l_cond, n_jobs=20), axis=1)
df_jump_rev.to_csv(mp('lcap_all_rev.tsv'), sep='\t', index=False)
In [4]:
pd.concat([df_jump_fwd['lcap_%s_fwd_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)
Out[4]:
In [5]:
pd.concat([df_jump_rev['lcap_%s_rev_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)
Out[5]:
In [6]:
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()
l_passed_any = df_jump_fwd[['lcap_%s_fwd_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_fwd.bed'),
chrom = df_atac['chrom'],
start = u_start,
thickStart = np.array(u_end) - 1,
thickEnd = np.array(d_start) + 1,
end = d_end,
attr = df_jump_fwd[['lcap_%s_fwd_summary' % (stage,) for stage in l_cond]],
itemRgb = list(map(itemRgb_, l_passed_any)),
)
l_passed_any = df_jump_rev[['lcap_%s_rev_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_rev.bed'),
chrom = df_atac['chrom'],
start = u_start,
thickStart = np.array(u_end) - 1,
thickEnd = np.array(d_start) + 1,
end = d_end,
attr = df_jump_rev[['lcap_%s_rev_summary' % (stage,) for stage in l_cond]],
itemRgb = list(map(itemRgb_, l_passed_any)),
)