In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb
In [2]:
def mp(fp): return os.path.join('annot_cb', 'metrics_maxgap', fp)
l_cond = [* config['annot_cb']['lcap_samples'].keys() ]
fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_exon_fwd = pd.read_csv('annot_cb/metrics_exon/closest_exon_fwd.tsv', sep='\t')
df_exon_rev = pd.read_csv('annot_cb/metrics_exon/closest_exon_rev.tsv', sep='\t')
step_fwd = config['annot_cb']['lcap_filled_fwd'] + '.mean_by_stage'
step_rev = config['annot_cb']['lcap_filled_rev'] + '.mean_by_stage'
In [3]:
print(len(df_exon_fwd), len(df_exon_fwd), 'fwd/rev closest exon records')
In [4]:
# maximum gap between hypersensitive site and downstream first exons
def maxgap(a): return max(len(list(g)) if (k == 0) or (k != k) else 0 for k,g in itertools.groupby(a))
def nanmaxgap(a): return maxgap([e if e == e else 0 for e in a])
def maxgap_fwd_(fp_inp, chroms, atac_modes, exon1_starts, flank_len=250):
l = []
fh = pyBigWig.open(fp_inp)
for (chrom, atac_mode, exon1_start) in itertools.islice(zip(chroms, atac_modes, exon1_starts), None):
# Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
start_ = max(atac_mode + flank_len, 0)
end_ = min(exon1_start + 1, fh.chroms(chrom))
if start_ < end_:
l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
else:
l.append(-1)
fh.close()
return l
def maxgap_rev_(fp_inp, chroms, atac_modes, exon1_ends, flank_len=250):
l = []
fh = pyBigWig.open(fp_inp)
for (chrom, atac_mode, exon1_end) in itertools.islice(zip(chroms, atac_modes, exon1_ends), None):
# Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
start_ = max(exon1_end - 1, 0)
end_ = min(atac_mode - flank_len + 1, fh.chroms(chrom))
if start_ < end_:
l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
else:
l.append(-1)
fh.close()
return l
def maxgap_fwd(stage, step=step_fwd):
fp_fwd = pf('annot_cb_lcap_%s' % (stage,), step, '.bw', 'lcap')
l_maxgap = maxgap_fwd_(fp_fwd, df_atac['chrom'], l_atac_peak_pos, df_exon_fwd['pass1_exon1_start'])
return pd.DataFrame({'maxgap_%s_fwd' % (stage,): l_maxgap})
def maxgap_rev(stage, step=step_rev):
fp_rev = pf('annot_cb_lcap_%s' % (stage,), step, '.bw', 'lcap')
l_maxgap = maxgap_rev_(fp_rev, df_atac['chrom'], l_atac_peak_pos, df_exon_rev['pass1_exon1_end'])
return pd.DataFrame({'maxgap_%s_rev' % (stage,): l_maxgap})
df_maxgap_fwd = pd.concat(pmap(maxgap_fwd, l_cond, n_jobs=15), axis=1)
df_maxgap_rev = pd.concat(pmap(maxgap_rev, l_cond, n_jobs=15), axis=1)
In [5]:
# metrics_maxgap/ write .tsv-files of full maxgap assignments
df_maxgap_fwd.to_csv(mp('maxgap_fwd.tsv'), header=True, index=False, sep='\t')
df_maxgap_rev.to_csv(mp('maxgap_rev.tsv'), header=True, index=False, sep='\t')
In [6]:
# Visualise maxgap tests as .bed-files
def itemRgb_(maxgap_):
if maxgap_ == 0:
return RED
else:
return BLUE
write_gffbed(mp('maxgap_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_maxgap_fwd.min(axis=1),
strand = '+',
attr = df_maxgap_fwd,
itemRgb = map(itemRgb_, df_maxgap_fwd.min(axis=1)),
)
write_gffbed(mp('maxgap_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
name = df_maxgap_rev.min(axis=1),
strand = '-',
attr = df_maxgap_rev,
itemRgb = map(itemRgb_, df_maxgap_rev.min(axis=1)),
)
!wc -l {mp('maxgap_fwd.bed')}
!wc -l {mp('maxgap_rev.bed')}