In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10_eLife_full'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_atac', fp)
In [2]:
fp_dm_pe = pf('atac_dm', 'tg_pe.bwa_pe.p10_keep.macs2_pe_lt200.concave75_idr', '_0.001.bed', 'atac716_dm')
fp_dm_se = pf('atac_dm', 'tg_pe.bwa_pe.p10_keep.macs2_se_extsize150_shiftm75_keepdup_all.concave_idr', '_0.001.bed', 'atac716_dm')
fp_am_se = pf('atac_am1', 'tg_se.bwa_se.q10_keep.macs2_se_extsize150_shiftm75_keepdup_all.concave150_idr', '_0.001.bed', 'atac716_am')
!wc -l {fp_dm_pe}
!wc -l {fp_dm_se}
!wc -l {fp_am_se}
df_dm_pe = pd.read_csv(fp_dm_pe, sep='\t', comment='#', names=yp.NAMES_BED6)
df_dm_se = pd.read_csv(fp_dm_se, sep='\t', comment='#', names=yp.NAMES_BED6)
df_am_se = pd.read_csv(fp_am_se, sep='\t', comment='#', names=yp.NAMES_BED6)
fp_dm_pe_bw = pf('atac808_wt', 'tg_pe.bwa_pe.rm_unmapped_pe.rm_chrM.rm_blacklist.rm_q10.macs2_pe_lt200.mean_by_series', '_treat_pileup.bw', 'atac808')
fp_dm_se_bw = pf('atac808_wt', 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.macs2_se_extsize150_shiftm75_keepdup_all.mean_by_series', '_treat_pileup.bw', 'atac808')
fp_am_se_bw = pf('atac808_glp1', 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.macs2_se_extsize150_shiftm75_keepdup_all.mean_by_series', '_treat_pileup.bw', 'atac808')
assert os.path.isfile(fp_dm_pe_bw)
assert os.path.isfile(fp_dm_se_bw)
assert os.path.isfile(fp_am_se_bw)
def com_(a_, offset):
a = np.nan_to_num(a_)
a_com = scipy.ndimage.measurements.center_of_mass(a)[0]
if a_com != a_com: a_com = a.shape[0] / 2
return int(a_com + offset)
df_dm_pe['atac_peak_accessibility'] = list(map(com_, yp.read_regions(fp_dm_pe_bw, df_dm_pe.chrom.tolist(), df_dm_pe.start.tolist(), df_dm_pe.end.tolist()), df_dm_pe.start.tolist()))
df_dm_se['atac_peak_accessibility'] = list(map(com_, yp.read_regions(fp_dm_se_bw, df_dm_se.chrom.tolist(), df_dm_se.start.tolist(), df_dm_se.end.tolist()), df_dm_se.start.tolist()))
df_am_se['atac_peak_accessibility'] = list(map(com_, yp.read_regions(fp_am_se_bw, df_am_se.chrom.tolist(), df_am_se.start.tolist(), df_am_se.end.tolist()), df_am_se.start.tolist()))
#df_dm_pe['atac_peak_accessibility'] = summit_pos(df_dm_pe)
#df_dm_se['atac_peak_accessibility'] = summit_pos(df_dm_se)
#df_am_se['atac_peak_accessibility'] = summit_pos(df_am_se)
#def summit_(a_, offset):
# return int(yp.nanargmax_median(np.nan_to_num(a_)) + offset)
#df_dm_pe['atac_peak_accessibility'] = list(map(summit_, yp.read_regions(fp_dm_pe_bw, df_dm_pe.chrom.tolist(), df_dm_pe.start.tolist(), df_dm_pe.end.tolist()), df_dm_pe.start.tolist()))
#df_dm_se['atac_peak_accessibility'] = list(map(summit_, yp.read_regions(fp_dm_se_bw, df_dm_se.chrom.tolist(), df_dm_se.start.tolist(), df_dm_se.end.tolist()), df_dm_se.start.tolist()))
#df_am_se['atac_peak_accessibility'] = list(map(summit_, yp.read_regions(fp_am_se_bw, df_am_se.chrom.tolist(), df_am_se.start.tolist(), df_am_se.end.tolist()), df_am_se.start.tolist()))
df_dm_pe['atac_source'] = 'atac_wt_pe'
df_dm_se['atac_source'] = 'atac_wt_se'
df_am_se['atac_source'] = 'atac_glp1_se'
del df_dm_pe['strand']
del df_dm_se['strand']
del df_am_se['strand']
In [3]:
df_dm_pe.to_csv(mp('atac_wt_pe.bed'), sep='\t', header=False, index=False)
df_dm_se.to_csv(mp('atac_wt_se.bed'), sep='\t', header=False, index=False)
df_am_se.to_csv(mp('atac_glp1_se.bed'), sep='\t', header=False, index=False)
!wc -l {mp('atac_wt_pe.bed')}
!wc -l {mp('atac_wt_se.bed')}
!wc -l {mp('atac_glp1_se.bed')}
In [4]:
# All peaks in wt_pe
!cp {mp('atac_wt_pe.bed')} {mp('atac_wt_glp1.bed')}
!wc -l {mp('atac_wt_glp1.bed')}
# Add wt_se peaks not in wt_pe
!bedtools subtract -A -a {mp('atac_wt_se.bed')} -b {mp('atac_wt_pe.bed')} >> {mp('atac_wt_glp1.bed')}
!sort -k 1,1 -k 2,2n -k 3,3n -o {mp('atac_wt_glp1.bed')} {mp('atac_wt_glp1.bed')}
!wc -l {mp('atac_wt_glp1.bed')}
# Add glp1_se peaks not in wt_pe+wt_se
!bedtools subtract -A -a {mp('atac_glp1_se.bed')} -b {mp('atac_wt_glp1.bed')} >> {mp('atac_wt_glp1.bed')}
!sort -k 1,1 -k 2,2n -k 3,3n -o {mp('atac_wt_glp1.bed')} {mp('atac_wt_glp1.bed')}
!wc -l {mp('atac_wt_glp1.bed')}
!sed -i '1i#track gffTags=on' {mp('atac_wt_glp1.bed')}
!wc -l {mp('atac_wt_glp1.bed')}
In [5]:
# Write .tsv file with all the columns
df_atac = pd.read_csv(mp('atac_wt_glp1.bed'),
names=['chrom', 'start', 'end', 'name', 'score', 'atac_peak_accessibility', 'atac_source'], sep='\t', comment='#')
df_atac['start'] = df_atac['atac_peak_accessibility'] - 75
df_atac['end'] = df_atac['atac_peak_accessibility'] + 75 + 1
In [6]:
# Peak heights for every hypersensitive site
for stage in itertools.islice(config['stages'], None):
fp_ = 'atac814_geo/tracks/atac_%s.bw' % (stage,)
print(stage, os.path.isfile(fp_))
df_atac['atac_%(stage)s_height' % locals()] = [*map(np.nanmax,
yp.read_regions(fp_, df_atac.chrom.tolist(), df_atac.start.tolist(), df_atac.end.tolist()))]
In [7]:
df_attr_ = pd.concat([
df_atac[['atac_%s_height' % stage_ for stage_ in config['stages']]].applymap(lambda x: '%.2f' % x),
df_atac['atac_source']
], axis=1)
fp_ = os.path.join(annot_, 'accessible_sites.bed')
write_gffbed(fp_,
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
attr = df_attr_,
itemRgb = '0,0,178',
)
!wc -l {fp_}
In [8]:
# Write output table
fp_ = os.path.join(annot_, 'accessible_sites.tsv')
df_ = df_atac[yp.NAMES_BED3 + ['atac_%s_height' % stage_ for stage_ in config['stages']] + ['atac_source']]
df_.to_csv(fp_, sep='\t', index=False, float_format='%.2f')
!wc -l {fp_}