In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10_tissues'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_atac', fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
"""
step = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.macs2_se_extsize150_shiftm75_keepdup_all'
suffix = '_treat_pileup.bw'
prefix = 'atac824'

!scripts/yapc/yapc annot_ce10_tissues/metrics_atac/atac_tissues \
    hypod {pf('atac824_hypod_rep1', step, suffix, prefix)} {pf('atac824_hypod_rep2', step, suffix, prefix)} \
    neurons {pf('atac824_neurons_rep1', step, suffix, prefix)} {pf('atac824_neurons_rep2', step, suffix, prefix)} \
    gonad {pf('atac824_gonad_rep1', step, suffix, prefix)} {pf('atac824_gonad_rep2', step, suffix, prefix)} \
    muscle {pf('atac824_muscle_rep1', step, suffix, prefix)} {pf('atac824_muscle_rep2', step, suffix, prefix)} \
    intest {pf('atac824_intest_rep1', step, suffix, prefix)} {pf('atac824_intest_rep2', step, suffix, prefix)} \
    --smoothing-window-width 150 --fixed-peak-halfwidth 75
"""


Out[2]:
"\nstep = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.macs2_se_extsize150_shiftm75_keepdup_all'\nsuffix = '_treat_pileup.bw'\nprefix = 'atac824'\n\n!scripts/yapc/yapc annot_ce10_tissues/metrics_atac/atac_tissues     hypod {pf('atac824_hypod_rep1', step, suffix, prefix)} {pf('atac824_hypod_rep2', step, suffix, prefix)}     neurons {pf('atac824_neurons_rep1', step, suffix, prefix)} {pf('atac824_neurons_rep2', step, suffix, prefix)}     gonad {pf('atac824_gonad_rep1', step, suffix, prefix)} {pf('atac824_gonad_rep2', step, suffix, prefix)}     muscle {pf('atac824_muscle_rep1', step, suffix, prefix)} {pf('atac824_muscle_rep2', step, suffix, prefix)}     intest {pf('atac824_intest_rep1', step, suffix, prefix)} {pf('atac824_intest_rep2', step, suffix, prefix)}     --smoothing-window-width 150 --fixed-peak-halfwidth 75\n"

In [3]:
df_wt_glp1 = pd.read_csv('annot_ce10/accessible_sites.tsv', sep='\t')[['chrom', 'start', 'end', 'atac_source']]
print('%d peaks from wt/glp1-mapping' % (len(df_wt_glp1),))
df_wt_glp1.head()


42245 peaks from wt/glp1-mapping
Out[3]:
chrom start end atac_source
0 chrI 1900 2051 atac_wt_se
1 chrI 3826 3977 atac_wt_pe
2 chrI 4276 4427 atac_wt_se
3 chrI 11272 11423 atac_wt_pe
4 chrI 13070 13221 atac_wt_pe

In [4]:
df_tissues = read_gffbed('annot_ce10_tissues/metrics_atac/atac_tissues_0.001.bed')[yp.NAMES_BED3]
df_tissues['atac_source'] = 'atac_tissues'
print('%d peaks from tissue-specific data' % (len(df_tissues,)))
df_tissues.head()


19556 peaks from tissue-specific data
Out[4]:
chrom start end atac_source
0 chrI 327 478 atac_tissues
1 chrI 1888 2039 atac_tissues
2 chrI 3826 3977 atac_tissues
3 chrI 11188 11339 atac_tissues
4 chrI 15368 15519 atac_tissues

In [5]:
bt_tissues_only = BedTool.from_dataframe(df_tissues).subtract(b=BedTool.from_dataframe(df_wt_glp1), A=True)
df_tissues_only = pd.read_csv(bt_tissues_only.fn, sep='\t', names=df_wt_glp1.columns)
print('%d peaks added from tissue-specific data' % (len(df_tissues_only),))
df_tissues_only.sample(20)#head()


629 peaks added from tissue-specific data
Out[5]:
chrom start end atac_source
543 chrX 6590107 6590258 atac_tissues
339 chrIV 11650201 11650352 atac_tissues
53 chrI 9166828 9166979 atac_tissues
357 chrIV 16399353 16399504 atac_tissues
555 chrX 8025532 8025683 atac_tissues
455 chrV 14097824 14097975 atac_tissues
22 chrI 4116213 4116364 atac_tissues
139 chrII 7717461 7717612 atac_tissues
327 chrIV 9740759 9740910 atac_tissues
491 chrX 1384026 1384177 atac_tissues
229 chrIII 7805232 7805383 atac_tissues
343 chrIV 12213035 12213186 atac_tissues
182 chrIII 110 261 atac_tissues
88 chrII 158687 158838 atac_tissues
104 chrII 1153771 1153922 atac_tissues
64 chrI 10401044 10401195 atac_tissues
333 chrIV 10659047 10659198 atac_tissues
535 chrX 5593920 5594071 atac_tissues
561 chrX 8755906 8756057 atac_tissues
219 chrIII 5564464 5564615 atac_tissues

In [6]:
fp_ = os.path.join(annot_, 'accessible_sites.tsv')
df_merged = pd.concat([df_wt_glp1, df_tissues_only], axis=0, ignore_index=True).sort_values(yp.NAMES_BED3).reset_index(drop=True)
print('%d peaks in final set' % (len(df_merged),))
df_merged.head()
df_merged[yp.NAMES_BED3 + ['atac_source']].to_csv(fp_, sep='\t', index=False, float_format='%.2f')
!wc -l {fp_}


42874 peaks in final set
42875 annot_ce10_tissues/accessible_sites.tsv