In [1]:
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb

#annot_ = 'annot_mapq0_ce10'
annot_ = 'annot_eLife_revised'
print('annotation target: %s' % (annot_,))

def mp(fp, annot_=annot_):
    if annot_ == 'annot_eLife_revised':
        return os.path.join(annot_, '_annot', 'metrics_scap', fp)
    return os.path.join(annot_, 'metrics_scap', fp)

if annot_ == 'annot_eLife_revised':
    fp_atac = os.path.join(annot_, 'Figure 1-source data 1. Accessible sites.txt')
    step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.firstbp_fwd.gt0x2'
    step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.firstbp_rev.gt0x2'

elif annot_ == 'annot_ce10_tissues':
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.rm_q10.firstbp_fwd.gt0x2'
    step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.rm_q10.firstbp_rev.gt0x2'

elif annot_ == 'annot_mapq0_ce10':
    fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv') # bootstrap using q10-filtered peaks
    step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.firstbp_fwd.gt0x2'
    step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.firstbp_rev.gt0x2'

else:
    assert False


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping
annotation target: annot_eLife_revised

In [2]:
# Load long cap jump & exon data
df_atac = df_atac_()
print(len(df_atac), 'ATAC-seq sites')


42245 ATAC-seq sites

In [3]:
# scap_count <- <scap tags within region>; scap_mode_raw <- <scap mode position>; scap_mode_count <- <tags at mode>
fp_fwd = pf('scap815_wt_all', step_fwd, '.bw', 'scap815')
fp_rev = pf('scap815_wt_all', step_rev, '.bw', 'scap815')

flank_len = 125
chroms_ = list(df_atac['chrom'])
starts_ = list(df_atac['mode'] - flank_len)
ends_ =  list(df_atac['mode'] + flank_len)

df_scap_fwd = pd.DataFrame()
df_scap_rev = pd.DataFrame()
df_scap_fwd['scap_count'] = [*map(lambda c: int(np.nansum(c)), yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_count'] = [*map(lambda c: int(np.nansum(c)), yp.read_regions(fp_rev, chroms_, starts_, ends_))]

def scap_mode_count_(c): return int(np.nan_to_num(np.nanmax(c)))
df_scap_fwd['scap_mode_count'] = [*map(scap_mode_count_, yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_mode_count'] = [*map(scap_mode_count_, yp.read_regions(fp_rev, chroms_, starts_, ends_))]

df_scap_fwd['scap_pass'] = df_scap_fwd['scap_mode_count'] >= 2
df_scap_rev['scap_pass'] = df_scap_rev['scap_mode_count'] >= 2

n = float(len(df_atac))
n_scap_fwd = len(df_scap_fwd.query('scap_mode_count >= 2'))
n_scap_rev = len(df_scap_rev.query('scap_mode_count >= 2'))
n_scap_bid = sum((df_scap_fwd['scap_mode_count'] >= 2) & (df_scap_rev['scap_mode_count'] >= 2))
n_scap_either = sum((df_scap_fwd['scap_mode_count'] >= 2) | (df_scap_rev['scap_mode_count'] >= 2))
print(n_scap_fwd, n_scap_rev, n_scap_bid, n_scap_either, 'fwd/rev/both/either in pooled scap')
print(n_scap_fwd / n, n_scap_rev / n, n_scap_bid / n, n_scap_either / n)


28948 28868 23059 34757 fwd/rev/both/either in pooled scap
0.6852408569061428 0.6833471416735708 0.545839744348 0.822748254231

In [4]:
# "Extrapolate" likely regions of txn initiation based on peak accessibility
# On average, short cap modes are spaced by 120bp
# Extrapolate short cap modes by peak accessibility +/- 60 (120bp/2) if no short cap mode was found
# (These "extrapolated" short cap modes can be discarded by filtering for scap_mode_count > 0)
def scap_mode_fwd_(scap_mode_raw, scap_mode_count, atac_mode):
    if scap_mode_count > 0:
        return scap_mode_raw
    else:
        return atac_mode + 60

def scap_mode_rev_(scap_mode_raw, scap_mode_count, atac_mode):
    if scap_mode_count > 0:
        return scap_mode_raw
    else:
        return atac_mode - 60

def scap_mode_raw_(s, c): return int(s + yp.nanargmax_median(c))
df_scap_fwd['scap_mode_raw'] = [*map(scap_mode_raw_, starts_, yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_mode_raw'] = [*map(scap_mode_raw_, starts_, yp.read_regions(fp_rev, chroms_, starts_, ends_))]

df_scap_fwd['scap_mode'] = [*map(scap_mode_fwd_, df_scap_fwd['scap_mode_raw'], df_scap_fwd['scap_mode_count'], df_atac['mode'])]
df_scap_rev['scap_mode'] = [*map(scap_mode_rev_, df_scap_rev['scap_mode_raw'], df_scap_rev['scap_mode_count'], df_atac['mode'])]

m = (df_scap_fwd['scap_mode_count'] >= 2) & (df_scap_rev['scap_mode_count'] >= 2)
print(np.median(df_scap_fwd[m]['scap_mode_raw'] - df_scap_rev[m]['scap_mode_raw']), 'median mode >= 2 tags')
m = (df_scap_fwd['scap_mode_count'] >= 10) & (df_scap_rev['scap_mode_count'] >= 10)
print(np.median(df_scap_fwd[m]['scap_mode_raw'] - df_scap_rev[m]['scap_mode_raw']), 'median mode (>= 10 tags)')

print('%d / %d sites with extrapolated txn initiation' % (
    len(df_scap_fwd.query('scap_mode_count == 0')), 
    len(df_scap_rev.query('scap_mode_count == 0'))
))


104.0 median mode >= 2 tags
113.0 median mode (>= 10 tags)
13297 / 13377 sites with extrapolated txn initiation

In [6]:
# scap_%strand.bed <- txn initiation statistics
def itemRgb_(flag): return yp.ORANGE if flag else yp.BLUE

l_scap_col = ['scap_count', 'scap_mode_count', 'scap_pass', 'scap_mode_raw', 'scap_mode']

write_gffbed(mp('scap_fwd.bed'),
    chrom = df_atac['chrom'],
    start = list(df_atac['mode'] - flank_len),
    end = list(df_atac['mode'] + flank_len),
    name = df_scap_fwd['scap_mode_count'],
    strand = '+',
    attr = df_scap_fwd[l_scap_col],
    itemRgb = map(itemRgb_, df_scap_fwd['scap_pass']),
)

write_gffbed(mp('scap_rev.bed'),
    chrom = df_atac['chrom'],
    start = list(df_atac['mode'] - flank_len),
    end = list(df_atac['mode'] + flank_len),
    name = df_scap_rev['scap_mode_count'],
    strand = '-',
    attr = df_scap_rev[l_scap_col],
    itemRgb = map(itemRgb_, df_scap_rev['scap_pass']),
)

df_scap_fwd[l_scap_col].to_csv(mp('scap_fwd.tsv'), header=True, index=False, sep='\t')
df_scap_rev[l_scap_col].to_csv(mp('scap_rev.tsv'), header=True, index=False, sep='\t')

In [7]:
!md5sum annot_eLife_full/metrics_scap/*


c6ee8ad6b7e931b41c8f77047c7dc2bf  annot_eLife_full/metrics_scap/scap_fwd.bed
e0d06f28ecf6834e7d52e96c0c28294f  annot_eLife_full/metrics_scap/scap_fwd.tsv
de6f51b4f8dd44f834d065dffea58635  annot_eLife_full/metrics_scap/scap_rev.bed
3f3e44843d29445218d8ee2335b6cde2  annot_eLife_full/metrics_scap/scap_rev.tsv

In [8]:
!md5sum annot_eLife_revised/_annot/metrics_scap/*


c6ee8ad6b7e931b41c8f77047c7dc2bf  annot_eLife_revised/_annot/metrics_scap/scap_fwd.bed
e0d06f28ecf6834e7d52e96c0c28294f  annot_eLife_revised/_annot/metrics_scap/scap_fwd.tsv
de6f51b4f8dd44f834d065dffea58635  annot_eLife_revised/_annot/metrics_scap/scap_rev.bed
3f3e44843d29445218d8ee2335b6cde2  annot_eLife_revised/_annot/metrics_scap/scap_rev.tsv