In [1]:
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
#annot_ = 'annot_mapq0_ce10'
annot_ = 'annot_eLife_revised'
print('annotation target: %s' % (annot_,))
def mp(fp, annot_=annot_):
if annot_ == 'annot_eLife_revised':
return os.path.join(annot_, '_annot', 'metrics_scap', fp)
return os.path.join(annot_, 'metrics_scap', fp)
if annot_ == 'annot_eLife_revised':
fp_atac = os.path.join(annot_, 'Figure 1-source data 1. Accessible sites.txt')
step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.firstbp_fwd.gt0x2'
step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_q10.firstbp_rev.gt0x2'
elif annot_ == 'annot_ce10_tissues':
fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.rm_q10.firstbp_fwd.gt0x2'
step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.rm_q10.firstbp_rev.gt0x2'
elif annot_ == 'annot_mapq0_ce10':
fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv') # bootstrap using q10-filtered peaks
step_fwd = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.firstbp_fwd.gt0x2'
step_rev = 'tg_se.bwa_se.rm_unmapped.rm_chrM.rm_blacklist.rm_non_coding.firstbp_rev.gt0x2'
else:
assert False
In [2]:
# Load long cap jump & exon data
df_atac = df_atac_()
print(len(df_atac), 'ATAC-seq sites')
In [3]:
# scap_count <- <scap tags within region>; scap_mode_raw <- <scap mode position>; scap_mode_count <- <tags at mode>
fp_fwd = pf('scap815_wt_all', step_fwd, '.bw', 'scap815')
fp_rev = pf('scap815_wt_all', step_rev, '.bw', 'scap815')
flank_len = 125
chroms_ = list(df_atac['chrom'])
starts_ = list(df_atac['mode'] - flank_len)
ends_ = list(df_atac['mode'] + flank_len)
df_scap_fwd = pd.DataFrame()
df_scap_rev = pd.DataFrame()
df_scap_fwd['scap_count'] = [*map(lambda c: int(np.nansum(c)), yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_count'] = [*map(lambda c: int(np.nansum(c)), yp.read_regions(fp_rev, chroms_, starts_, ends_))]
def scap_mode_count_(c): return int(np.nan_to_num(np.nanmax(c)))
df_scap_fwd['scap_mode_count'] = [*map(scap_mode_count_, yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_mode_count'] = [*map(scap_mode_count_, yp.read_regions(fp_rev, chroms_, starts_, ends_))]
df_scap_fwd['scap_pass'] = df_scap_fwd['scap_mode_count'] >= 2
df_scap_rev['scap_pass'] = df_scap_rev['scap_mode_count'] >= 2
n = float(len(df_atac))
n_scap_fwd = len(df_scap_fwd.query('scap_mode_count >= 2'))
n_scap_rev = len(df_scap_rev.query('scap_mode_count >= 2'))
n_scap_bid = sum((df_scap_fwd['scap_mode_count'] >= 2) & (df_scap_rev['scap_mode_count'] >= 2))
n_scap_either = sum((df_scap_fwd['scap_mode_count'] >= 2) | (df_scap_rev['scap_mode_count'] >= 2))
print(n_scap_fwd, n_scap_rev, n_scap_bid, n_scap_either, 'fwd/rev/both/either in pooled scap')
print(n_scap_fwd / n, n_scap_rev / n, n_scap_bid / n, n_scap_either / n)
In [4]:
# "Extrapolate" likely regions of txn initiation based on peak accessibility
# On average, short cap modes are spaced by 120bp
# Extrapolate short cap modes by peak accessibility +/- 60 (120bp/2) if no short cap mode was found
# (These "extrapolated" short cap modes can be discarded by filtering for scap_mode_count > 0)
def scap_mode_fwd_(scap_mode_raw, scap_mode_count, atac_mode):
if scap_mode_count > 0:
return scap_mode_raw
else:
return atac_mode + 60
def scap_mode_rev_(scap_mode_raw, scap_mode_count, atac_mode):
if scap_mode_count > 0:
return scap_mode_raw
else:
return atac_mode - 60
def scap_mode_raw_(s, c): return int(s + yp.nanargmax_median(c))
df_scap_fwd['scap_mode_raw'] = [*map(scap_mode_raw_, starts_, yp.read_regions(fp_fwd, chroms_, starts_, ends_))]
df_scap_rev['scap_mode_raw'] = [*map(scap_mode_raw_, starts_, yp.read_regions(fp_rev, chroms_, starts_, ends_))]
df_scap_fwd['scap_mode'] = [*map(scap_mode_fwd_, df_scap_fwd['scap_mode_raw'], df_scap_fwd['scap_mode_count'], df_atac['mode'])]
df_scap_rev['scap_mode'] = [*map(scap_mode_rev_, df_scap_rev['scap_mode_raw'], df_scap_rev['scap_mode_count'], df_atac['mode'])]
m = (df_scap_fwd['scap_mode_count'] >= 2) & (df_scap_rev['scap_mode_count'] >= 2)
print(np.median(df_scap_fwd[m]['scap_mode_raw'] - df_scap_rev[m]['scap_mode_raw']), 'median mode >= 2 tags')
m = (df_scap_fwd['scap_mode_count'] >= 10) & (df_scap_rev['scap_mode_count'] >= 10)
print(np.median(df_scap_fwd[m]['scap_mode_raw'] - df_scap_rev[m]['scap_mode_raw']), 'median mode (>= 10 tags)')
print('%d / %d sites with extrapolated txn initiation' % (
len(df_scap_fwd.query('scap_mode_count == 0')),
len(df_scap_rev.query('scap_mode_count == 0'))
))
In [6]:
# scap_%strand.bed <- txn initiation statistics
def itemRgb_(flag): return yp.ORANGE if flag else yp.BLUE
l_scap_col = ['scap_count', 'scap_mode_count', 'scap_pass', 'scap_mode_raw', 'scap_mode']
write_gffbed(mp('scap_fwd.bed'),
chrom = df_atac['chrom'],
start = list(df_atac['mode'] - flank_len),
end = list(df_atac['mode'] + flank_len),
name = df_scap_fwd['scap_mode_count'],
strand = '+',
attr = df_scap_fwd[l_scap_col],
itemRgb = map(itemRgb_, df_scap_fwd['scap_pass']),
)
write_gffbed(mp('scap_rev.bed'),
chrom = df_atac['chrom'],
start = list(df_atac['mode'] - flank_len),
end = list(df_atac['mode'] + flank_len),
name = df_scap_rev['scap_mode_count'],
strand = '-',
attr = df_scap_rev[l_scap_col],
itemRgb = map(itemRgb_, df_scap_rev['scap_pass']),
)
df_scap_fwd[l_scap_col].to_csv(mp('scap_fwd.tsv'), header=True, index=False, sep='\t')
df_scap_rev[l_scap_col].to_csv(mp('scap_rev.tsv'), header=True, index=False, sep='\t')
In [7]:
!md5sum annot_eLife_full/metrics_scap/*
In [8]:
!md5sum annot_eLife_revised/_annot/metrics_scap/*