In [1]:
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb

#annot_ = 'annot_ce10'
#annot_ = 'annot_mapq0_ce10'
#annot_ = 'annot_ce10_eLife_full'
annot_ = 'annot_ce10_tissues'
print('annotation target: %s' % (annot_,))

def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_maxgap', fp)

if annot_ == 'annot_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t')
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t')    
    step_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_fwd.mean_by_stage'
    step_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_rev.mean_by_stage'

elif annot_ == 'annot_ce10_eLife_full':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t')
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t')    
    step_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_fwd.mean_by_stage'
    step_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_rev.mean_by_stage'

elif annot_ == 'annot_mapq0_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    # Bootsrap from annot_ce10
    df_exon_fwd = pd.read_csv(os.path.join('annot_ce10', 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t')
    df_exon_rev = pd.read_csv(os.path.join('annot_ce10', 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t')    
    step_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.filled_fwd.mean_by_stage'
    step_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.filled_rev.mean_by_stage'

elif annot_ == 'annot_ce10_tissues':
    l_cond = config['stages'] + config['lcap823_tissues']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t')
    df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t')    
    step_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_fwd.mean_by_stage'
    step_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.filled_rev.mean_by_stage'

else:
    assert False


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping
annotation target: annot_ce10_tissues
/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (3,5,6,7,9,10,11,12,13,14,19,44,46,47,48,50,51,52,54,55,56,64,66,67,68,70,71,72,74,75,76) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [2]:
# Load long cap jump & exon data
df_atac.head()


Out[2]:
chrom start end atac_source
0 chrI 327 478 atac_tissues
1 chrI 1900 2051 atac_wt_se
2 chrI 3826 3977 atac_wt_pe
3 chrI 4276 4427 atac_wt_se
4 chrI 11272 11423 atac_wt_pe

In [3]:
print(len(df_exon_fwd), len(df_exon_fwd), 'fwd/rev closest exon records')


42874 42874 fwd/rev closest exon records

In [4]:
# maximum gap between hypersensitive site and downstream first exons
def maxgap(a): return max(len(list(g)) if (k == 0) or (k != k) else 0 for k,g in itertools.groupby(a))

def nanmaxgap(a): return maxgap([e if e == e else 0 for e in a])

def maxgap_fwd_(fp_inp, chroms, atac_modes, exon1_starts, flank_len=250):
    l = []
    fh = pyBigWig.open(fp_inp)
    for (chrom, atac_mode, exon1_start) in itertools.islice(zip(chroms, atac_modes, exon1_starts), None):
        # Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
        start_ = max(atac_mode + flank_len, 0)
        end_ = min(exon1_start + 1, fh.chroms(chrom))
        if start_ < end_:
            l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
        else:
            l.append(-1)
    fh.close()
    return l

def maxgap_rev_(fp_inp, chroms, atac_modes, exon1_ends, flank_len=250):
    l = []
    fh = pyBigWig.open(fp_inp)
    for (chrom, atac_mode, exon1_end) in itertools.islice(zip(chroms, atac_modes, exon1_ends), None):
        # Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
        start_ = max(exon1_end - 1, 0)
        end_ = min(atac_mode - flank_len + 1, fh.chroms(chrom))
        if start_ < end_:
            l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
        else:
            l.append(-1)
    fh.close()
    return l

def maxgap_fwd(stage, step=step_fwd):
    if stage in config['stages']:
        fp_fwd = pf('lcap808_%s' % (stage,), step, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_fwd = pf('lcap823_%s' % (stage,), step, '.bw', 'lcap823')
    else:
        assert False
    l_maxgap = maxgap_fwd_(fp_fwd, df_atac['chrom'], l_atac_peak_pos, df_exon_fwd['pass1_exon1_start'])
    return pd.DataFrame({'maxgap_%s_fwd' % (stage,): l_maxgap})

def maxgap_rev(stage, step=step_rev):
    if stage in config['stages']:
        fp_rev = pf('lcap808_%s' % (stage,), step, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_rev = pf('lcap823_%s' % (stage,), step, '.bw', 'lcap823')
    else:
        assert False
    l_maxgap = maxgap_rev_(fp_rev, df_atac['chrom'], l_atac_peak_pos, df_exon_rev['pass1_exon1_end'])
    return pd.DataFrame({'maxgap_%s_rev' % (stage,): l_maxgap})

df_maxgap_fwd = pd.concat(pmap(maxgap_fwd, l_cond, n_jobs=15), axis=1)
df_maxgap_rev = pd.concat(pmap(maxgap_rev, l_cond, n_jobs=15), axis=1)


[Parallel(n_jobs=15)]: Done  15 out of  15 | elapsed:  1.9min finished
[Parallel(n_jobs=15)]: Done  15 out of  15 | elapsed:  2.1min finished

In [5]:
# metrics_maxgap/ write .tsv-files of full maxgap assignments
df_maxgap_fwd.to_csv(mp('maxgap_fwd.tsv'), header=True, index=False, sep='\t')
df_maxgap_rev.to_csv(mp('maxgap_rev.tsv'), header=True, index=False, sep='\t')

In [6]:
# Visualise maxgap tests as .bed-files
def itemRgb_(maxgap_):
    if maxgap_ == 0:
        return yp.RED
    else:
        return yp.BLUE

write_gffbed(mp('maxgap_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_maxgap_fwd.min(axis=1),
    strand = '+',
    attr = df_maxgap_fwd,
    itemRgb = map(itemRgb_, df_maxgap_fwd.min(axis=1)),
)

write_gffbed(mp('maxgap_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_maxgap_rev.min(axis=1),
    strand = '-',
    attr = df_maxgap_rev,
    itemRgb = map(itemRgb_, df_maxgap_rev.min(axis=1)),
)

!wc -l {mp('maxgap_fwd.bed')}
!wc -l {mp('maxgap_rev.bed')}


42875 annot_ce10_tissues/metrics_maxgap/maxgap_fwd.bed
42875 annot_ce10_tissues/metrics_maxgap/maxgap_rev.bed

In [ ]: