In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
def mp(fp): return os.path.join('annot_cb', 'metrics_maxgap', fp)

l_cond = [* config['annot_cb']['lcap_samples'].keys() ]
fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_exon_fwd = pd.read_csv('annot_cb/metrics_exon/closest_exon_fwd.tsv', sep='\t')
df_exon_rev = pd.read_csv('annot_cb/metrics_exon/closest_exon_rev.tsv', sep='\t')
step_fwd = config['annot_cb']['lcap_filled_fwd'] + '.mean_by_stage'
step_rev = config['annot_cb']['lcap_filled_rev'] + '.mean_by_stage'


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (3,5,6,7,9,10,11,12,13,14,18,42,44,45,46,48,49,50,52,53,54,61,63,64,65,67,68,69,71,72,73) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [3]:
print(len(df_exon_fwd), len(df_exon_fwd), 'fwd/rev closest exon records')


8545 8545 fwd/rev closest exon records

In [4]:
# maximum gap between hypersensitive site and downstream first exons
def maxgap(a): return max(len(list(g)) if (k == 0) or (k != k) else 0 for k,g in itertools.groupby(a))

def nanmaxgap(a): return maxgap([e if e == e else 0 for e in a])

def maxgap_fwd_(fp_inp, chroms, atac_modes, exon1_starts, flank_len=250):
    l = []
    fh = pyBigWig.open(fp_inp)
    for (chrom, atac_mode, exon1_start) in itertools.islice(zip(chroms, atac_modes, exon1_starts), None):
        # Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
        start_ = max(atac_mode + flank_len, 0)
        end_ = min(exon1_start + 1, fh.chroms(chrom))
        if start_ < end_:
            l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
        else:
            l.append(-1)
    fh.close()
    return l

def maxgap_rev_(fp_inp, chroms, atac_modes, exon1_ends, flank_len=250):
    l = []
    fh = pyBigWig.open(fp_inp)
    for (chrom, atac_mode, exon1_end) in itertools.islice(zip(chroms, atac_modes, exon1_ends), None):
        # Interval between atac-mode (+flank), and 5'-end of closest downstream exonl clipped by chromosome ends
        start_ = max(exon1_end - 1, 0)
        end_ = min(atac_mode - flank_len + 1, fh.chroms(chrom))
        if start_ < end_:
            l.append(nanmaxgap(np.array(fh.values(chrom, int(start_), int(end_)))))
        else:
            l.append(-1)
    fh.close()
    return l

def maxgap_fwd(stage, step=step_fwd):
    fp_fwd = pf('annot_cb_lcap_%s' % (stage,), step, '.bw', 'lcap')
    l_maxgap = maxgap_fwd_(fp_fwd, df_atac['chrom'], l_atac_peak_pos, df_exon_fwd['pass1_exon1_start'])
    return pd.DataFrame({'maxgap_%s_fwd' % (stage,): l_maxgap})

def maxgap_rev(stage, step=step_rev):
    fp_rev = pf('annot_cb_lcap_%s' % (stage,), step, '.bw', 'lcap')
    l_maxgap = maxgap_rev_(fp_rev, df_atac['chrom'], l_atac_peak_pos, df_exon_rev['pass1_exon1_end'])
    return pd.DataFrame({'maxgap_%s_rev' % (stage,): l_maxgap})

df_maxgap_fwd = pd.concat(pmap(maxgap_fwd, l_cond, n_jobs=15), axis=1)
df_maxgap_rev = pd.concat(pmap(maxgap_rev, l_cond, n_jobs=15), axis=1)


[Parallel(n_jobs=15)]: Done   1 out of   1 | elapsed:   11.2s finished
[Parallel(n_jobs=15)]: Done   1 out of   1 | elapsed:   11.3s finished

In [5]:
# metrics_maxgap/ write .tsv-files of full maxgap assignments
df_maxgap_fwd.to_csv(mp('maxgap_fwd.tsv'), header=True, index=False, sep='\t')
df_maxgap_rev.to_csv(mp('maxgap_rev.tsv'), header=True, index=False, sep='\t')

In [6]:
# Visualise maxgap tests as .bed-files
def itemRgb_(maxgap_):
    if maxgap_ == 0:
        return RED
    else:
        return BLUE

write_gffbed(mp('maxgap_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_maxgap_fwd.min(axis=1),
    strand = '+',
    attr = df_maxgap_fwd,
    itemRgb = map(itemRgb_, df_maxgap_fwd.min(axis=1)),
)

write_gffbed(mp('maxgap_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    name = df_maxgap_rev.min(axis=1),
    strand = '-',
    attr = df_maxgap_rev,
    itemRgb = map(itemRgb_, df_maxgap_rev.min(axis=1)),
)

!wc -l {mp('maxgap_fwd.bed')}
!wc -l {mp('maxgap_rev.bed')}


8546 annot_cb/metrics_maxgap/maxgap_fwd.bed
8546 annot_cb/metrics_maxgap/maxgap_rev.bed