In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb
annot_ = 'annot_cb'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_type', fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
l_cond = list(config['annot_cb']['lcap_samples'].keys())
df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites_cb.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)

df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')

df_prom_fwd = df_atac[['chrom', 'start', 'end']].copy()
df_prom_rev = df_atac[['chrom', 'start', 'end']].copy()
df_prom_fwd['strand'] = '+'
df_prom_rev['strand'] = '-'

In [3]:
# df_pass1_gene_%strand: WS260_ce10 gene coordinates, excluding 250bp downstream of TSS
df_ = pd.concat([
    read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.genes.pseudogene.gtf.gz', parse_attr=True, coords_adj=True),
    read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.genes.protein_coding.gtf.gz', parse_attr=True, coords_adj=True),
], axis=0)

df_pass1_gene_fwd = df_.query('strand == "+"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]
df_pass1_gene_rev = df_.query('strand == "-"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]

print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes')

# do not mask 250bp downstream of TSS
df_pass1_gene_fwd['start'] = df_pass1_gene_fwd['start'] + 250
df_pass1_gene_rev['end'] = df_pass1_gene_rev['end'] - 250

df_pass1_gene_fwd = df_pass1_gene_fwd.query('start < end')
df_pass1_gene_rev = df_pass1_gene_rev.query('start < end')
print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes longer than 250bp kept')


11005 10986 fwd/rev genes
10664 10650 fwd/rev genes longer than 250bp kept

In [4]:
# df_pass1_gpos_%strand[gene_chrom/start/end]: closest gene; if overlaps multiple genes, choose one with the more distal TSS
def df_pos():
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
    df_.columns = ['chrom', 'start', 'end']
    return df_

def df_closest_(df_a, df_b, b_prefix, **kwargs):
    df_a_sort = df_a
    df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
    fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
    names_ = list(itertools.chain(df_a.columns.values,
        ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
    ))
    df_ = pd.read_csv(fn_, sep='\t', names=names_)
    return df_#[names_[len(df_a.columns):]]

df_pass1_gpos_fwd_ = df_closest_(df_pos(), df_pass1_gene_fwd, 'pass1_gpos', t='all')
df_pass1_gpos_rev_ = df_closest_(df_pos(), df_pass1_gene_rev, 'pass1_gpos', t='all')
print(len(df_pass1_gpos_fwd_), len(df_pass1_gpos_rev_), 'fwd/rev gene body hits')


8554 8571 fwd/rev gene body hits

In [5]:
# df_pass1_gpos_%strand[is_nearly_intergenic]: site is outside of gene bodies, or up to 250bp from the TSS
df_pass1_gpos_fwd_['is_nearly_intergenic'] = ~((df_pass1_gpos_fwd_['pass1_gpos_start'] <= df_pass1_gpos_fwd_['start'])\
                                           & (df_pass1_gpos_fwd_['end'] <= df_pass1_gpos_fwd_['pass1_gpos_end']))
df_pass1_gpos_rev_['is_nearly_intergenic'] = ~((df_pass1_gpos_rev_['pass1_gpos_start'] <= df_pass1_gpos_rev_['start'])\
                                           & (df_pass1_gpos_rev_['end'] <= df_pass1_gpos_rev_['pass1_gpos_end']))

l_ = 'is_nearly_intergenic'
d_ = {'is_nearly_intergenic': np.any}
df_exon_fwd[l_] = df_pass1_gpos_fwd_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
df_exon_rev[l_] = df_pass1_gpos_rev_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
print(len(df_exon_fwd), len(df_exon_rev))
print(len(df_exon_fwd.query('is_nearly_intergenic')))
print(len(df_exon_rev.query('is_nearly_intergenic')))

# Visualise maxgap as .bed-files
def itemRgb_(is_nearly_intergenic_):
    if is_nearly_intergenic_:
        return ORANGE
    else:
        return BLUE

write_gffbed(mp('is_nearly_intergenic_fwd.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    #name = df_maxgap_fwd.min(axis=1),
    strand = '+',
    #attr = df_pass1_gpos_fwd_,
    itemRgb = map(itemRgb_, df_exon_fwd['is_nearly_intergenic']),
)

write_gffbed(mp('is_nearly_intergenic_rev.bed'),
    chrom = df_atac['chrom'],
    start = df_atac['start'],
    end = df_atac['end'],
    #name = df_maxgap_rev.min(axis=1),
    strand = '-',
    #attr = df_maxgap_rev,
    itemRgb = map(itemRgb_, df_exon_rev['is_nearly_intergenic']),
)


8545 8545
7233
7285

In [6]:
# df_prom_%strand[annot_detailed_%stage]: stage-specific detailed annotation, excluding lowconf
d_detailed_annot = collections.OrderedDict([
    ('coding_promoter_jump', 'coding_promoter'),
    ('coding_promoter_incr', 'coding_promoter'),
    ('coding_promoter_jump_incr', 'coding_promoter'),
    ('coding_promoter_lowconf', 'coding_promoter'),
    ('pseudogene_promoter_jump', 'pseudogene_promoter'),
    ('pseudogene_promoter_incr', 'pseudogene_promoter'),
    ('pseudogene_promoter_jump_incr', 'pseudogene_promoter'),
    ('pseudogene_promoter_lowconf', 'pseudogene_promoter'),
    ('non-coding_RNA', 'non-coding_RNA'),
    ('unknown_promoter_jump', 'unknown_promoter'),
    #('unknown_promoter_jump_incr', 'unknown_promoter'),
    #('transcription_initiation', 'transcription_initiation'),
    ('no_transcription', 'no_transcription'),
])

d_annot_strand_legend = collections.OrderedDict([
    ('coding_promoter', RED),
    ('pseudogene_promoter', ORANGE),
    ('non-coding_RNA', BLACK),
    ('unknown_promoter', YELLOW),
    #('transcription_initiation', GREEN),
    ('no_transcription', BLUE),
])

d_annot_legend = collections.OrderedDict([
    ('coding_promoter', RED),
    ('pseudogene_promoter', ORANGE),
    ('non-coding_RNA', BLACK),
    ('unknown_promoter', YELLOW),
    #('putative_enhancer', GREEN),
    ('other_element', BLUE),
])

def annot_detailed_(chrom, start, end, jump, incr, maxgap, exon1_dist, exon2_dist, pass2_exon1_dist, 
                    #scap_pass, scap_diff, 
                    gene_biotype, is_exon1_first, is_nearly_intergenic):
    """
    #is_gene_biotype = (gene_biotype == 'protein_coding') | (gene_biotype == 'pseudogene')
    #v = False
    #if chrom == "chrI" and start == 183421 and end == 183568: v = True
    #if (chrom == "chrX") and (start == 1125552) and (end == 1125703):
    #    v = True
    #    print(jump, maxgap <= 0, scap_diff >= 0, has_scap)
    #if start == 11829601-1: print(incr, jump, has_scap, n_txn_evidence)
    """

    near_exon1 = abs(exon1_dist) < 250
    near_exon2 = abs(exon2_dist) < 250
    near_pass2_exon1 = abs(pass2_exon1_dist) < 250
    #nearly_upstream_of_tss = exon1_dist > -250

    # We annotated accessible sites as coding_promoter or pseudogene_promoter if they fulfilled the following
    # four criteria (p1 to p4).
    # 1) The accessible site had transcription initiation, and passed at least one of the elongation tests 
    #    (jump or incr), or passed both elongation tests (jump and incr).
    #p1 = (int(scap_pass) + int(jump) + int(incr)) >= 2
    p1 = (int(jump) + int(incr)) >= 1
    # 2) Transcription initiation mode at the accessible site was either upstream of the closest first exon, 
    #    or, in the presence of a UTR, up to 250bp downstream within the UTR.
    #p2 = nearly_upstream_of_tss and (scap_diff >= 0)
    #p2 = (scap_diff >= 0)
    p2 = True # (NA as we don't have short cap)
    # 3) The region from peak accessibility to the closest first exon did not contain the 5' end of a non-first exon.
    p3 = is_exon1_first
    # 4) Distal sites (peak accessibility >250bp from the closest first exon) were additionally required to 
    # (a) have continuous long cap coverage from 250bp downstream of peak accessibility to the closest first exon, and 
    # (b) be further than 250bp away from any non-first exon.
    p4 = near_exon1 or ((maxgap <= 0) and not near_exon2)
    if p1 and p2 and p3 and p4:
        #if v: print('coding_promoter')
        if jump:#scap_pass and jump:
            return 'coding_promoter_jump' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump'
        elif incr:
            return 'coding_promoter_incr' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_incr'
        #elif jump and incr:
        #    return 'coding_promoter_jump_incr' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump_incr'
        else:
            assert False

    # Sites within 250bp of the 5' end of an annotated tRNA, snoRNA, miRNA, snRNA or rRNA were annotated as 
    # non-coding_RNA.
    elif near_pass2_exon1:
        #if v: print('non-coding_RNA')
        return 'non-coding_RNA'

    # Next, intergenic sites more than 250bp away from annotated exons that had initiating transcription, 
    # and passed the jump test were annotated as unknown_promoter.
    elif is_nearly_intergenic and not(near_exon1 or near_exon2) and jump:
        return 'unknown_promoter_jump'

    # All remaining sites were annotated as transcription_initiation or no_transcription based on 
    # whether they had transcription initiation.
    else:
        #if v: print('no_transcription')
        return 'no_transcription'

for stage in itertools.islice(l_cond, None):
    print(stage, 'fwd')
    df_prom_fwd['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
        # chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_fwd['lcap_%s_fwd_passed_jump' % (stage,)], df_lcap_fwd['lcap_%s_fwd_passed_incr' % (stage,)],
        # maxgap
        df_maxgap_fwd['maxgap_%s_fwd' % (stage,)],
        # exon1_dist
        df_exon_fwd['pass1_exon1_start'] - l_atac_peak_pos,
        # exon2_dist
        df_exon_fwd['pass1_exon2_start'] - l_atac_peak_pos,
        # pass2_exon2_dist
        df_exon_fwd['pass2_exon1_start'] - l_atac_peak_pos,
        # scap_pass
        #df_scap_fwd['scap_pass'],
        # scap_diff
        #pd.to_numeric(df_exon_fwd['pass1_exon1_init_cutoff_pos'], errors='coerce') - df_scap_fwd['scap_mode'],
        # gene_biotype
        df_exon_fwd['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_fwd['pass1_exon1_distance'] <= df_exon_fwd['pass1_exon2_downstream_distance'],
        # is_nearly_intergenic
        df_exon_fwd['is_nearly_intergenic'],
    )]
    print(stage, 'rev')
    df_prom_rev['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
        # chrom, start, end
        df_atac['chrom'], df_atac['start'], df_atac['end'],
        # jump, incr
        df_lcap_rev['lcap_%s_rev_passed_jump' % (stage,)], df_lcap_rev['lcap_%s_rev_passed_incr' % (stage,)],
        # maxgap
        df_maxgap_rev['maxgap_%s_rev' % (stage,)],
        # exon1_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon1_end'] - 1),
        # exon2_dist
        l_atac_peak_pos - (df_exon_rev['pass1_exon2_end'] - 1),
        # pass2_exon2_dist
        l_atac_peak_pos - (df_exon_rev['pass2_exon1_end'] - 1),
        # scap_pass
        #df_scap_rev['scap_pass'],
        # scap_diff
        #df_scap_rev['scap_mode'] - pd.to_numeric(df_exon_rev['pass1_exon1_init_cutoff_pos'], errors='coerce'),
        # gene_biotype
        df_exon_rev['pass1_exon1_gene_biotype'],
        # is_exon1_first
        df_exon_rev['pass1_exon2_downstream_distance'] <= df_exon_rev['pass1_exon1_distance'],
        # is_nearly_intergenic
        df_exon_rev['is_nearly_intergenic'],
    )]


wt_ya fwd
wt_ya rev

In [7]:
# df_prom_%strand[annot_detailed_all]: stage-aggregated detailed annotation, excluding lowconf
def annot_detailed_summary_(l_annot):
    l_ranking = list(d_detailed_annot.keys())
    l_annot_i = [l_ranking.index(annot) for annot in l_annot]
    return l_ranking[min(l_annot_i)]

df_prom_fwd['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
    df_prom_fwd[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))

df_prom_rev['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
    df_prom_rev[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))

In [8]:
# Forward strand annotation summary table (excl. lowconf)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')


Out[8]:
wt_ya summary
coding_promoter_jump 1346 1346
coding_promoter_incr 216 216
coding_promoter_jump_incr 0 0
coding_promoter_lowconf 0 0
pseudogene_promoter_jump 4 4
pseudogene_promoter_incr 2 2
pseudogene_promoter_jump_incr 0 0
pseudogene_promoter_lowconf 0 0
non-coding_RNA 328 328
unknown_promoter_jump 96 96
no_transcription 6553 6553

In [9]:
# Reverse strand annotation summary table (excl. lowconf)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')


Out[9]:
wt_ya summary
coding_promoter_jump 1233 1233
coding_promoter_incr 317 317
coding_promoter_jump_incr 0 0
coding_promoter_lowconf 0 0
pseudogene_promoter_jump 3 3
pseudogene_promoter_incr 3 3
pseudogene_promoter_jump_incr 0 0
pseudogene_promoter_lowconf 0 0
non-coding_RNA 357 357
unknown_promoter_jump 58 58
no_transcription 6574 6574

In [10]:
# (Skipped low-confidence annotation, as it relies on short cap)

In [11]:
# Forward strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')


Out[11]:
wt_ya summary
coding_promoter_jump 1346 1346
coding_promoter_incr 216 216
coding_promoter_jump_incr 0 0
coding_promoter_lowconf 0 0
pseudogene_promoter_jump 4 4
pseudogene_promoter_incr 2 2
pseudogene_promoter_jump_incr 0 0
pseudogene_promoter_lowconf 0 0
non-coding_RNA 328 328
unknown_promoter_jump 96 96
no_transcription 6553 6553

In [12]:
# Reverse strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')


Out[12]:
wt_ya summary
coding_promoter_jump 1233 1233
coding_promoter_incr 317 317
coding_promoter_jump_incr 0 0
coding_promoter_lowconf 0 0
pseudogene_promoter_jump 3 3
pseudogene_promoter_incr 3 3
pseudogene_promoter_jump_incr 0 0
pseudogene_promoter_lowconf 0 0
non-coding_RNA 357 357
unknown_promoter_jump 58 58
no_transcription 6574 6574

In [13]:
# annot_detailed => collapse by method
for s_ in l_cond + ['summary']:
    df_prom_fwd['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed], 
                                               df_prom_fwd['annot_detailed_%s' % (s_,)]))
    df_prom_rev['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed], 
                                               df_prom_rev['annot_detailed_%s' % (s_,)]))

In [14]:
# Forward strand annotation (collapsed across methods)
df_fwd_ = pd.concat([df_prom_fwd['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_fwd_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_fwd.pdf', bbox_inches='tight')
df_fwd_


Out[14]:
wt_ya summary
coding_promoter 1562 1562
pseudogene_promoter 6 6
non-coding_RNA 328 328
unknown_promoter 96 96
no_transcription 6553 6553

In [15]:
# Reverse strand annotation summary (collapsed across methods)
df_rev_ = pd.concat([df_prom_rev['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_rev_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_rev.pdf', bbox_inches='tight')
df_rev_


Out[15]:
wt_ya summary
coding_promoter 1550 1550
pseudogene_promoter 6 6
non-coding_RNA 357 357
unknown_promoter 58 58
no_transcription 6574 6574

In [16]:
# Cross-tabulation of strand-specific summary annotation
pd.crosstab(pd.Categorical(df_prom_fwd['annot_summary']), pd.Categorical(df_prom_rev['annot_summary'])).loc[list(d_annot_strand_legend.keys()), list(d_annot_strand_legend.keys())]


Out[16]:
col_0 coding_promoter pseudogene_promoter non-coding_RNA unknown_promoter no_transcription
row_0
coding_promoter 479 2 7 20 1054
pseudogene_promoter 0 0 0 0 6
non-coding_RNA 3 0 21 0 304
unknown_promoter 26 0 0 1 69
no_transcription 1042 4 329 37 5141

In [17]:
# S2_outron-extended_genes.bed <- genes extended by outron regions as defined by (aggregated) promoter annotations
def df_outron_extended_genes_(df_prom_fwd_, df_prom_rev_):

    # df_prom_bounds <- prom min/max boundaries, based on promoter annotation
    df_prom_bounds_fwd = df_prom_fwd_.groupby(['pass1_exon1_gene_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id']]
    df_prom_bounds_fwd.columns = ['chrom', 'start', 'end', 'gene_id']#, 'locus_id']

    df_prom_bounds_rev = df_prom_rev_.groupby(['pass1_exon1_gene_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id']]
    df_prom_bounds_rev.columns = ['chrom', 'start', 'end', 'gene_id']#, 'locus_id']

    df_prom_bounds = pd.concat([df_prom_bounds_fwd, df_prom_bounds_rev], axis=0, ignore_index=False)\
        .sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
    print('%d fwd %d rev %d pooled' % (len(df_prom_bounds_fwd), len(df_prom_bounds_rev), len(df_prom_bounds)))

    # Write a .bed-file of all the regions that changed
    df_prom_bounds.to_csv(mp('_df_prom_bounds.bed'), index=False, header=False, sep='\t')
    !wc -l {mp('_df_prom_bounds.bed')}

    # df_gene_bounds <- gene min/max boundaries, based on annotation
    df_transcripts_ = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot.gtf.gz', parse_attr=False)
    df_transcripts = df_gfftags_unpack(df_transcripts_, name='attribute')
    
    # Add display_id
    #fp_geneIDs = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS260.geneIDs.txt.gz'
    #df_geneIDs = pd.read_csv(fp_geneIDs, sep=',', names=('na', 'gene_id', 'locus', 'sequence_id', 'status'))[['gene_id', 'locus', 'sequence_id', 'status']]
    #
    #def display_id_(locus, sequence_id, gene_id):
    #    if locus == locus:
    #        return locus
    #    elif sequence_id == sequence_id:
    #        return sequence_id
    #    else:
    #        return gene_id
    #
    #df_geneIDs['locus_id'] = list(map(display_id_, df_geneIDs['locus'], df_geneIDs['sequence_id'], df_geneIDs['gene_id']))
    #df_transcripts = pd.merge(df_transcripts, df_geneIDs[['gene_id', 'locus_id']], left_on='gene_id', right_on='gene_id')

    df_gene_bounds = df_transcripts.groupby(['gene_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
    }).reset_index()[['chrom', 'start', 'end', 'gene_id']].sort_values(['chrom', 'start', 'end']).reset_index(drop=True)

    df_gene_bounds.to_csv(mp('_df_gene_bounds.bed'), index=False, header=False, sep='\t')
    !wc -l {mp('_df_gene_bounds.bed')}
    
    df_ = df_prom_bounds.merge(df_gene_bounds, how='outer', on=['gene_id'], suffixes=('_prom', '_gene'))
    df_['start'] = list(map(int, map(np.nanmin, zip(df_['start_prom'], df_['start_gene']))))
    df_['end'] = list(map(int, map(np.nanmax, zip(df_['end_prom'], df_['end_gene']))))
    df_ = df_[['chrom_gene', 'start', 'end', 'gene_id']].sort_values(['chrom_gene', 'start', 'end']).reset_index(drop=True)
    df_.columns = [['chrom', 'start', 'end', 'gene_id']]
    return df_

q_ = '(annot_summary == "coding_promoter") | (annot_summary == "pseudogene_promoter") | (annot_summary == "non-coding_RNA")'
df_outron_extended_genes = df_outron_extended_genes_(
    pd.concat([df_prom_fwd, df_exon_fwd['pass1_exon1_gene_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id']],
    pd.concat([df_prom_rev, df_exon_rev['pass1_exon1_gene_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id']],
)
df_outron_extended_genes.to_csv(mp('outron-extended_genes.bed'), index=False, header=False, sep='\t')
!wc -l {mp('outron-extended_genes.bed')}


1772 fwd 1774 rev 3546 pooled
3546 annot_cb/metrics_type/_df_prom_bounds.bed
22788 annot_cb/metrics_type/_df_gene_bounds.bed
22788 annot_cb/metrics_type/outron-extended_genes.bed

In [18]:
# df_associated_gene <- associated gene assignments
df_associated_gene = pd.read_csv(BedTool.from_dataframe(df_atac[['chrom', 'start', 'end']]).map(
    BedTool.from_dataframe(df_outron_extended_genes).fn, c='4', o='distinct',).fn, sep='\t',
    names=NAMES_BED9[:4])
df_associated_gene.columns = ['chrom', 'start', 'end', 'associated_gene_id']#, 'associated_locus_id']

In [22]:
# Table S2 -- unstranded summary annotation
def name_(annot_fwd, annot_rev, locus_id_fwd, locus_id_rev, enhancer_locus_id):
    if (annot_fwd in config['annot_with_gene_id']) and not(annot_rev in config['annot_with_gene_id']):
        return locus_id_fwd
    elif not(annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
        return locus_id_rev
    elif (annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
        return '%s / %s' % (locus_id_rev, locus_id_fwd)
    elif enhancer_locus_id != '.': # show "likeliest target gene for an enhancer" in brackets for all remaining (=not coding_promoter or non_coding_promoter)
        return '(%s)' % (enhancer_locus_id,)
    else:
        return ''

def annot_merge_(annot_fwd, annot_rev):
    if annot_fwd == 'coding_promoter' or annot_rev == 'coding_promoter':
        return 'coding_promoter'
    elif annot_fwd == 'pseudogene_promoter' or annot_rev == 'pseudogene_promoter':
        return 'pseudogene_promoter'
    elif annot_fwd == 'non-coding_RNA' or annot_rev == 'non-coding_RNA':
        return 'non-coding_RNA'
    elif annot_fwd == 'unknown_promoter' or annot_rev == 'unknown_promoter':
        return 'unknown_promoter'
    elif annot_fwd == 'transcription_initiation' or annot_rev == 'transcription_initiation':
        return 'putative_enhancer'
    elif annot_fwd == 'no_transcription' or annot_rev == 'no_transcription':
        return 'other_element'
    assert False

def strand_merge_(annot_fwd, annot_rev):
    promoter_fwd = (annot_fwd in config['annot_with_gene_id'])
    promoter_rev = (annot_rev in config['annot_with_gene_id'])
    lcap_fwd = annot_fwd == 'unknown_promoter'
    lcap_rev = annot_rev == 'unknown_promoter'
    if promoter_fwd and promoter_rev:
        return '.'
    elif promoter_fwd and (not promoter_rev):
        return '+'
    elif (not promoter_fwd) and promoter_rev:
        return '-'
    # in the absence of promoter annotations with an "annotated" target, use unknown_promoter annotations for strand
    elif lcap_fwd and lcap_rev:
        return '.'
    elif lcap_fwd and (not lcap_rev):
        return '+'
    elif (not lcap_fwd) and lcap_rev:
        return '-'
    else:
        return '.'

df_regl = df_atac[['chrom', 'start', 'end']].copy()
#df_regl['tss_fwd'] = df_scap_fwd['scap_mode']
#df_regl['tss_rev'] = df_scap_rev['scap_mode']
#df_regl['scap_pass_fwd'] = df_scap_fwd['scap_pass']
#df_regl['scap_pass_rev'] = df_scap_rev['scap_pass']
#df_regl['scap_mode_fwd'] = df_scap_fwd['scap_mode']
#df_regl['scap_mode_rev'] = df_scap_rev['scap_mode']
#df_regl['scap_mode_count_fwd'] = df_scap_fwd['scap_mode_count']
#df_regl['scap_mode_count_rev'] = df_scap_rev['scap_mode_count']
#df_regl['scap_count_fwd'] = df_scap_fwd['scap_count']
#df_regl['scap_count_rev'] = df_scap_rev['scap_count']
df_regl['annot_fwd'] = df_prom_fwd['annot_summary']
df_regl['annot_rev'] = df_prom_rev['annot_summary']
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']
df_regl['annot'] = list(map(annot_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))
df_regl['strand'] = list(map(strand_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))

def agg_pass1_pass2(annot, pass1_val, pass2_val):
    if annot in ['coding_promoter', 'pseudogene_promoter']:
        return pass1_val
    elif annot == 'non-coding_RNA':
        return pass2_val
    else:
        return ''

df_regl['promoter_gene_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
                                    df_exon_fwd['pass1_exon1_gene_id'], df_exon_fwd['pass2_exon1_gene_id']))
df_regl['promoter_gene_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
                                    df_exon_rev['pass1_exon1_gene_id'], df_exon_rev['pass2_exon1_gene_id']))

#df_regl['promoter_locus_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
#                                    df_exon_fwd['pass1_exon1_locus_id'], df_exon_fwd['pass2_exon1_locus_id']))
#df_regl['promoter_locus_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
#                                    df_exon_rev['pass1_exon1_locus_id'], df_exon_rev['pass2_exon1_locus_id']))

df_regl['promoter_gene_biotype_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
                                    df_exon_fwd['pass1_exon1_gene_biotype'], df_exon_fwd['pass2_exon1_gene_biotype']))
df_regl['promoter_gene_biotype_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
                                    df_exon_rev['pass1_exon1_gene_biotype'], df_exon_rev['pass2_exon1_gene_biotype']))

df_regl['associated_gene_id'] = df_associated_gene['associated_gene_id']
#df_regl['associated_locus_id'] = df_associated_gene['associated_locus_id']

# For briggsae, use gene_ids for displaying
df_regl['label'] = list(map(name_,
    df_regl['annot_fwd'], df_regl['annot_rev'],                     
    df_regl['promoter_gene_id_fwd'], df_regl['promoter_gene_id_rev'],
    df_regl['associated_gene_id']
))

In [23]:
df_prom_fwd.to_csv(mp('prom_fwd.tsv'), sep='\t', index=False)
df_prom_rev.to_csv(mp('prom_rev.tsv'), sep='\t', index=False)
df_regl.to_csv(mp('regl.tsv'), sep='\t', index=False)

In [24]:
# regulatory_annotation_%strand.bed: stranded .bed-files showing the annotation in individual stages
write_gffbed(os.path.join(annot_, 'regulatory_annotation_cb_fwd.bed'),
        chrom = df_regl['chrom'],
        start = df_regl['start'],
        end = df_regl['end'],
        name = df_regl['promoter_gene_id_fwd'],
        strand = '+',
        itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_fwd'])),
        attr = df_prom_fwd[['annot_%s' % (stage,) for stage in l_cond]],
)

write_gffbed(os.path.join(annot_, 'regulatory_annotation_cb_rev.bed'),
        chrom = df_regl['chrom'],
        start = df_regl['start'],
        end = df_regl['end'],
        name = df_regl['promoter_gene_id_rev'],
        strand = '-',
        itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_rev'])),
        attr = df_prom_rev[['annot_%s' % (stage,) for stage in l_cond]],
)

In [25]:
# regulatory_annotation_ce10.bed: final annotation as a single, unstranded .bed-file (ce10)
def write_ce10_(fp_, df_regl_):
    write_gffbed(fp_,
        chrom = df_regl_['chrom'],
        start = df_regl_['start'],
        end = df_regl_['end'],
        name = df_regl_['label'],
        strand = df_regl_['strand'],
        itemRgb = list(map(lambda annot: d_annot_legend[annot], df_regl_['annot'])),
        attr = df_regl_[['annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev']],
    )

write_ce10_(os.path.join(annot_, 'reg_elements_cb.bed'), df_regl)