In [1]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb
annot_ = 'annot_cb'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_type', fp)
In [2]:
l_cond = list(config['annot_cb']['lcap_samples'].keys())
df_atac = pd.read_csv(os.path.join(annot_, 'accessible_sites_cb.tsv'), sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_lcap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_fwd.tsv'), sep='\t', low_memory=False)
df_lcap_rev = pd.read_csv(os.path.join(annot_, 'metrics_lcap', 'lcap_all_rev.tsv'), sep='\t', low_memory=False)
df_exon_fwd = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_fwd.tsv'), sep='\t', low_memory=False)
df_exon_rev = pd.read_csv(os.path.join(annot_, 'metrics_exon', 'closest_exon_rev.tsv'), sep='\t', low_memory=False)
df_maxgap_fwd = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_fwd.tsv'), sep='\t')
df_maxgap_rev = pd.read_csv(os.path.join(annot_, 'metrics_maxgap', 'maxgap_rev.tsv'), sep='\t')
df_prom_fwd = df_atac[['chrom', 'start', 'end']].copy()
df_prom_rev = df_atac[['chrom', 'start', 'end']].copy()
df_prom_fwd['strand'] = '+'
df_prom_rev['strand'] = '-'
In [3]:
# df_pass1_gene_%strand: WS260_ce10 gene coordinates, excluding 250bp downstream of TSS
df_ = pd.concat([
read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.genes.pseudogene.gtf.gz', parse_attr=True, coords_adj=True),
read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.genes.protein_coding.gtf.gz', parse_attr=True, coords_adj=True),
], axis=0)
df_pass1_gene_fwd = df_.query('strand == "+"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]
df_pass1_gene_rev = df_.query('strand == "-"').reset_index(drop=True)[['chrom', 'start', 'end', 'gene_id']]
print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes')
# do not mask 250bp downstream of TSS
df_pass1_gene_fwd['start'] = df_pass1_gene_fwd['start'] + 250
df_pass1_gene_rev['end'] = df_pass1_gene_rev['end'] - 250
df_pass1_gene_fwd = df_pass1_gene_fwd.query('start < end')
df_pass1_gene_rev = df_pass1_gene_rev.query('start < end')
print(len(df_pass1_gene_fwd), len(df_pass1_gene_rev), 'fwd/rev genes longer than 250bp kept')
In [4]:
# df_pass1_gpos_%strand[gene_chrom/start/end]: closest gene; if overlaps multiple genes, choose one with the more distal TSS
def df_pos():
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_ = pd.concat([df_atac['chrom'], l_atac_peak_pos, l_atac_peak_pos + 1], axis=1).copy()
df_.columns = ['chrom', 'start', 'end']
return df_
def df_closest_(df_a, df_b, b_prefix, **kwargs):
df_a_sort = df_a
df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
names_ = list(itertools.chain(df_a.columns.values,
['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
))
df_ = pd.read_csv(fn_, sep='\t', names=names_)
return df_#[names_[len(df_a.columns):]]
df_pass1_gpos_fwd_ = df_closest_(df_pos(), df_pass1_gene_fwd, 'pass1_gpos', t='all')
df_pass1_gpos_rev_ = df_closest_(df_pos(), df_pass1_gene_rev, 'pass1_gpos', t='all')
print(len(df_pass1_gpos_fwd_), len(df_pass1_gpos_rev_), 'fwd/rev gene body hits')
In [5]:
# df_pass1_gpos_%strand[is_nearly_intergenic]: site is outside of gene bodies, or up to 250bp from the TSS
df_pass1_gpos_fwd_['is_nearly_intergenic'] = ~((df_pass1_gpos_fwd_['pass1_gpos_start'] <= df_pass1_gpos_fwd_['start'])\
& (df_pass1_gpos_fwd_['end'] <= df_pass1_gpos_fwd_['pass1_gpos_end']))
df_pass1_gpos_rev_['is_nearly_intergenic'] = ~((df_pass1_gpos_rev_['pass1_gpos_start'] <= df_pass1_gpos_rev_['start'])\
& (df_pass1_gpos_rev_['end'] <= df_pass1_gpos_rev_['pass1_gpos_end']))
l_ = 'is_nearly_intergenic'
d_ = {'is_nearly_intergenic': np.any}
df_exon_fwd[l_] = df_pass1_gpos_fwd_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
df_exon_rev[l_] = df_pass1_gpos_rev_.groupby(['chrom', 'start', 'end']).agg(d_).reset_index()[l_]
print(len(df_exon_fwd), len(df_exon_rev))
print(len(df_exon_fwd.query('is_nearly_intergenic')))
print(len(df_exon_rev.query('is_nearly_intergenic')))
# Visualise maxgap as .bed-files
def itemRgb_(is_nearly_intergenic_):
if is_nearly_intergenic_:
return ORANGE
else:
return BLUE
write_gffbed(mp('is_nearly_intergenic_fwd.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
#name = df_maxgap_fwd.min(axis=1),
strand = '+',
#attr = df_pass1_gpos_fwd_,
itemRgb = map(itemRgb_, df_exon_fwd['is_nearly_intergenic']),
)
write_gffbed(mp('is_nearly_intergenic_rev.bed'),
chrom = df_atac['chrom'],
start = df_atac['start'],
end = df_atac['end'],
#name = df_maxgap_rev.min(axis=1),
strand = '-',
#attr = df_maxgap_rev,
itemRgb = map(itemRgb_, df_exon_rev['is_nearly_intergenic']),
)
In [6]:
# df_prom_%strand[annot_detailed_%stage]: stage-specific detailed annotation, excluding lowconf
d_detailed_annot = collections.OrderedDict([
('coding_promoter_jump', 'coding_promoter'),
('coding_promoter_incr', 'coding_promoter'),
('coding_promoter_jump_incr', 'coding_promoter'),
('coding_promoter_lowconf', 'coding_promoter'),
('pseudogene_promoter_jump', 'pseudogene_promoter'),
('pseudogene_promoter_incr', 'pseudogene_promoter'),
('pseudogene_promoter_jump_incr', 'pseudogene_promoter'),
('pseudogene_promoter_lowconf', 'pseudogene_promoter'),
('non-coding_RNA', 'non-coding_RNA'),
('unknown_promoter_jump', 'unknown_promoter'),
#('unknown_promoter_jump_incr', 'unknown_promoter'),
#('transcription_initiation', 'transcription_initiation'),
('no_transcription', 'no_transcription'),
])
d_annot_strand_legend = collections.OrderedDict([
('coding_promoter', RED),
('pseudogene_promoter', ORANGE),
('non-coding_RNA', BLACK),
('unknown_promoter', YELLOW),
#('transcription_initiation', GREEN),
('no_transcription', BLUE),
])
d_annot_legend = collections.OrderedDict([
('coding_promoter', RED),
('pseudogene_promoter', ORANGE),
('non-coding_RNA', BLACK),
('unknown_promoter', YELLOW),
#('putative_enhancer', GREEN),
('other_element', BLUE),
])
def annot_detailed_(chrom, start, end, jump, incr, maxgap, exon1_dist, exon2_dist, pass2_exon1_dist,
#scap_pass, scap_diff,
gene_biotype, is_exon1_first, is_nearly_intergenic):
"""
#is_gene_biotype = (gene_biotype == 'protein_coding') | (gene_biotype == 'pseudogene')
#v = False
#if chrom == "chrI" and start == 183421 and end == 183568: v = True
#if (chrom == "chrX") and (start == 1125552) and (end == 1125703):
# v = True
# print(jump, maxgap <= 0, scap_diff >= 0, has_scap)
#if start == 11829601-1: print(incr, jump, has_scap, n_txn_evidence)
"""
near_exon1 = abs(exon1_dist) < 250
near_exon2 = abs(exon2_dist) < 250
near_pass2_exon1 = abs(pass2_exon1_dist) < 250
#nearly_upstream_of_tss = exon1_dist > -250
# We annotated accessible sites as coding_promoter or pseudogene_promoter if they fulfilled the following
# four criteria (p1 to p4).
# 1) The accessible site had transcription initiation, and passed at least one of the elongation tests
# (jump or incr), or passed both elongation tests (jump and incr).
#p1 = (int(scap_pass) + int(jump) + int(incr)) >= 2
p1 = (int(jump) + int(incr)) >= 1
# 2) Transcription initiation mode at the accessible site was either upstream of the closest first exon,
# or, in the presence of a UTR, up to 250bp downstream within the UTR.
#p2 = nearly_upstream_of_tss and (scap_diff >= 0)
#p2 = (scap_diff >= 0)
p2 = True # (NA as we don't have short cap)
# 3) The region from peak accessibility to the closest first exon did not contain the 5' end of a non-first exon.
p3 = is_exon1_first
# 4) Distal sites (peak accessibility >250bp from the closest first exon) were additionally required to
# (a) have continuous long cap coverage from 250bp downstream of peak accessibility to the closest first exon, and
# (b) be further than 250bp away from any non-first exon.
p4 = near_exon1 or ((maxgap <= 0) and not near_exon2)
if p1 and p2 and p3 and p4:
#if v: print('coding_promoter')
if jump:#scap_pass and jump:
return 'coding_promoter_jump' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump'
elif incr:
return 'coding_promoter_incr' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_incr'
#elif jump and incr:
# return 'coding_promoter_jump_incr' if (gene_biotype == 'protein_coding') else 'pseudogene_promoter_jump_incr'
else:
assert False
# Sites within 250bp of the 5' end of an annotated tRNA, snoRNA, miRNA, snRNA or rRNA were annotated as
# non-coding_RNA.
elif near_pass2_exon1:
#if v: print('non-coding_RNA')
return 'non-coding_RNA'
# Next, intergenic sites more than 250bp away from annotated exons that had initiating transcription,
# and passed the jump test were annotated as unknown_promoter.
elif is_nearly_intergenic and not(near_exon1 or near_exon2) and jump:
return 'unknown_promoter_jump'
# All remaining sites were annotated as transcription_initiation or no_transcription based on
# whether they had transcription initiation.
else:
#if v: print('no_transcription')
return 'no_transcription'
for stage in itertools.islice(l_cond, None):
print(stage, 'fwd')
df_prom_fwd['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
# chrom, start, end
df_atac['chrom'], df_atac['start'], df_atac['end'],
# jump, incr
df_lcap_fwd['lcap_%s_fwd_passed_jump' % (stage,)], df_lcap_fwd['lcap_%s_fwd_passed_incr' % (stage,)],
# maxgap
df_maxgap_fwd['maxgap_%s_fwd' % (stage,)],
# exon1_dist
df_exon_fwd['pass1_exon1_start'] - l_atac_peak_pos,
# exon2_dist
df_exon_fwd['pass1_exon2_start'] - l_atac_peak_pos,
# pass2_exon2_dist
df_exon_fwd['pass2_exon1_start'] - l_atac_peak_pos,
# scap_pass
#df_scap_fwd['scap_pass'],
# scap_diff
#pd.to_numeric(df_exon_fwd['pass1_exon1_init_cutoff_pos'], errors='coerce') - df_scap_fwd['scap_mode'],
# gene_biotype
df_exon_fwd['pass1_exon1_gene_biotype'],
# is_exon1_first
df_exon_fwd['pass1_exon1_distance'] <= df_exon_fwd['pass1_exon2_downstream_distance'],
# is_nearly_intergenic
df_exon_fwd['is_nearly_intergenic'],
)]
print(stage, 'rev')
df_prom_rev['annot_detailed_%s' % (stage,)] = [*map(annot_detailed_,
# chrom, start, end
df_atac['chrom'], df_atac['start'], df_atac['end'],
# jump, incr
df_lcap_rev['lcap_%s_rev_passed_jump' % (stage,)], df_lcap_rev['lcap_%s_rev_passed_incr' % (stage,)],
# maxgap
df_maxgap_rev['maxgap_%s_rev' % (stage,)],
# exon1_dist
l_atac_peak_pos - (df_exon_rev['pass1_exon1_end'] - 1),
# exon2_dist
l_atac_peak_pos - (df_exon_rev['pass1_exon2_end'] - 1),
# pass2_exon2_dist
l_atac_peak_pos - (df_exon_rev['pass2_exon1_end'] - 1),
# scap_pass
#df_scap_rev['scap_pass'],
# scap_diff
#df_scap_rev['scap_mode'] - pd.to_numeric(df_exon_rev['pass1_exon1_init_cutoff_pos'], errors='coerce'),
# gene_biotype
df_exon_rev['pass1_exon1_gene_biotype'],
# is_exon1_first
df_exon_rev['pass1_exon2_downstream_distance'] <= df_exon_rev['pass1_exon1_distance'],
# is_nearly_intergenic
df_exon_rev['is_nearly_intergenic'],
)]
In [7]:
# df_prom_%strand[annot_detailed_all]: stage-aggregated detailed annotation, excluding lowconf
def annot_detailed_summary_(l_annot):
l_ranking = list(d_detailed_annot.keys())
l_annot_i = [l_ranking.index(annot) for annot in l_annot]
return l_ranking[min(l_annot_i)]
df_prom_fwd['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
df_prom_fwd[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))
df_prom_rev['annot_detailed_summary'] = list(map(lambda ir: annot_detailed_summary_(ir[1].tolist()),
df_prom_rev[['annot_detailed_%(stage)s' % locals() for stage in l_cond]].iterrows()))
In [8]:
# Forward strand annotation summary table (excl. lowconf)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')
Out[8]:
In [9]:
# Reverse strand annotation summary table (excl. lowconf)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')
Out[9]:
In [10]:
# (Skipped low-confidence annotation, as it relies on short cap)
In [11]:
# Forward strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_fwd_ = pd.concat([df_prom_fwd['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_fwd.latex' % locals())
df_fwd_.fillna(0, downcast='infer')
Out[11]:
In [12]:
# Reverse strand annotation summary table (incl. lowconf across all stages but not the origin stage)
df_rev_ = pd.concat([df_prom_rev['annot_detailed_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_detailed_annot.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
df_rev_.fillna(0, downcast='infer')
Out[12]:
In [13]:
# annot_detailed => collapse by method
for s_ in l_cond + ['summary']:
df_prom_fwd['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed],
df_prom_fwd['annot_detailed_%s' % (s_,)]))
df_prom_rev['annot_%s' % (s_,)] = list(map(lambda annot_detailed: d_detailed_annot[annot_detailed],
df_prom_rev['annot_detailed_%s' % (s_,)]))
In [14]:
# Forward strand annotation (collapsed across methods)
df_fwd_ = pd.concat([df_prom_fwd['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_fwd_.columns = l_cond + ['summary']
#df_fwd_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_fwd_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_fwd.pdf', bbox_inches='tight')
df_fwd_
Out[14]:
In [15]:
# Reverse strand annotation summary (collapsed across methods)
df_rev_ = pd.concat([df_prom_rev['annot_%(stage)s' % locals()].value_counts() for stage in l_cond + ['summary']], axis=1).loc[d_annot_strand_legend.keys()]
df_rev_.columns = l_cond + ['summary']
#df_rev_.to_latex('%(dm_step)s_fig/prom_summary_rev.latex' % locals())
#plt.subplot(111, frame_on=False)
#plt.gca().xaxis.set_visible(False)
#plt.gca().yaxis.set_visible(False)
#pd.plotting.table(data=df_rev_, ax=plt.gca(), loc='center left')
#plt.savefig('annot/FigA_mapping/annot_crosstab_rev.pdf', bbox_inches='tight')
df_rev_
Out[15]:
In [16]:
# Cross-tabulation of strand-specific summary annotation
pd.crosstab(pd.Categorical(df_prom_fwd['annot_summary']), pd.Categorical(df_prom_rev['annot_summary'])).loc[list(d_annot_strand_legend.keys()), list(d_annot_strand_legend.keys())]
Out[16]:
In [17]:
# S2_outron-extended_genes.bed <- genes extended by outron regions as defined by (aggregated) promoter annotations
def df_outron_extended_genes_(df_prom_fwd_, df_prom_rev_):
# df_prom_bounds <- prom min/max boundaries, based on promoter annotation
df_prom_bounds_fwd = df_prom_fwd_.groupby(['pass1_exon1_gene_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
}).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id']]
df_prom_bounds_fwd.columns = ['chrom', 'start', 'end', 'gene_id']#, 'locus_id']
df_prom_bounds_rev = df_prom_rev_.groupby(['pass1_exon1_gene_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
}).reset_index()[['chrom', 'start', 'end', 'pass1_exon1_gene_id']]
df_prom_bounds_rev.columns = ['chrom', 'start', 'end', 'gene_id']#, 'locus_id']
df_prom_bounds = pd.concat([df_prom_bounds_fwd, df_prom_bounds_rev], axis=0, ignore_index=False)\
.sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
print('%d fwd %d rev %d pooled' % (len(df_prom_bounds_fwd), len(df_prom_bounds_rev), len(df_prom_bounds)))
# Write a .bed-file of all the regions that changed
df_prom_bounds.to_csv(mp('_df_prom_bounds.bed'), index=False, header=False, sep='\t')
!wc -l {mp('_df_prom_bounds.bed')}
# df_gene_bounds <- gene min/max boundaries, based on annotation
df_transcripts_ = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot.gtf.gz', parse_attr=False)
df_transcripts = df_gfftags_unpack(df_transcripts_, name='attribute')
# Add display_id
#fp_geneIDs = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS260.geneIDs.txt.gz'
#df_geneIDs = pd.read_csv(fp_geneIDs, sep=',', names=('na', 'gene_id', 'locus', 'sequence_id', 'status'))[['gene_id', 'locus', 'sequence_id', 'status']]
#
#def display_id_(locus, sequence_id, gene_id):
# if locus == locus:
# return locus
# elif sequence_id == sequence_id:
# return sequence_id
# else:
# return gene_id
#
#df_geneIDs['locus_id'] = list(map(display_id_, df_geneIDs['locus'], df_geneIDs['sequence_id'], df_geneIDs['gene_id']))
#df_transcripts = pd.merge(df_transcripts, df_geneIDs[['gene_id', 'locus_id']], left_on='gene_id', right_on='gene_id')
df_gene_bounds = df_transcripts.groupby(['gene_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
}).reset_index()[['chrom', 'start', 'end', 'gene_id']].sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
df_gene_bounds.to_csv(mp('_df_gene_bounds.bed'), index=False, header=False, sep='\t')
!wc -l {mp('_df_gene_bounds.bed')}
df_ = df_prom_bounds.merge(df_gene_bounds, how='outer', on=['gene_id'], suffixes=('_prom', '_gene'))
df_['start'] = list(map(int, map(np.nanmin, zip(df_['start_prom'], df_['start_gene']))))
df_['end'] = list(map(int, map(np.nanmax, zip(df_['end_prom'], df_['end_gene']))))
df_ = df_[['chrom_gene', 'start', 'end', 'gene_id']].sort_values(['chrom_gene', 'start', 'end']).reset_index(drop=True)
df_.columns = [['chrom', 'start', 'end', 'gene_id']]
return df_
q_ = '(annot_summary == "coding_promoter") | (annot_summary == "pseudogene_promoter") | (annot_summary == "non-coding_RNA")'
df_outron_extended_genes = df_outron_extended_genes_(
pd.concat([df_prom_fwd, df_exon_fwd['pass1_exon1_gene_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id']],
pd.concat([df_prom_rev, df_exon_rev['pass1_exon1_gene_id']], axis=1).query(q_)[['chrom', 'start', 'end', 'annot_summary', 'pass1_exon1_gene_id']],
)
df_outron_extended_genes.to_csv(mp('outron-extended_genes.bed'), index=False, header=False, sep='\t')
!wc -l {mp('outron-extended_genes.bed')}
In [18]:
# df_associated_gene <- associated gene assignments
df_associated_gene = pd.read_csv(BedTool.from_dataframe(df_atac[['chrom', 'start', 'end']]).map(
BedTool.from_dataframe(df_outron_extended_genes).fn, c='4', o='distinct',).fn, sep='\t',
names=NAMES_BED9[:4])
df_associated_gene.columns = ['chrom', 'start', 'end', 'associated_gene_id']#, 'associated_locus_id']
In [22]:
# Table S2 -- unstranded summary annotation
def name_(annot_fwd, annot_rev, locus_id_fwd, locus_id_rev, enhancer_locus_id):
if (annot_fwd in config['annot_with_gene_id']) and not(annot_rev in config['annot_with_gene_id']):
return locus_id_fwd
elif not(annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
return locus_id_rev
elif (annot_fwd in config['annot_with_gene_id']) and (annot_rev in config['annot_with_gene_id']):
return '%s / %s' % (locus_id_rev, locus_id_fwd)
elif enhancer_locus_id != '.': # show "likeliest target gene for an enhancer" in brackets for all remaining (=not coding_promoter or non_coding_promoter)
return '(%s)' % (enhancer_locus_id,)
else:
return ''
def annot_merge_(annot_fwd, annot_rev):
if annot_fwd == 'coding_promoter' or annot_rev == 'coding_promoter':
return 'coding_promoter'
elif annot_fwd == 'pseudogene_promoter' or annot_rev == 'pseudogene_promoter':
return 'pseudogene_promoter'
elif annot_fwd == 'non-coding_RNA' or annot_rev == 'non-coding_RNA':
return 'non-coding_RNA'
elif annot_fwd == 'unknown_promoter' or annot_rev == 'unknown_promoter':
return 'unknown_promoter'
elif annot_fwd == 'transcription_initiation' or annot_rev == 'transcription_initiation':
return 'putative_enhancer'
elif annot_fwd == 'no_transcription' or annot_rev == 'no_transcription':
return 'other_element'
assert False
def strand_merge_(annot_fwd, annot_rev):
promoter_fwd = (annot_fwd in config['annot_with_gene_id'])
promoter_rev = (annot_rev in config['annot_with_gene_id'])
lcap_fwd = annot_fwd == 'unknown_promoter'
lcap_rev = annot_rev == 'unknown_promoter'
if promoter_fwd and promoter_rev:
return '.'
elif promoter_fwd and (not promoter_rev):
return '+'
elif (not promoter_fwd) and promoter_rev:
return '-'
# in the absence of promoter annotations with an "annotated" target, use unknown_promoter annotations for strand
elif lcap_fwd and lcap_rev:
return '.'
elif lcap_fwd and (not lcap_rev):
return '+'
elif (not lcap_fwd) and lcap_rev:
return '-'
else:
return '.'
df_regl = df_atac[['chrom', 'start', 'end']].copy()
#df_regl['tss_fwd'] = df_scap_fwd['scap_mode']
#df_regl['tss_rev'] = df_scap_rev['scap_mode']
#df_regl['scap_pass_fwd'] = df_scap_fwd['scap_pass']
#df_regl['scap_pass_rev'] = df_scap_rev['scap_pass']
#df_regl['scap_mode_fwd'] = df_scap_fwd['scap_mode']
#df_regl['scap_mode_rev'] = df_scap_rev['scap_mode']
#df_regl['scap_mode_count_fwd'] = df_scap_fwd['scap_mode_count']
#df_regl['scap_mode_count_rev'] = df_scap_rev['scap_mode_count']
#df_regl['scap_count_fwd'] = df_scap_fwd['scap_count']
#df_regl['scap_count_rev'] = df_scap_rev['scap_count']
df_regl['annot_fwd'] = df_prom_fwd['annot_summary']
df_regl['annot_rev'] = df_prom_rev['annot_summary']
df_regl['annot_detailed_fwd'] = df_prom_fwd['annot_detailed_summary']
df_regl['annot_detailed_rev'] = df_prom_rev['annot_detailed_summary']
df_regl['annot'] = list(map(annot_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))
df_regl['strand'] = list(map(strand_merge_, df_prom_fwd['annot_summary'], df_prom_rev['annot_summary']))
def agg_pass1_pass2(annot, pass1_val, pass2_val):
if annot in ['coding_promoter', 'pseudogene_promoter']:
return pass1_val
elif annot == 'non-coding_RNA':
return pass2_val
else:
return ''
df_regl['promoter_gene_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
df_exon_fwd['pass1_exon1_gene_id'], df_exon_fwd['pass2_exon1_gene_id']))
df_regl['promoter_gene_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
df_exon_rev['pass1_exon1_gene_id'], df_exon_rev['pass2_exon1_gene_id']))
#df_regl['promoter_locus_id_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
# df_exon_fwd['pass1_exon1_locus_id'], df_exon_fwd['pass2_exon1_locus_id']))
#df_regl['promoter_locus_id_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
# df_exon_rev['pass1_exon1_locus_id'], df_exon_rev['pass2_exon1_locus_id']))
df_regl['promoter_gene_biotype_fwd'] = list(map(agg_pass1_pass2, df_regl['annot_fwd'],
df_exon_fwd['pass1_exon1_gene_biotype'], df_exon_fwd['pass2_exon1_gene_biotype']))
df_regl['promoter_gene_biotype_rev'] = list(map(agg_pass1_pass2, df_regl['annot_rev'],
df_exon_rev['pass1_exon1_gene_biotype'], df_exon_rev['pass2_exon1_gene_biotype']))
df_regl['associated_gene_id'] = df_associated_gene['associated_gene_id']
#df_regl['associated_locus_id'] = df_associated_gene['associated_locus_id']
# For briggsae, use gene_ids for displaying
df_regl['label'] = list(map(name_,
df_regl['annot_fwd'], df_regl['annot_rev'],
df_regl['promoter_gene_id_fwd'], df_regl['promoter_gene_id_rev'],
df_regl['associated_gene_id']
))
In [23]:
df_prom_fwd.to_csv(mp('prom_fwd.tsv'), sep='\t', index=False)
df_prom_rev.to_csv(mp('prom_rev.tsv'), sep='\t', index=False)
df_regl.to_csv(mp('regl.tsv'), sep='\t', index=False)
In [24]:
# regulatory_annotation_%strand.bed: stranded .bed-files showing the annotation in individual stages
write_gffbed(os.path.join(annot_, 'regulatory_annotation_cb_fwd.bed'),
chrom = df_regl['chrom'],
start = df_regl['start'],
end = df_regl['end'],
name = df_regl['promoter_gene_id_fwd'],
strand = '+',
itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_fwd'])),
attr = df_prom_fwd[['annot_%s' % (stage,) for stage in l_cond]],
)
write_gffbed(os.path.join(annot_, 'regulatory_annotation_cb_rev.bed'),
chrom = df_regl['chrom'],
start = df_regl['start'],
end = df_regl['end'],
name = df_regl['promoter_gene_id_rev'],
strand = '-',
itemRgb = list(map(lambda annot: d_annot_strand_legend[annot], df_regl['annot_rev'])),
attr = df_prom_rev[['annot_%s' % (stage,) for stage in l_cond]],
)
In [25]:
# regulatory_annotation_ce10.bed: final annotation as a single, unstranded .bed-file (ce10)
def write_ce10_(fp_, df_regl_):
write_gffbed(fp_,
chrom = df_regl_['chrom'],
start = df_regl_['start'],
end = df_regl_['end'],
name = df_regl_['label'],
strand = df_regl_['strand'],
itemRgb = list(map(lambda annot: d_annot_legend[annot], df_regl_['annot'])),
attr = df_regl_[['annot', 'annot_fwd', 'annot_rev', 'annot_detailed_fwd', 'annot_detailed_rev']],
)
write_ce10_(os.path.join(annot_, 'reg_elements_cb.bed'), df_regl)