In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
In [2]:
fp_ = 'WS260_ce10/WS260_ce10.genes.protein_coding.gtf.gz'
df_genes = yp.read_wbgtf(fp_, parse_attr=True, coords_adj=True)
print(len(df_genes))
In [3]:
# df_genes['tss_wormbase'] <- 5' end of the annotation in WS260_ce10
def tss_wormbase_(start, end, strand):
if strand != '-':
return start
else:
return end - 1
df_genes['tss_wormbase'] = [*map(tss_wormbase_, df_genes['start'], df_genes['end'], df_genes['strand'])]
In [4]:
# df_jaenes2018_fwd, df_jaenes2018_rev <- representative promoters by short cap counts at the mode
fp_ = 'annot_eLife_revised/Figure 2-source data 1. Regulatory annotation.txt'
df_regl = pd.read_csv(fp_, sep='\t')
df_scap_fwd = pd.read_csv('annot_eLife_full/metrics_scap/scap_fwd.tsv', sep='\t')
df_scap_rev = pd.read_csv('annot_eLife_full/metrics_scap/scap_rev.tsv', sep='\t')
df_regl['tss_count_fwd_ce10'] = df_scap_fwd['scap_mode_count']
df_regl['tss_count_rev_ce10'] = df_scap_rev['scap_mode_count']
def fwd_(df_):
return df_.sort_values(['tss_count_fwd_ce10'], ascending=False).iloc[0]
def rev_(df_):
return df_.sort_values(['tss_count_rev_ce10'], ascending=False).iloc[0]
df_jaenes2018_fwd = df_regl.query('annot_fwd == "coding_promoter"').groupby(['promoter_gene_id_fwd']).apply(fwd_)
df_jaenes2018_rev = df_regl.query('annot_rev == "coding_promoter"').groupby(['promoter_gene_id_rev']).apply(rev_)
write_gffbed('TSSs_one_per_gene/df_jaenes2018_fwd.bed',
chrom = df_jaenes2018_fwd['chrom_ce10'],
start = df_jaenes2018_fwd['start_ce10'],
end = df_jaenes2018_fwd['end_ce10'],
)
write_gffbed('TSSs_one_per_gene/df_jaenes2018_rev.bed',
chrom = df_jaenes2018_rev['chrom_ce10'],
start = df_jaenes2018_rev['start_ce10'],
end = df_jaenes2018_rev['end_ce10'],
)
In [5]:
df_fwd_ = df_jaenes2018_fwd[['tss_fwd_ce10', 'tss_count_fwd_ce10']].copy()
df_rev_ = df_jaenes2018_rev[['tss_rev_ce10', 'tss_count_rev_ce10']].copy()
df_fwd_.index.name = 'gene_id'
df_rev_.index.name = 'gene_id'
df_fwd_.columns = ['tss_jaenes2018', 'tss_jaenes2018_count']
df_rev_.columns = ['tss_jaenes2018', 'tss_jaenes2018_count']
df_ = pd.concat([df_fwd_, df_rev_], axis=0)
df_tss = df_genes.merge(df_, how='left', left_on='gene_id', right_index=True)
In [16]:
def tss_(tss_wormbase, tss_jaenes2018, tss_jaenes2018_count):
if tss_jaenes2018 == tss_jaenes2018:
return tss_jaenes2018
else:
return tss_wormbase
def col_(tss_wormbase, tss_jaenes2018, tss_jaenes2018_count):
if tss_jaenes2018 == tss_jaenes2018:
return yp.RED if tss_jaenes2018_count >= 2 else yp.YELLOW
else:
return yp.BLUE
df_tss['tss'] = [*map(tss_, df_tss['tss_wormbase'], df_tss['tss_jaenes2018'], df_tss['tss_jaenes2018_count'])]
df_tss['col'] = [*map(col_, df_tss['tss_wormbase'], df_tss['tss_jaenes2018'], df_tss['tss_jaenes2018_count'])]
In [19]:
df_tss['feature_start'] = [*map(min, df_tss['start'], df_tss['tss'])]
df_tss['feature_end'] = [*map(max, df_tss['tss'] + 1, df_tss['end'])]
write_gffbed('TSSs_one_per_gene/TSSs_one_per_gene.bed',
chrom = df_tss['chrom'],
start = df_tss['feature_start'].map(int),
end = df_tss['feature_end'].map(int),
name = df_tss['gene_id'],
strand = df_tss['strand'],
itemRgb = df_tss['col'],
attr = df_tss[['tss_jaenes2018', 'tss_jaenes2018_count', 'tss_wormbase']]
)
In [ ]: