In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
# Protein-coding gene annotation (WormBase WS260, ce10 per the file name).
fp_ = 'WS260_ce10/WS260_ce10.genes.protein_coding.gtf.gz'
# NOTE(review): yp.read_wbgtf is a project helper not visible here; the exact
# effect of parse_attr / coords_adj should be confirmed against its source.
df_genes = yp.read_wbgtf(
    fp_,
    parse_attr=True,
    coords_adj=True,
)
# Row count of the loaded gene table, for comparison with later cells.
print(df_genes.shape[0])
In [3]:
# df_source_data_ is not defined in the visible cells; presumably it is provided
# by the %run'd __init__ notebook — TODO confirm. Later cells read the columns
# annot_fwd, annot_rev, promoter_gene_id_fwd, promoter_gene_id_rev and pos
# from the returned frame.
df_regl = df_source_data_()
In [4]:
# Rows annotated as coding promoters on each strand, indexed by the gene id
# they are assigned to on that strand.
df_fwd_ = df_regl.query('annot_fwd == "coding_promoter"').set_index('promoter_gene_id_fwd')
df_rev_ = df_regl.query('annot_rev == "coding_promoter"').set_index('promoter_gene_id_rev')

# Collapse to one position per gene: the smallest pos for forward-assigned
# promoters, the largest pos for reverse-assigned promoters.
df_prom_fwd_ = df_fwd_.groupby('promoter_gene_id_fwd')['pos'].min()
df_prom_rev_ = df_rev_.groupby('promoter_gene_id_rev')['pos'].max()
print(df_prom_fwd_.shape[0] + df_prom_rev_.shape[0])

# Stack both strands into a single one-column ('pos') frame keyed by gene id.
# NOTE(review): a gene id present in both the fwd and rev sets would appear
# twice in this index and duplicate rows in any later merge — verify.
df_prom_ = pd.concat([df_prom_fwd_, df_prom_rev_], axis=0).to_frame()
print(df_prom_.shape[0])
In [5]:
def start_adj_(start, pos, strand):
    """Return the adjusted start coordinate for one gene.

    For strand '+', the result is the smaller of `start` and `pos` as an int;
    a NaN `pos` is ignored, so `start` is returned (as int) in that case.
    For any other strand value, `start` is returned unchanged.
    """
    if strand != '+':
        return start
    # nanmin skips a missing (NaN) pos, leaving the original start.
    return int(np.nanmin([start, pos]))
def end_adj_(end, pos, strand):
    """Return the adjusted end coordinate for one gene.

    For strand '-', the result is the larger of `end` and `pos + 1` as an int;
    a NaN `pos` is ignored, so `end` is returned (as int) in that case.
    For any other strand value, `end` is returned unchanged.
    """
    if strand != '-':
        return end
    # nanmax skips a missing (NaN) pos, leaving the original end.
    return int(np.nanmax([end, pos + 1]))
# Attach the per-gene promoter position to every gene; genes without an entry
# in df_prom_ keep a missing pos (left join).
df_ = df_genes.merge(df_prom_, how='left', left_on='gene_id', right_index=True)

# Row-wise adjusted coordinates, computed with start_adj_ / end_adj_.
df_['start_adj'] = [
    start_adj_(start, pos, strand)
    for start, pos, strand in zip(df_['start'], df_['pos'], df_['strand'])
]
df_['end_adj'] = [
    end_adj_(end, pos, strand)
    for end, pos, strand in zip(df_['end'], df_['pos'], df_['strand'])
]
print(df_.shape[0])
In [6]:
fp_ = 'annot_eLife_revised/_fig/Fig2S5/outron-extended_protein_coding_genes.bed'
df_[['chrom', 'start_adj', 'end_adj', 'gene_id', 'score', 'strand']]\
.sort_values(['chrom', 'start_adj', 'end_adj', 'strand'])\
.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}