In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
def vp(fp):
    """Return the "verbose path" of `fp`: the name joined onto this figure's output directory.

    The directory is a relative path, so the result is resolved against the
    current working directory of the kernel.
    """
    fig_dir = 'annot_eLife_revised/_fig/Fig2S5'
    return os.path.join(fig_dir, fp)
In [3]:
# Load the source-data table. `df_source_data_` is not defined in this notebook
# (presumably it comes from the `%run` of `__init__.ipynb` above — TODO confirm).
# Later cells read its BED3 coordinate columns (`yp.NAMES_BED3`) and the
# per-strand annotation columns `annot_fwd` / `annot_rev`.
df_regl = df_source_data_()
In [4]:
# Regions that the elements in df_regl are intersected with in the next cell
# (by file name: outron-extended protein-coding genes — TODO confirm how the file was produced).
# NOTE(review): `pd.read_csv` treats the first line as a header by default; if this
# BED file has no header row, its first region is consumed as column names — confirm.
fp_ = vp('outron-extended_protein_coding_genes.bed')
df_regions = pd.read_csv(fp_, sep='\t')
In [5]:
# For every row of df_regl (BED3 columns only), summarise the overlapping rows of
# df_regions with `bedtools map`. Three summary columns are appended and then
# labelled with the last three names of yp.NAMES_BED6 (name, score, strand):
#   name   <- `count` over column 4 of df_regions
#   score  <- `count` over column 5 of df_regions
#   strand <- `distinct` values of column 6 of df_regions
# NOTE(review): bedtools map expects both inputs to be position-sorted — confirm.
bt_regl_ = BedTool.from_dataframe(df_regl[yp.NAMES_BED3])
bt_regions_ = BedTool.from_dataframe(df_regions)
df_regl_gene = bt_regl_.map(
    b=bt_regions_.fn,
    c='4,5,6',
    o='count,count,distinct',
).to_dataframe(names=yp.NAMES_BED6)
In [6]:
def type_(annot_maj, annot_min, strand_maj, count_gene, strand_gene):
    """Sub-classify one strand of an element whose annotation is "unassigned_promoter".

    Parameters
    ----------
    annot_maj : annotation on the strand being classified.
    annot_min : annotation on the other strand of the same element.
    strand_maj : '+' or '-', the strand being classified.
    count_gene : number of gene regions overlapping the element.
    strand_gene : strand(s) of the overlapping gene region(s).

    Returns
    -------
    '.'                       if `annot_maj` is not "unassigned_promoter";
    'PROMPT_uaRNA'            exactly one overlapping gene, on the opposite strand,
                              and `annot_min` is 'coding_promoter';
    'genic_region_antisense'  exactly one overlapping gene, on the opposite strand,
                              and `annot_min` is anything else;
    'intergenic'              no overlapping gene;
    'other'                   every remaining case.
    """
    # Only unassigned promoters are sub-classified.
    if annot_maj != "unassigned_promoter":
        return '.'

    runs_antisense = (strand_maj, strand_gene) in (('+', '-'), ('-', '+'))
    if (count_gene == 1) and runs_antisense:
        return 'PROMPT_uaRNA' if annot_min == 'coding_promoter' else 'genic_region_antisense'

    if count_gene == 0:
        return 'intergenic'
    return 'other'
# Classify each element once per strand. df_regl and df_regl_gene are paired purely
# by position (row i with row i), so make that assumption explicit before using it.
assert len(df_regl_gene) == len(df_regl), 'df_regl_gene is not row-aligned with df_regl'

# Forward strand: annot_fwd is the annotation being classified, annot_rev the other strand's.
df_regl['unassigned_promoter_type_fwd'] = [* map(
    type_,
    df_regl['annot_fwd'], df_regl['annot_rev'],
    itertools.repeat('+', len(df_regl)),
    df_regl_gene['name'],    # count of overlapping gene regions
    df_regl_gene['strand'],  # distinct strand(s) of the overlapping gene regions
)]
# Reverse strand: same call with the two annotations swapped.
df_regl['unassigned_promoter_type_rev'] = [* map(
    type_,
    df_regl['annot_rev'], df_regl['annot_fwd'],
    itertools.repeat('-', len(df_regl)),
    df_regl_gene['name'],
    df_regl_gene['strand'],
)]

# Tally the types over both strands together ('.' marks strands that are not unassigned promoters).
df_ = pd.concat([df_regl['unassigned_promoter_type_fwd'], df_regl['unassigned_promoter_type_rev']]).value_counts()
print(df_)

# reindex instead of df_[[...]]: a type that never occurs counts as 0 rather than
# raising KeyError (or yielding NaN on older pandas).
types_ = ['PROMPT_uaRNA', 'genic_region_antisense', 'intergenic', 'other']
print('Total: ', df_.reindex(types_, fill_value=0).values.sum())
In [7]:
# Counts and percentages of the four unassigned-promoter types (both strands pooled in df_).
# reindex instead of df_[[...]]: a type that never occurs is reported with a count of 0
# rather than raising KeyError (or yielding NaN on older pandas).
types_ = ['PROMPT_uaRNA', 'genic_region_antisense', 'intergenic', 'other']
df_pct_ = df_.reindex(types_, fill_value=0).to_frame('counts')
df_pct_['pct'] = 100*df_pct_['counts'] / df_pct_['counts'].sum()
df_pct_ = df_pct_.round(1)  # one decimal for the percentages; integer counts are unaffected
df_pct_.to_csv(vp('Fig2S5_unknown_promoter_type.tsv'), sep='\t')
df_pct_
Out[7]:
In [8]:
for type_ in df_regl['unassigned_promoter_type_fwd'].value_counts().sort_index().index.tolist():
if type_ == '.': continue
fp_ = vp('unassigned_promoter_fwd_%(type_)s.bed' % locals())
df_regl.query('unassigned_promoter_type_fwd == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
!wc -l {fp_}
for type_ in df_regl['unassigned_promoter_type_rev'].value_counts().sort_index().index.tolist():
if type_ == '.': continue
fp_ = vp('unassigned_promoter_rev_%(type_)s.bed' % locals())
df_regl.query('unassigned_promoter_type_rev == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
!wc -l {fp_}
In [13]:
col_fwd_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_fwd']
col_rev_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_rev']
q_fwd_ = 'unassigned_promoter_type_fwd != "."'
q_rev_ = 'unassigned_promoter_type_rev != "."'
df_regl[col_fwd_].query(q_fwd_).to_csv(vp('Fig2S5_unassigned_promoter_type_fwd.bed'), sep='\t', index=None, header=None)
df_regl[col_rev_].query(q_rev_).to_csv(vp('Fig2S5_unassigned_promoter_type_rev.bed'), sep='\t', index=None, header=None)
!wc -l {vp('Fig2S5_unassigned_promoter_type_fwd.bed')}
!wc -l {vp('Fig2S5_unassigned_promoter_type_rev.bed')}
In [ ]: