In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
def vp(fp):
    """Return the "verbose path" of `fp`: the file name placed under this figure's output directory.

    fp -- file name (or relative path) to place under the Fig2S5 directory.
    Returns the joined path as a string; os.path.join semantics apply.
    """
    fig_dir = 'annot_eLife_revised/_fig/Fig2S5'
    return os.path.join(fig_dir, fp)

In [3]:
# Table that the rest of this notebook classifies and exports. Later cells read its
# 'annot_fwd' / 'annot_rev' columns and its BED3 coordinates (yp.NAMES_BED3), and add
# 'unassigned_promoter_type_fwd' / '_rev' columns to it in place.
# NOTE(review): df_source_data_ is not defined in this notebook -- presumably it comes
# from the %run'd __init__ notebook; confirm what it loads.
df_regl = df_source_data_()

In [4]:
# Gene regions used as the `b` file of the bedtools map step below.
# NOTE(review): read_csv is called without header=None, so the first line of the file is
# consumed as column names -- confirm the .bed file really starts with a header row,
# otherwise the first region is silently dropped.
df_regions = pd.read_csv(
    vp('outron-extended_protein_coding_genes.bed'),
    sep='\t',
)

In [5]:
# For every interval of df_regl, summarise the overlapping gene regions with bedtools map:
#   column 4 -> count, column 5 -> count, column 6 -> distinct values.
# The three summaries are read back under the BED6 names, so in df_regl_gene
# 'name' and 'score' hold overlap counts and 'strand' holds the distinct overlapping strands.
bt_regl_ = BedTool.from_dataframe(df_regl[yp.NAMES_BED3])
bt_regions_ = BedTool.from_dataframe(df_regions)
bt_regl_gene_ = bt_regl_.map(
    b=bt_regions_.fn,
    c='4,5,6',
    o='count,count,distinct',
)
df_regl_gene = bt_regl_gene_.to_dataframe(names=yp.NAMES_BED6)

In [6]:
def type_(annot_maj, annot_min, strand_maj, count_gene, strand_gene):
    """Label one strand of a regulatory element whose annotation is "unassigned_promoter".

    annot_maj   -- annotation on the strand being classified
    annot_min   -- annotation on the opposite strand
    strand_maj  -- '+' or '-', the strand being classified
    count_gene  -- number of gene regions overlapping the element
    strand_gene -- strand(s) of the overlapping gene regions

    Returns:
      '.'                       annot_maj is not "unassigned_promoter"
      'PROMPT_uaRNA'            exactly one overlapping gene, on the opposite strand,
                                and annot_min is 'coding_promoter'
      'genic_region_antisense'  exactly one overlapping gene, on the opposite strand,
                                any other annot_min
      'intergenic'              no overlapping gene
      'other'                   everything else (same-strand gene, several genes, ...)
    """
    # Only unassigned promoters get a sub-type; everything else gets the placeholder.
    if annot_maj != "unassigned_promoter":
        return '.'

    # (element strand, gene strand) combinations that count as antisense.
    antisense_pairs = (('+', '-'), ('-', '+'))
    if count_gene == 1 and (strand_maj, strand_gene) in antisense_pairs:
        return 'PROMPT_uaRNA' if annot_min == 'coding_promoter' else 'genic_region_antisense'

    if count_gene == 0:
        return 'intergenic'
    return 'other'

# Classify every element once per strand. The forward call treats annot_fwd as the
# strand under test ('+') and annot_rev as the opposite strand; the reverse call swaps them.
#
# df_regl and df_regl_gene are paired purely by row position, and map() stops at the
# shortest input, so a longer df_regl_gene would be truncated silently. Fail loudly instead.
assert len(df_regl_gene) == len(df_regl), 'df_regl_gene is not row-aligned with df_regl'

n_regl_ = len(df_regl)
df_regl['unassigned_promoter_type_fwd'] = [* map(type_, df_regl['annot_fwd'], df_regl['annot_rev'],
    itertools.repeat('+', n_regl_), df_regl_gene['name'], df_regl_gene['strand']) ]
df_regl['unassigned_promoter_type_rev'] = [* map(type_, df_regl['annot_rev'], df_regl['annot_fwd'],
    itertools.repeat('-', n_regl_), df_regl_gene['name'], df_regl_gene['strand']) ]

# Label counts pooled over both strands ('.' marks strands that are not unassigned promoters).
df_ = pd.concat([df_regl['unassigned_promoter_type_fwd'], df_regl['unassigned_promoter_type_rev']]).value_counts()
print(df_)
# reindex (instead of df_[[...]]) so a label with zero members contributes 0 rather than raising KeyError.
print('Total: ', df_.reindex(['PROMPT_uaRNA',  'genic_region_antisense', 'intergenic', 'other'], fill_value=0).values.sum())


.                         81384
PROMPT_uaRNA               1194
genic_region_antisense     1090
intergenic                  674
other                       148
dtype: int64
Total:  3106

In [7]:
# Counts and percentages of the four unassigned-promoter sub-types (the '.' placeholder
# is excluded), rounded to one decimal, written as TSV and displayed.
# NOTE(review): the output file says "unknown" while every other output of this notebook
# says "unassigned"; the path is kept unchanged because other code may read it.
promoter_types_ = ['PROMPT_uaRNA',  'genic_region_antisense', 'intergenic', 'other']
df_pct_ = (
    df_[promoter_types_]
    .to_frame('counts')
    .assign(pct=lambda d: 100*d['counts'] / d['counts'].sum())
    .round(1)
)
df_pct_.to_csv(vp('Fig2S5_unknown_promoter_type.tsv'), sep='\t')
df_pct_


Out[7]:
counts pct
PROMPT_uaRNA 1194 38.4
genic_region_antisense 1090 35.1
intergenic 674 21.7
other 148 4.8

In [8]:
for type_ in df_regl['unassigned_promoter_type_fwd'].value_counts().sort_index().index.tolist():
    if type_ == '.': continue
    fp_ = vp('unassigned_promoter_fwd_%(type_)s.bed' % locals())
    df_regl.query('unassigned_promoter_type_fwd == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
    !wc -l {fp_}

for type_ in df_regl['unassigned_promoter_type_rev'].value_counts().sort_index().index.tolist():
    if type_ == '.': continue
    fp_ = vp('unassigned_promoter_rev_%(type_)s.bed' % locals())
    df_regl.query('unassigned_promoter_type_rev == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
    !wc -l {fp_}


581 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_PROMPT_uaRNA.bed
524 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_genic_region_antisense.bed
338 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_intergenic.bed
65 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_other.bed
613 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_PROMPT_uaRNA.bed
566 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_genic_region_antisense.bed
336 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_intergenic.bed
83 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_other.bed

In [13]:
col_fwd_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_fwd']
col_rev_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_rev']
q_fwd_ = 'unassigned_promoter_type_fwd != "."'
q_rev_ = 'unassigned_promoter_type_rev != "."'
df_regl[col_fwd_].query(q_fwd_).to_csv(vp('Fig2S5_unassigned_promoter_type_fwd.bed'), sep='\t', index=None, header=None)
df_regl[col_rev_].query(q_rev_).to_csv(vp('Fig2S5_unassigned_promoter_type_rev.bed'), sep='\t', index=None, header=None)
!wc -l {vp('Fig2S5_unassigned_promoter_type_fwd.bed')}
!wc -l {vp('Fig2S5_unassigned_promoter_type_rev.bed')}


1508 annot_eLife_revised/_fig/Fig2S5/Fig2S5_unassigned_promoter_type_fwd.bed
1598 annot_eLife_revised/_fig/Fig2S5/Fig2S5_unassigned_promoter_type_rev.bed

In [ ]: