notebook.community

Edit and run



In [1]:

    
%run ~/relmapping/annot/notebooks/__init__.ipynb









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping



In [2]:

    
def vp(fp):
    """Return `fp` joined onto this figure's output directory ("verbose path")."""
    fig_dir = 'annot_eLife_revised/_fig/Fig2S5'
    return os.path.join(fig_dir, fp)



In [3]:

    
# Load the table that the rest of this notebook annotates. Later cells read its
# 'annot_fwd' / 'annot_rev' columns and the columns listed in yp.NAMES_BED3.
# NOTE(review): df_source_data_ is not defined in this notebook -- presumably it is
# provided by the __init__ notebook executed with %run in the first cell; confirm.
df_regl = df_source_data_()



In [4]:

    
# Read the gene regions (tab-separated) that regulatory elements are overlapped with below.
# NOTE(review): pd.read_csv is called with its default header handling, so the first
# line of the file is used as column names -- confirm the .bed file has a header row,
# otherwise the first region is silently consumed as the header.
fp_ = vp('outron-extended_protein_coding_genes.bed')
df_regions = pd.read_csv(fp_, sep='\t')



In [5]:

    
# For every interval of df_regl (yp.NAMES_BED3 columns only), summarise the overlapping
# rows of df_regions with bedtools map: count of column 4, count of column 5, and the
# distinct values of column 6.
# NB: the result is read back using the yp.NAMES_BED6 column names, so in df_regl_gene
# the 'name' and 'score' columns hold overlap *counts* and 'strand' holds the distinct
# value(s) of column 6 -- they do not describe a single gene.
# NOTE(review): assumes columns 4-6 of df_regions are name, score, strand (as the inline
# comment states) -- confirm against the input file.
df_regl_gene = BedTool.from_dataframe(df_regl[yp.NAMES_BED3]).map(
    b=BedTool.from_dataframe(df_regions).fn,
    c='4,5,6', o='count,count,distinct', # name, score, strand
).to_dataframe(names=yp.NAMES_BED6)



In [6]:

    
def type_(annot_maj, annot_min, strand_maj, count_gene, strand_gene):
    """Classify one strand of a regulatory element whose annotation is "unassigned_promoter".

    Parameters
    ----------
    annot_maj : annotation on the strand being classified.
    annot_min : annotation on the opposite strand.
    strand_maj : '+' or '-', the strand being classified.
    count_gene : number of gene regions overlapping the element.
    strand_gene : strand value reported for the overlapping gene region(s).

    Returns
    -------
    '.' when `annot_maj` is not "unassigned_promoter"; otherwise one of
    'PROMPT_uaRNA', 'genic_region_antisense', 'intergenic' or 'other'.
    """
    if annot_maj != "unassigned_promoter":
        return '.'

    # Exactly one overlapping gene, lying on the strand opposite to `strand_maj`.
    opposite_strands = (strand_maj, strand_gene) in (('+', '-'), ('-', '+'))
    if count_gene == 1 and opposite_strands:
        # A coding promoter on the other strand distinguishes PROMPT/uaRNA
        # from plain antisense transcription within a gene region.
        return 'PROMPT_uaRNA' if annot_min == 'coding_promoter' else 'genic_region_antisense'

    if count_gene == 0:
        return 'intergenic'
    return 'other'

# Classify both strands of every element. Each spec is
# (column suffix, strand being classified, same-strand annotation column, opposite-strand annotation column).
# NB: df_regl_gene['name'] is passed as the gene *count* and df_regl_gene['strand'] as the gene strand.
for suffix_, strand_, col_maj_, col_min_ in [
    ('fwd', '+', 'annot_fwd', 'annot_rev'),
    ('rev', '-', 'annot_rev', 'annot_fwd'),
]:
    df_regl['unassigned_promoter_type_' + suffix_] = [
        type_(maj_, min_, strand_, n_gene_, strand_gene_)
        for maj_, min_, n_gene_, strand_gene_ in zip(
            df_regl[col_maj_], df_regl[col_min_], df_regl_gene['name'], df_regl_gene['strand'])
    ]

# Pool the fwd and rev classifications and tally them ('.' = strand is not an unassigned promoter).
df_ = pd.concat([
    df_regl['unassigned_promoter_type_fwd'],
    df_regl['unassigned_promoter_type_rev'],
]).value_counts()
print(df_)
l_types_ = ['PROMPT_uaRNA', 'genic_region_antisense', 'intergenic', 'other']
print('Total: ', df_[l_types_].values.sum())









    



.                         81384
PROMPT_uaRNA               1194
genic_region_antisense     1090
intergenic                  674
other                       148
dtype: int64
Total:  3106



In [7]:

    
# Counts and percentages (rounded to one decimal) of the four unassigned-promoter
# classes; the table is written as TSV and shown as the cell output.
l_types_ = ['PROMPT_uaRNA', 'genic_region_antisense', 'intergenic', 'other']
df_pct_ = (
    df_[l_types_]
    .to_frame('counts')
    .assign(pct=lambda d_: 100 * d_['counts'] / d_['counts'].sum())
    .round(1)
)
df_pct_.to_csv(vp('Fig2S5_unknown_promoter_type.tsv'), sep='\t')
df_pct_









    Out[7]:







  
    
      
      counts
      pct
    
  
  
    
      PROMPT_uaRNA
      1194
      38.4
    
    
      genic_region_antisense
      1090
      35.1
    
    
      intergenic
      674
      21.7
    
    
      other
      148
      4.8



In [8]:

    
for type_ in df_regl['unassigned_promoter_type_fwd'].value_counts().sort_index().index.tolist():
    if type_ == '.': continue
    fp_ = vp('unassigned_promoter_fwd_%(type_)s.bed' % locals())
    df_regl.query('unassigned_promoter_type_fwd == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
    !wc -l {fp_}

for type_ in df_regl['unassigned_promoter_type_rev'].value_counts().sort_index().index.tolist():
    if type_ == '.': continue
    fp_ = vp('unassigned_promoter_rev_%(type_)s.bed' % locals())
    df_regl.query('unassigned_promoter_type_rev == "%(type_)s"' % locals())[yp.NAMES_BED3].to_csv(fp_, sep='\t', index=False, header=False)
    !wc -l {fp_}









    



581 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_PROMPT_uaRNA.bed
524 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_genic_region_antisense.bed
338 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_intergenic.bed
65 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_fwd_other.bed
613 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_PROMPT_uaRNA.bed
566 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_genic_region_antisense.bed
336 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_intergenic.bed
83 annot_eLife_revised/_fig/Fig2S5/unassigned_promoter_rev_other.bed



In [13]:

    
col_fwd_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_fwd']
col_rev_ = ['chrom', 'start', 'end', 'unassigned_promoter_type_rev']
q_fwd_ = 'unassigned_promoter_type_fwd != "."'
q_rev_ = 'unassigned_promoter_type_rev != "."'
df_regl[col_fwd_].query(q_fwd_).to_csv(vp('Fig2S5_unassigned_promoter_type_fwd.bed'), sep='\t', index=None, header=None)
df_regl[col_rev_].query(q_rev_).to_csv(vp('Fig2S5_unassigned_promoter_type_rev.bed'), sep='\t', index=None, header=None)
!wc -l {vp('Fig2S5_unassigned_promoter_type_fwd.bed')}
!wc -l {vp('Fig2S5_unassigned_promoter_type_rev.bed')}









    



1508 annot_eLife_revised/_fig/Fig2S5/Fig2S5_unassigned_promoter_type_fwd.bed
1598 annot_eLife_revised/_fig/Fig2S5/Fig2S5_unassigned_promoter_type_rev.bed



In [ ]:

	counts	pct
PROMPT_uaRNA	1194	38.4
genic_region_antisense	1090	35.1
intergenic	674	21.7
other	148	4.8