Small RNA binding sites

Now we select UTRs that contain ArcZ, DsrA, or RprA binding sites and relate them to the genes upregulated in stationary phase and to the UTRs dependent on BCM.


In [1]:
import pandas as pd
import numpy as np

import os
import sys

# Put the parent directory on sys.path (once) so the local `utils` package
# imported just below can be resolved.
utils_path = os.path.abspath('..')
if utils_path not in sys.path:
    sys.path.append(utils_path)
    
from utils.rpos import *

In [3]:
# Load one prediction table per sRNA (ArcZ, DsrA, RprA), in that order.
arcz_targets, dsra_targets, rpra_targets = [
    get_predictions(csv_path)
    for csv_path in ('../ref/arcZ.csv', '../ref/dsrA.csv', '../ref/rprA.csv')
]

In [5]:
arcz_targets.head()


Out[5]:
coord_3 coord_5 energy locus rank z_score
0 3348564 3348535 -38.31 b3209 1 -10.81
1 2672671 2672649 -26.59 b2544 2 -6.55
2 2640457 2640442 -21.41 b2516 3 -4.66
3 3933169 3933148 -20.38 b3749 4 -4.29
4 3278629 3278605 -20.32 b3133 5 -4.27

In [9]:
# Export each sRNA's prediction table as a BED file with the columns
# chrom, coord_5, coord_3, locus. `field_overrides` presumably fills the
# `chrom` column with the fixed reference name -- confirm in utils.rpos.
# NOTE(review): the intersect cell further down reads ../results/redux/*.bed,
# not the files written here -- confirm which set is the intended input.
for targets, bed_path in (
    (arcz_targets, '../results/arcz.bed'),
    (dsra_targets, '../results/dsra.bed'),
    (rpra_targets, '../results/rpra.bed'),
):
    # Fresh list/dict per call, exactly as in the original three calls.
    bed_from_df(targets, bed_path,
                fields=['chrom', 'coord_5', 'coord_3', 'locus'],
                field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})

In [7]:
import pybedtools

In [38]:
def fix_bed(bedfile, newfile):
    """Copy a BED file, swapping start and end wherever start > end.

    Parameters
    ----------
    bedfile : str
        Path of the tab-separated BED file to read.
    newfile : str
        Path of the corrected copy to write (overwritten if it exists).

    Notes
    -----
    The line terminator is split off before the columns are examined, so a
    swap on a 3-column record no longer drags the newline into the middle of
    the line. Lines with fewer than three tab-separated columns (blank lines,
    space-separated ``track``/``browser`` headers) and ``#`` comments are
    copied through unchanged instead of raising ``IndexError``.

    Raises
    ------
    ValueError
        If the second or third column of a data record is not an integer.
    """
    with open(bedfile) as fi, open(newfile, 'w') as fo:
        for line in fi:
            record = line.rstrip('\r\n')
            eol = line[len(record):]  # keep whatever terminator the line had
            fields = record.split('\t')
            if len(fields) < 3 or record.startswith('#'):
                # Not an interval record: pass it through untouched.
                fo.write(line)
                continue
            if int(fields[1]) > int(fields[2]):
                fields[1], fields[2] = fields[2], fields[1]
            fo.write('\t'.join(fields) + eol)

In [39]:
# Write a copy of the 5' UTR annotation in which every record has start <= end.
fix_bed('../ref/5utrs.bed', '../ref/5utrs.fix.bed')

In [40]:
!head ../ref/5utrs.fix.bed


gi|556503834|ref|NC_000913.3|	148	190	+	thrL
gi|556503834|ref|NC_000913.3|	148	190	+	thrL
gi|556503834|ref|NC_000913.3|	5030	5234	+	yaaX
gi|556503834|ref|NC_000913.3|	6459	6587	-	yaaA
gi|556503834|ref|NC_000913.3|	6459	6615	-	yaaA
gi|556503834|ref|NC_000913.3|	7959	8017	-	yaaJ
gi|556503834|ref|NC_000913.3|	8191	8238	+	talB
gi|556503834|ref|NC_000913.3|	10643	10830	+	htgA
gi|556503834|ref|NC_000913.3|	10644	10830	+	htgA
gi|556503834|ref|NC_000913.3|	11356	11542	-	yaaW

In [21]:
# Load the corrected 5' UTR intervals and the per-sRNA site BED files.
# NOTE(review): the site files are read from ../results/redux/, whereas the
# export cell above writes ../results/*.bed -- confirm the redux files are the
# intended inputs and are regenerated on a fresh run.
utrs = pybedtools.BedTool('../ref/5utrs.fix.bed')
arcz = pybedtools.BedTool('../results/redux/arcz.bed')
dsra = pybedtools.BedTool('../results/redux/dsra.bed')
rpra = pybedtools.BedTool('../results/redux/rpra.bed')

# Intersect the UTRs with each sRNA's sites. With wa/wb (bedtools -wa -wb) each
# output line carries the full UTR record followed by the overlapping site
# record. The value of the last saveas() call is what the cell displays.
utrs.intersect(arcz, wa=True, wb=True).saveas('../results/redux/arcz_hits.all.bed')
utrs.intersect(dsra, wa=True, wb=True).saveas('../results/redux/dsra_hits.all.bed')
utrs.intersect(rpra, wa=True, wb=True).saveas('../results/redux/rpra_hits.all.bed')


Out[21]:
<BedTool(../../results/redux/rpra_hits.all.bed)>

In [23]:
!cat ../results/arcz_hits.all.bed | wc -l


589

In [24]:
!cat ../results/dsra_hits.all.bed | wc -l


597

In [25]:
!cat ../results/rpra_hits.all.bed | wc -l


604

In [26]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces from_csv's
# defaults, so the resulting frame is the same on pandas versions that had both.
genes = pd.read_csv('../results/d_offset200_win80_ratio2.csv', sep='\t',
                    index_col=0, parse_dates=True)

In [28]:
# Distinct values of the `gene` column; the cell displays how many there are.
gene_list = {gene for gene in genes['gene']}
len(gene_list)


Out[28]:
222

In [29]:
def genes_from_bed(bedfile):
    """Return the set of values found in the fifth column of a BED file.

    Parameters
    ----------
    bedfile : str
        Path of a tab-separated file whose fifth column holds the gene name.

    Returns
    -------
    set of str
        Distinct fifth-column values, with the line terminator removed.

    Notes
    -----
    The terminator is stripped before splitting, so a file whose fifth column
    is also its last column yields ``'thrL'`` rather than ``'thrL\\n'``.
    Lines with fewer than five columns (blank lines, headers) are skipped
    instead of raising ``IndexError``.
    """
    names = set()
    with open(bedfile) as fi:
        for line in fi:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 5:
                continue  # no gene column on this line
            names.add(fields[4])
    return names

In [30]:
# Genes from `gene_list` that also appear in the ArcZ hits file; display the count.
arcz_genes = gene_list.intersection(
    genes_from_bed('../results/redux/arcz_hits.all.bed')
)
len(arcz_genes)


Out[30]:
59

In [31]:
# Genes from `gene_list` that also appear in the DsrA hits file; display the count.
dsra_genes = gene_list.intersection(
    genes_from_bed('../results/redux/dsra_hits.all.bed')
)
len(dsra_genes)


Out[31]:
61

In [32]:
# Genes from `gene_list` that also appear in the RprA hits file; display the count.
rpra_genes = gene_list.intersection(
    genes_from_bed('../results/redux/rpra_hits.all.bed')
)
len(rpra_genes)


Out[32]:
46

In [33]:
arcz_genes


Out[33]:
{'alx',
 'astE',
 'cspH',
 'dcuR',
 'dxr',
 'entD',
 'fic',
 'fimB',
 'ftsK',
 'gntR',
 'ibpB',
 'ihfA',
 'leuU',
 'minE',
 'mobA',
 'motA',
 'oxc',
 'pgpB',
 'purT',
 'puuP',
 'pyrE',
 'rpoS',
 'rsmD',
 'rsmI',
 'slyX',
 'tgt',
 'thiM',
 'waaU',
 'yaeF',
 'yaiS',
 'yajI',
 'ybbY',
 'ybcN',
 'ybdR',
 'ybhH',
 'ycaI',
 'ycaL',
 'yceF',
 'yceO',
 'ydaE',
 'ydfK',
 'yecH',
 'yegX',
 'yehK',
 'yfhR',
 'yfiM',
 'yfjM',
 'yghA',
 'yhiD',
 'yhjR',
 'yicO',
 'yjiA',
 'yjtD',
 'ymfJ',
 'ymiA',
 'ynfC',
 'yobB',
 'yqeC',
 'yqhC'}

In [34]:
dsra_genes


Out[34]:
{'alx',
 'astE',
 'creA',
 'dgkA',
 'ecpB',
 'elaA',
 'fic',
 'focA',
 'frdA',
 'hdfR',
 'ihfA',
 'insJ',
 'lysC',
 'macB',
 'minE',
 'moaA',
 'mutM',
 'nudC',
 'pgpB',
 'potF',
 'ppdD',
 'prc',
 'rimL',
 'rpoS',
 'rssA',
 'slyX',
 'smf',
 'speB',
 'waaY',
 'yaaX',
 'yahE',
 'yaiY',
 'ybcH',
 'ybcN',
 'ybeR',
 'ybhH',
 'yccX',
 'ydaS',
 'yedY',
 'yehK',
 'yehL',
 'yeiB',
 'yeiW',
 'yejK',
 'yfbO',
 'yfhR',
 'ygdG',
 'yghA',
 'ygiD',
 'yhbU',
 'yhiD',
 'yicO',
 'yihF',
 'yihO',
 'yjiA',
 'ykgB',
 'ymfJ',
 'ymgE',
 'ymiA',
 'ynfC',
 'yqgB'}

In [35]:
rpra_genes


Out[35]:
{'bamA',
 'creA',
 'crp',
 'cspH',
 'dcuR',
 'dmlA',
 'ecpB',
 'elfD',
 'fimB',
 'ftsK',
 'hofM',
 'ihfA',
 'mngA',
 'mobA',
 'mqsA',
 'nudC',
 'proP',
 'puuP',
 'rimL',
 'rpoS',
 'rsmI',
 'slyX',
 'waaQ',
 'waaY',
 'yahM',
 'yaiY',
 'ybcH',
 'ybcN',
 'ybdR',
 'ybeR',
 'ybhH',
 'yceO',
 'ycfJ',
 'ycjD',
 'yehK',
 'yeiB',
 'yeiW',
 'yejK',
 'yfiM',
 'yhbU',
 'yhcM',
 'yihL',
 'yobB',
 'yqeI',
 'yqgB',
 'yrbN'}

In [36]:
# Genes present in all three per-sRNA gene sets.
arcz_genes.intersection(dsra_genes, rpra_genes)


Out[36]:
{'ihfA', 'rpoS', 'slyX', 'ybcN', 'ybhH', 'yehK'}

In [37]:
# Number of genes present in at least one of the three per-sRNA gene sets.
len(arcz_genes.union(dsra_genes, rpra_genes))


Out[37]:
117

In [ ]: