In [1]:
import pandas as pd
import pybedtools
import os
import sys

# Put the parent of the current working directory on the import path so the
# local `utils` package imported just below can be resolved.
utils_path = os.path.abspath('..')
if utils_path not in sys.path:
    sys.path.append(utils_path)
    
from utils.rpos import *

In [2]:
# Ask pybedtools to delete its temporary files before any new ones are made.
# NOTE(review): remove_all=True presumably also removes temp files left behind
# by other or earlier sessions - confirm against the pybedtools docs before
# running this alongside another pybedtools job.
pybedtools.cleanup(remove_all=True)

In [3]:
# Size table keyed by sequence accession: accession -> (start, end).
# NOTE(review): presumably the full length of the reference chromosome - confirm.
chromsizes = {'NC_000913.3': (0, 4641651)}

# Interval sets used by the cells below: one BED file per sRNA (arcZ, dsrA,
# rprA) plus the UTR table they are intersected with.
arcz, dsra, rpra, utr5 = (
    pybedtools.BedTool(bed_path)
    for bed_path in (
        '../results/arcz.fixed.bed',
        '../results/dsra.fixed.bed',
        '../results/rpra.fixed.bed',
        '../results/wt_tmut.utrs.corr.old.bed',
    )
)

In [4]:
# Keep every `utr5` record that overlaps an arcZ interval. With wa/wb each
# output line carries the `utr5` columns followed by the arcZ columns.
utr5_arcz = utr5.intersect(arcz, wa=True, wb=True)
utr5_arcz.saveas('../results/utr5_arcz.bed')

# Re-read the saved table and collect the distinct values of its fifth column.
# NOTE(review): column 5 is presumably the gene name - confirm against the
# layout of the UTR BED file.
with open('../results/utr5_arcz.bed') as bed_file:
    arcz_genes = {row.strip().split('\t')[4] for row in bed_file}

len(arcz_genes)


Out[4]:
73

In [5]:
# Keep every `utr5` record that overlaps a dsrA interval. With wa/wb each
# output line carries the `utr5` columns followed by the dsrA columns.
utr5_dsra = utr5.intersect(dsra, wa=True, wb=True)
utr5_dsra.saveas('../results/utr5_dsra.bed')

# Re-read the saved table and collect the distinct values of its fifth column.
# NOTE(review): column 5 is presumably the gene name - confirm against the
# layout of the UTR BED file.
with open('../results/utr5_dsra.bed') as bed_file:
    dsra_genes = {row.strip().split('\t')[4] for row in bed_file}

len(dsra_genes)


Out[5]:
83

In [6]:
# Keep every `utr5` record that overlaps an rprA interval. With wa/wb each
# output line carries the `utr5` columns followed by the rprA columns.
utr5_rpra = utr5.intersect(rpra, wa=True, wb=True)
utr5_rpra.saveas('../results/utr5_rpra.bed')

# Re-read the saved table and collect the distinct values of its fifth column.
# NOTE(review): column 5 is presumably the gene name - confirm against the
# layout of the UTR BED file.
with open('../results/utr5_rpra.bed') as bed_file:
    rpra_genes = {row.strip().split('\t')[4] for row in bed_file}

len(rpra_genes)


Out[6]:
66

In [7]:
# Load the gene list: one entry per line, surrounding whitespace removed.
# NOTE(review): a blank line in the file would put '' into the set - confirm
# the input has none.
with open('../results/s1_s2_intersect.txt') as gene_file:
    genes = {entry.strip() for entry in gene_file}

len(genes)


Out[7]:
223

In [8]:
# How many of the listed genes are also in the arcZ overlap set.
len(genes.intersection(arcz_genes))


Out[8]:
45

In [9]:
# How many of the listed genes are also in the dsrA overlap set.
len(genes.intersection(dsra_genes))


Out[9]:
54

In [10]:
# How many of the listed genes are also in the rprA overlap set.
len(genes.intersection(rpra_genes))


Out[10]:
42

In [11]:
# Listed genes present in both the arcZ and the dsrA overlap sets.
len(genes.intersection(arcz_genes, dsra_genes))


Out[11]:
14

In [12]:
# Listed genes present in both the arcZ and the rprA overlap sets.
len(genes.intersection(arcz_genes, rpra_genes))


Out[12]:
10

In [13]:
# Listed genes present in both the rprA and the dsrA overlap sets.
len(genes.intersection(rpra_genes, dsra_genes))


Out[13]:
13

In [15]:
# Walk the arcZ overlap set and show each name that is also in `genes`;
# names that are not are shown as an empty string placeholder.
['' if name not in genes else name for name in arcz_genes]


Out[15]:
['napF',
 '',
 '',
 'gabD',
 'yicO',
 'yodC',
 'metE',
 '',
 'yfgG',
 'acrE',
 'flhD',
 'xdhA',
 'pbpC',
 'dcuR',
 '',
 '',
 'ybgI',
 '',
 '',
 'pck',
 'nuoN',
 'gpsA',
 'yfiM',
 'hypF',
 'ynfC',
 '',
 '',
 '',
 'yhfL',
 'ybaY',
 'clpS',
 'cutC',
 '',
 'yqhA',
 'ykgC',
 'fimB',
 '',
 'mglA',
 'hcaT',
 'rutR',
 '',
 'tdcR',
 '',
 'ybbY',
 'ydcJ',
 'yahN',
 'ydcS',
 'yihV',
 'yjjB',
 '',
 '',
 '',
 'yphB',
 'astE',
 'yqeC',
 '',
 '',
 '',
 '',
 '',
 '',
 'ecpA',
 '',
 '',
 'rpoS',
 '',
 'fucP',
 '',
 'yobF',
 'yobB',
 'ygfS',
 '',
 'minE']

In [22]:
# Label every gene in `genes` with the sRNAs whose overlap set contains it.
# Labels are joined with ',' in the fixed order arcZ, dsrA, rprA; a gene in
# none of the sets gets an empty string.
srna_gene_sets = (
    ('arcZ', arcz_genes),
    ('dsrA', dsra_genes),
    ('rprA', rpra_genes),
)

# Iterate in sorted order rather than raw set order: set iteration order for
# strings can differ between interpreter sessions, which would shuffle the
# rows of the saved table from one run to the next.
gene_order = sorted(genes)

res = [
    ','.join(label for label, members in srna_gene_sets if gene in members)
    for gene in gene_order
]

# Column-oriented mapping: both lists are parallel and follow `gene_order`.
d = {'genes': gene_order, 'sRNA': res}

In [24]:
# Turn the column mapping `d` into a table and save it with the other results.
# The path literal previously began with a stray extra quote (''../results/...),
# which is a SyntaxError, so this cell could not run as written.
df = pd.DataFrame.from_dict(d)
df.to_csv('../results/s1_s2_intersect.sRNA.csv')

In [ ]: