notebook.community

Edit and run



In [1]:

    
import pandas as pd
import pybedtools
import os
import sys

utils_path = os.path.abspath(os.path.join('..'))
if utils_path not in sys.path:
    sys.path.append(utils_path)
    
from utils.rpos import *



In [2]:

    
# Clear pybedtools temporary files before starting the analysis.
# NOTE(review): per the pybedtools docs, remove_all=True also deletes temp
# files left behind by other sessions — confirm that is intended on a
# shared machine.
pybedtools.cleanup(remove_all=True)



In [3]:

    
# Coordinate span (0 .. 4641651) for the single sequence 'NC_000913.3'.
chromsizes = {'NC_000913.3': (0, 4641651)}

# Open the three sRNA interval files and the 5' UTR annotation as BedTools.
arcz, dsra, rpra, utr5 = (
    pybedtools.BedTool(bed_path)
    for bed_path in (
        '../results/arcz.fixed.bed',
        '../results/dsra.fixed.bed',
        '../results/rpra.fixed.bed',
        '../results/wt_tmut.utrs.corr.old.bed',
    )
)



In [4]:

    
# Intersect the UTR intervals with the arcZ intervals, keeping the columns
# of both records (wa/wb), and write the result to disk.
utr5_arcz = utr5.intersect(arcz, wa=True, wb=True)
utr5_arcz.saveas('../results/utr5_arcz.bed')

# Collect the distinct values of the fifth tab-separated column of the
# saved intersection (used below as the gene identifier).
with open('../results/utr5_arcz.bed') as bed_handle:
    arcz_genes = {row.strip().split('\t')[4] for row in bed_handle}

len(arcz_genes)









    Out[4]:





73



In [5]:

    
# Intersect the UTR intervals with the dsrA intervals, keeping the columns
# of both records (wa/wb), and write the result to disk.
utr5_dsra = utr5.intersect(dsra, wa=True, wb=True)
utr5_dsra.saveas('../results/utr5_dsra.bed')

# Collect the distinct values of the fifth tab-separated column of the
# saved intersection (used below as the gene identifier).
with open('../results/utr5_dsra.bed') as bed_handle:
    dsra_genes = {row.strip().split('\t')[4] for row in bed_handle}

len(dsra_genes)









    Out[5]:





83



In [6]:

    
# Intersect the UTR intervals with the rprA intervals, keeping the columns
# of both records (wa/wb), and write the result to disk.
utr5_rpra = utr5.intersect(rpra, wa=True, wb=True)
utr5_rpra.saveas('../results/utr5_rpra.bed')

# Collect the distinct values of the fifth tab-separated column of the
# saved intersection (used below as the gene identifier).
with open('../results/utr5_rpra.bed') as bed_handle:
    rpra_genes = {row.strip().split('\t')[4] for row in bed_handle}

len(rpra_genes)









    Out[6]:





66



In [7]:

    
# Read the gene list, one whitespace-stripped entry per line.
with open('../results/s1_s2_intersect.txt') as gene_handle:
    genes = {entry.strip() for entry in gene_handle}

len(genes)









    Out[7]:





223



In [8]:

    
# Number of listed genes that also appear among the arcZ overlaps.
len(genes.intersection(arcz_genes))









    Out[8]:





45



In [9]:

    
# Number of listed genes that also appear among the dsrA overlaps.
len(genes.intersection(dsra_genes))









    Out[9]:





54



In [10]:

    
# Number of listed genes that also appear among the rprA overlaps.
len(genes.intersection(rpra_genes))









    Out[10]:





42



In [11]:

    
# Listed genes present in both the arcZ and dsrA overlap sets.
len(genes.intersection(arcz_genes, dsra_genes))









    Out[11]:





14



In [12]:

    
# Listed genes present in both the arcZ and rprA overlap sets.
len(genes.intersection(arcz_genes, rpra_genes))









    Out[12]:





10



In [13]:

    
# Listed genes present in both the rprA and dsrA overlap sets.
len(genes.intersection(rpra_genes, dsra_genes))









    Out[13]:





13



In [15]:

    
# Show every arcZ-overlapping gene, blanking out those not in the gene list.
[name if name in genes else '' for name in arcz_genes]









    Out[15]:





['napF',
 '',
 '',
 'gabD',
 'yicO',
 'yodC',
 'metE',
 '',
 'yfgG',
 'acrE',
 'flhD',
 'xdhA',
 'pbpC',
 'dcuR',
 '',
 '',
 'ybgI',
 '',
 '',
 'pck',
 'nuoN',
 'gpsA',
 'yfiM',
 'hypF',
 'ynfC',
 '',
 '',
 '',
 'yhfL',
 'ybaY',
 'clpS',
 'cutC',
 '',
 'yqhA',
 'ykgC',
 'fimB',
 '',
 'mglA',
 'hcaT',
 'rutR',
 '',
 'tdcR',
 '',
 'ybbY',
 'ydcJ',
 'yahN',
 'ydcS',
 'yihV',
 'yjjB',
 '',
 '',
 '',
 'yphB',
 'astE',
 'yqeC',
 '',
 '',
 '',
 '',
 '',
 '',
 'ecpA',
 '',
 '',
 'rpoS',
 '',
 'fucP',
 '',
 'yobF',
 'yobB',
 'ygfS',
 '',
 'minE']



In [22]:

    
# Annotate every gene in `genes` with the sRNAs whose overlap set contains it.
#
# The genes are sorted once and that single ordering is used for both
# columns. Iterating a set of strings can yield a different order in each
# interpreter session (string hash randomization), so without sorting the
# rows of the exported table would not be reproducible between runs.
srna_targets = (
    ('arcZ', arcz_genes),
    ('dsrA', dsra_genes),
    ('rprA', rpra_genes),
)
gene_order = sorted(genes)

# One comma-joined label string per gene; empty when no sRNA set contains it.
res = [
    ','.join(label for label, targets in srna_targets if gene in targets)
    for gene in gene_order
]

d = {'genes': gene_order, 'sRNA': res}



In [24]:

    
# Build the gene / sRNA table from `d` and write it out as CSV.
# The output path previously had a doubled opening quote, which made this
# cell a SyntaxError.
df = pd.DataFrame.from_dict(d)
df.to_csv('../results/s1_s2_intersect.sRNA.csv')



In [ ]: