In [1]:
    
import pandas as pd
import numpy as np
import os
import sys
utils_path = os.path.abspath(os.path.join('..'))
if utils_path not in sys.path:
    sys.path.append(utils_path)
    
from utils.rpos import *
    
In [3]:
    
arcz_targets = get_predictions('../ref/arcZ.csv')
dsra_targets = get_predictions('../ref/dsrA.csv')
rpra_targets = get_predictions('../ref/rprA.csv')
    
In [5]:
    
arcz_targets.head()
    
    Out[5]:
In [9]:
    
bed_from_df(arcz_targets, '../results/arcz.bed',
            fields=['chrom', 'coord_5', 'coord_3', 'locus'],
            field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
bed_from_df(dsra_targets, '../results/dsra.bed',
            fields=['chrom', 'coord_5', 'coord_3', 'locus'],
            field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
bed_from_df(rpra_targets, '../results/rpra.bed',
            fields=['chrom', 'coord_5', 'coord_3', 'locus'],
            field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
    
In [7]:
    
import pybedtools
    
In [38]:
    
def fix_bed(bedfile, newfile):
    with open(bedfile) as fi, open(newfile, 'w') as fo:
        for line in fi:
            fields = line.split('\t')
            if int(fields[1]) > int(fields[2]):
                fields[1], fields[2] = fields[2], fields[1]
            fo.write('\t'.join(fields))
    
In [39]:
    
fix_bed('../ref/5utrs.bed', '../ref/5utrs.fix.bed')
    
In [40]:
    
!head ../ref/5utrs.fix.bed
    
    
In [21]:
    
utrs = pybedtools.BedTool('../ref/5utrs.fix.bed')
arcz = pybedtools.BedTool('../results/redux/arcz.bed')
dsra = pybedtools.BedTool('../results/redux/dsra.bed')
rpra = pybedtools.BedTool('../results/redux/rpra.bed')
utrs.intersect(arcz, wa=True, wb=True).saveas('../results/redux/arcz_hits.all.bed')
utrs.intersect(dsra, wa=True, wb=True).saveas('../results/redux/dsra_hits.all.bed')
utrs.intersect(rpra, wa=True, wb=True).saveas('../results/redux/rpra_hits.all.bed')
    
    Out[21]:
In [23]:
    
!cat ../results/arcz_hits.all.bed | wc -l
    
    
In [24]:
    
!cat ../results/dsra_hits.all.bed | wc -l
    
    
In [25]:
    
!cat ../results/rpra_hits.all.bed | wc -l
    
    
In [26]:
    
genes = pd.DataFrame.from_csv('../results/d_offset200_win80_ratio2.csv', sep='\t')
    
In [28]:
    
gene_list = set(genes['gene'])
len(gene_list)
    
    Out[28]:
In [29]:
    
def genes_from_bed(bedfile):
    genes = []
    with open(bedfile) as fi:
        for line in fi:
            fields = line.split('\t')
            genes.append(fields[4])
    return set(genes)
    
In [30]:
    
arcz_genes = gene_list & genes_from_bed('../results/redux/arcz_hits.all.bed')
len(arcz_genes)
    
    Out[30]:
In [31]:
    
dsra_genes = gene_list & genes_from_bed('../results/redux/dsra_hits.all.bed')
len(dsra_genes)
    
    Out[31]:
In [32]:
    
rpra_genes = gene_list & genes_from_bed('../results/redux/rpra_hits.all.bed')
len(rpra_genes)
    
    Out[32]:
In [33]:
    
arcz_genes
    
    Out[33]:
In [34]:
    
dsra_genes
    
    Out[34]:
In [35]:
    
rpra_genes
    
    Out[35]:
In [36]:
    
arcz_genes & dsra_genes & rpra_genes
    
    Out[36]:
In [37]:
    
len(arcz_genes | dsra_genes | rpra_genes )
    
    Out[37]:
In [ ]: