In [1]:
import pandas as pd
import numpy as np
import os
import sys
utils_path = os.path.abspath(os.path.join('..'))
if utils_path not in sys.path:
sys.path.append(utils_path)
from utils.rpos import *
In [3]:
arcz_targets = get_predictions('../ref/arcZ.csv')
dsra_targets = get_predictions('../ref/dsrA.csv')
rpra_targets = get_predictions('../ref/rprA.csv')
In [5]:
arcz_targets.head()
Out[5]:
In [9]:
bed_from_df(arcz_targets, '../results/arcz.bed',
fields=['chrom', 'coord_5', 'coord_3', 'locus'],
field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
bed_from_df(dsra_targets, '../results/dsra.bed',
fields=['chrom', 'coord_5', 'coord_3', 'locus'],
field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
bed_from_df(rpra_targets, '../results/rpra.bed',
fields=['chrom', 'coord_5', 'coord_3', 'locus'],
field_overrides={'chrom': 'gi|556503834|ref|NC_000913.3|'})
In [7]:
import pybedtools
In [38]:
def fix_bed(bedfile, newfile):
with open(bedfile) as fi, open(newfile, 'w') as fo:
for line in fi:
fields = line.split('\t')
if int(fields[1]) > int(fields[2]):
fields[1], fields[2] = fields[2], fields[1]
fo.write('\t'.join(fields))
In [39]:
fix_bed('../ref/5utrs.bed', '../ref/5utrs.fix.bed')
In [40]:
!head ../ref/5utrs.fix.bed
In [21]:
utrs = pybedtools.BedTool('../ref/5utrs.fix.bed')
arcz = pybedtools.BedTool('../results/redux/arcz.bed')
dsra = pybedtools.BedTool('../results/redux/dsra.bed')
rpra = pybedtools.BedTool('../results/redux/rpra.bed')
utrs.intersect(arcz, wa=True, wb=True).saveas('../results/redux/arcz_hits.all.bed')
utrs.intersect(dsra, wa=True, wb=True).saveas('../results/redux/dsra_hits.all.bed')
utrs.intersect(rpra, wa=True, wb=True).saveas('../results/redux/rpra_hits.all.bed')
Out[21]:
In [23]:
!cat ../results/arcz_hits.all.bed | wc -l
In [24]:
!cat ../results/dsra_hits.all.bed | wc -l
In [25]:
!cat ../results/rpra_hits.all.bed | wc -l
In [26]:
genes = pd.DataFrame.from_csv('../results/d_offset200_win80_ratio2.csv', sep='\t')
In [28]:
gene_list = set(genes['gene'])
len(gene_list)
Out[28]:
In [29]:
def genes_from_bed(bedfile):
genes = []
with open(bedfile) as fi:
for line in fi:
fields = line.split('\t')
genes.append(fields[4])
return set(genes)
In [30]:
arcz_genes = gene_list & genes_from_bed('../results/redux/arcz_hits.all.bed')
len(arcz_genes)
Out[30]:
In [31]:
dsra_genes = gene_list & genes_from_bed('../results/redux/dsra_hits.all.bed')
len(dsra_genes)
Out[31]:
In [32]:
rpra_genes = gene_list & genes_from_bed('../results/redux/rpra_hits.all.bed')
len(rpra_genes)
Out[32]:
In [33]:
arcz_genes
Out[33]:
In [34]:
dsra_genes
Out[34]:
In [35]:
rpra_genes
Out[35]:
In [36]:
arcz_genes & dsra_genes & rpra_genes
Out[36]:
In [37]:
len(arcz_genes | dsra_genes | rpra_genes )
Out[37]:
In [ ]: