In [1]:
import re
import bitarray
In [2]:
!ls
In [3]:
orfs = []
interval_re = re.compile(r">(.+) \[([0-9]+) - ([0-9]+)\]")
with open("d9539_asm_v1.2_orf_filt.fa") as fh:
for line in fh:
if line.startswith(">"): #fasta header line
hit = interval_re.search(line)
#assumes regex will always match, otherwise it will throw error
orfs.append( (hit.group(1),
int(hit.group(2)),
int(hit.group(3)),
"-" if "REVERSE SENSE" in line else "+")
)
In [4]:
orfs
Out[4]:
In [5]:
with open("d9539_asm_v1.2_emboss_orf1.gff","w") as out_fh:
for orf in orfs:
line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( "d9539_asm_v1.2",
"EMBOSS_getorf",
"CDS",
orf[1] if orf[1] < orf[2] else orf[2],
orf[2] if orf[1] < orf[2] else orf[1],
".",
orf[3],
0,
"ID={}".format(orf[0])
)
out_fh.write(line)
In [6]:
#invert rev cmp pairs
for idx, pair in enumerate(orfs):
if pair[2] < pair[1]:
orfs[idx] = (pair[0],pair[2],pair[1],pair[3])
In [7]:
ctg_pos = bitarray.bitarray(5752)
ctg_pos.setall(False)
In [8]:
for pair in orfs:
if pair[1] < 5752:
ctg_pos[pair[1]:(pair[2]+1)] = True
else:
ctg_pos[pair[1]:ctg_len] = True
ctg_pos[0:((pair[2]%ctg_len)+1)] = True
In [9]:
ctg_pos
Out[9]:
In [ ]: