In [24]:
from Bio import SeqIO, Seq
import gzip
from os import listdir
Create fasta files
In [29]:
for file in listdir("./"):
if file[-2:] == "gb":
record = SeqIO.read(file, "genbank")
record.id = record.name
record.name = ""
clean_seq = ""
for i in record.seq:
if i.upper() not in ["A","T","C","G","N"]:
clean_seq += "N"
print ("before" , i)
else:
clean_seq += i
a = Seq.Seq(clean_seq)
record.seq = a
for i in record.seq:
if i.upper() not in ["A","T","C","G","N"]:
print ("after" , i)
with open(file[:-3]+".fa", "wb") as fasta:
fasta.write(record.format("fasta"))
Create gff files
In [30]:
for file in listdir("./"):
if file[-2:] == "gb":
record = SeqIO.read(file, "genbank")
record.id = record.name
record.name = ""
with open(file[:-3]+".gtf", "wb") as gtf:
# Parse the features in the records
if hasattr(record, 'features'):
for feature in record.features:
if feature.type in ["CDS", "mRNA", "exon"]:
# Try several alternative key for feature name else use unknown
gene_name = "unknown"
for qualifier in ['gene', 'note', 'locus_tag', 'label']:
if qualifier in feature.qualifiers:
gene_name = feature.qualifiers[qualifier][0]
if gene_name[:3] == "CDS":
gene_name = gene_name.split("(")[1].split(")")[0]
break
# Clean names
gene_name = str(gene_name.decode('ascii', 'ignore')).replace(" ", "_").replace("/", "_")
# <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
gtf.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{} \"{}\"\n".format(
record.id,
"Bacculo_RNAseq",
"exon",
feature.location.start.position,
feature.location.end.position,
'.',
'-' if feature.strand < 0 else '+',
'.',
'gene_id',
gene_name
))
Create bed files
In [31]:
for file in listdir("./"):
if file[-2:] == "gb":
record = SeqIO.read(file, "genbank")
record.id = record.name
record.name = ""
with open(file[:-3]+".bed", "wb") as bed:
bed.write("""track name={} description="{}" visibility=2 colorByStrand="255,0,0 0,0,255"\n""".format(
record.id,
record.id))
# Parse the features in the records
if hasattr(record, 'features'):
for feature in record.features:
if feature.type in ["CDS", "mRNA", "exon"]:
# Try several alternative key for feature name else use unknown
gene_name = ""
for qualifier in ['gene', 'note', 'locus_tag', 'label']:
if qualifier in feature.qualifiers:
gene_name = feature.qualifiers[qualifier][0]
if gene_name[:3] == "CDS":
gene_name = gene_name.split("(")[1].split(")")[0]
break
if gene_name:
# Clean names
gene_name = str(gene_name.decode('ascii', 'ignore')).replace(" ", "_").replace("/", "_")
bed.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
record.id,
feature.location.start.position,
feature.location.end.position,
gene_name,
1000,
'-' if feature.strand < 0 else '+'))