Clean and convert gb files for RNAseq analyses


In [24]:
from Bio import SeqIO, Seq
import gzip
from os import listdir

Create fasta files


In [29]:
for file in listdir("./"):
    if file[-2:] == "gb":
        record = SeqIO.read(file, "genbank")
        record.id = record.name
        record.name = ""
        clean_seq = ""
                
        for i in record.seq:
            if i.upper() not in ["A","T","C","G","N"]:
                clean_seq += "N"
                print ("before" , i)
            else:
                clean_seq += i
                
        a = Seq.Seq(clean_seq)
        record.seq = a
        for i in record.seq:
            if i.upper() not in ["A","T","C","G","N"]:
                print ("after" , i)
                    
        with open(file[:-3]+".fa", "wb") as fasta:
            fasta.write(record.format("fasta"))


('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')
('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')
('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')
('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')
('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')
('before', 'W')
('before', 'M')
('before', 'R')
('before', 'K')
('before', 'R')

Create gff files


In [30]:
for file in listdir("./"):
    if file[-2:] == "gb":
        record = SeqIO.read(file, "genbank")
        record.id = record.name
        record.name = ""
        with open(file[:-3]+".gtf", "wb") as gtf:
       
            # Parse the features in the records
            if hasattr(record, 'features'):
                for feature in record.features:
                    if feature.type in ["CDS", "mRNA", "exon"]:

                        # Try several alternative key for feature name else use unknown
                        gene_name = "unknown"
                        for qualifier in ['gene', 'note', 'locus_tag', 'label']:
                            if qualifier in feature.qualifiers:
                                gene_name = feature.qualifiers[qualifier][0]
                                if gene_name[:3] == "CDS":
                                    gene_name = gene_name.split("(")[1].split(")")[0]
                                break
                        
                        # Clean names
                        gene_name = str(gene_name.decode('ascii', 'ignore')).replace(" ", "_").replace("/", "_")

                        # <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
                        gtf.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{} \"{}\"\n".format(
                            record.id,
                            "Bacculo_RNAseq",
                            "exon",
                            feature.location.start.position,
                            feature.location.end.position,
                            '.',
                            '-' if feature.strand < 0 else '+',
                            '.',
                            'gene_id',
                            gene_name
                            ))

Create bed files


In [31]:
for file in listdir("./"):
    if file[-2:] == "gb":
        record = SeqIO.read(file, "genbank")
        record.id = record.name
        record.name = ""
        
        with open(file[:-3]+".bed", "wb") as bed:
            bed.write("""track name={} description="{}" visibility=2 colorByStrand="255,0,0 0,0,255"\n""".format(
            record.id,
            record.id))
            
            # Parse the features in the records
            if hasattr(record, 'features'):
                for feature in record.features:
                    if feature.type in ["CDS", "mRNA", "exon"]:

                        # Try several alternative key for feature name else use unknown
                        gene_name = ""
                        for qualifier in ['gene', 'note', 'locus_tag', 'label']:
                            if qualifier in feature.qualifiers:
                                gene_name = feature.qualifiers[qualifier][0]
                                if gene_name[:3] == "CDS":
                                    gene_name = gene_name.split("(")[1].split(")")[0]
                                break
                        
                        if gene_name:
                            
                            # Clean names
                            gene_name = str(gene_name.decode('ascii', 'ignore')).replace(" ", "_").replace("/", "_")
                            
                            bed.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                                record.id,
                                feature.location.start.position,
                                feature.location.end.position,
                                gene_name,
                                1000,
                                '-' if feature.strand < 0 else '+'))