Author: Darcy Jones
Contact: darcy.ab.jones@gmail.com
Date: 02/04/2015
CodingQuarry is a gene prediction program optimised for annotating fungal genomes.
We were having a problem extracting ORF sequences from the GFF output of CodingQuarry because it appears to code some exons as 'CDS' types.
This notebook converts the type
of multiple 'CDS' features with the same parent
to 'exon' and recodes the ID
appropriately.
Then coding sequences of genes for each genome is extracted and saved as fasta files as nucleotide and amino acid sequences.
This was run with Python 3.4.1 with the following packages:
Testa, A. C., Hane, J. K., Ellwood, S. R., and Oliver, R. P. 2015. CodingQuarry: highly accurate hidden Markov model gene prediction in fungal genomes using RNA-seq transcripts. BMC Genomics. 16:1–12
In [1]:
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import FeatureLocation
from Bio.SeqFeature import CompoundLocation
from BCBio import GFF
import os
In [2]:
def gff_correct(in_path, out_path):
""" Recode multiple CDS records as exons in GFF.
Keyword arguments:
in_path -- Path to input gff file (str, required)
out_path -- Path to write new GFF to (str, required)
"""
with open(in_gff, 'rU') as in_handle, \
open(out_path, 'w') as out_handle:
records = GFF.parse(in_handle)
def gff_iterator(records):
""" Using a generator to speed up the process. """
for record in records:
for feature in record.features:
sub_features = feature._sub_features
new_sub_features = list()
sub_features_cds = [f for f in feature._sub_features
if f.type == 'CDS']
new_sub_features.extend([f for f in feature._sub_features
if f.type != 'CDS'])
if len(sub_features_cds) == 1:
new_sub_features.extend(sub_features_cds)
elif len(sub_features_cds) > 1:
for sub_feature in sub_features_cds:
sub_feature.type = 'exon'
sub_feature.id = ('exon:' +
sub_feature.id.strip('CDS:'))
sub_feature.qualifiers['ID'] = sub_feature.id
del sub_feature.qualifiers['phase']
new_sub_features.append(sub_feature)
feature._sub_features = new_sub_features
yield record
output = GFF.write(gff_iterator(records), out_handle)
return output
In [3]:
def subfeatures(feature):
""" Return a location object from GFF output.
The BCBio GFF parser adds exons and CDS's to features as sub_features
which has since been depreciated in Biopython in favour of the
CompoundLocation object. This function returns a Location object that
we can use to extract sequences.
Keyword arguments:
feature -- A SeqFeature record with the _sub_features attribute.
Returns:
An ExactLocation or CompundLocation object.
"""
sub_features_cds = [f for f in feature._sub_features
if f.type == 'CDS']
sub_features_exons = [f for f in feature._sub_features
if f.type == 'exon']
strand = feature.strand
if len(sub_features_exons) > 0:
locations = [f.location for f in sub_features_exons]
locations.sort(key=lambda l: l.start)
if strand == -1:
""" When calling CompoundLocation.extract() the sequences are
extracted in the order that they are encountered. For features on
the - strand, we need to reverse this order. """
locations.reverse()
return CompoundLocation(locations)
elif len(sub_features_cds) > 0:
if len(sub_features_cds) > 1:
locations = [f.location for f in sub_features_cds]
locations.sort(key=lambda l: l.start)
if strand == -1:
locations.reverse()
locations = CompoundLocation(locations)
else:
locations = sub_features_cds[0] # One CDS returns an ExactLocation
return locations
In [4]:
def gff_to_genes(in_gff, in_genome):
""" Clean GFF files and extract sequences.
Keyword arguments:
in_gff -- Path to input GFF3 (str, required)
in_genome -- Path to fasta genome corresponding to `in_gff` (str, required)
Returns:
A tuple with 0 = A list of all extracted SeqRecords, and 1 = Extracted
SeqRecords with unexpected stop codons within the sequence.
"""
split_gff_path = os.path.splitext(in_gff)[0] # Strip the extension
genome = SeqIO.to_dict(SeqIO.parse(in_genome, format='fasta'))
with open(in_gff, 'rU') as in_handle:
genome_with_features = GFF.parse(in_handle, base_dict=genome)
sequences = list()
dubious_sequences = list()
for scaffold in genome_with_features:
for feature in scaffold.features:
subfeat = subfeatures(feature)
seq = subfeat.extract(genome[scaffold.id])
seq.id = feature.id
seq.name = feature.id
sequences.append(seq)
""" Test if there are unexpected stop codons. """
if '*' in seq.seq.translate()[:-1]:
dubious_sequences.append(seq)
return sequences, dubious_sequences
We now have the functions ready to process CodingQuarry's weird GFF's and extract protein coding sequences. Now we can loop through GFF's and their relevant genomes to perform those operations quickly.
In [5]:
cds_suffix = '.genes.fna'
orf_suffix = '.genes.faa'
dubious_suffix = '.genes.weird_cds.fna'
cleaned_suffix = '.cleaned'
in_files = [
("9/I9A.PredictedPass.gff3", "9/I9A.gapfilled.final.fa"),
("17/I17V.PredictedPass.gff3", "17/I17V.gapfilled.final.fa"),
("19/I19A.PredictedPass.gff3", "19/I19A.gapfilled.final.fa"),
("31/I31A.PredictedPass.gff3", "31/I31A.gapfilled.final.fa"),
("37/I37A.PredictedPass.gff3", "37/I37A.gapfilled.final.fa"),
("61/I61A.PredictedPass.gff3", "61/I61A.gapfilled.final.fa"),
("65/I65V.PredictedPass.gff3", "65/I65V.gapfilled.final.fa"),
("USR5/I5V.PredictedPass.gff3", "USR5/I5V.gapfilled.final.fa")
]
for in_gff, in_fasta in in_files:
""" Prepare filenames for IO. """
split_gff_path = os.path.splitext(in_gff)[0]
out_cds_file = split_gff_path + cds_suffix
out_orf_file = split_gff_path + orf_suffix
out_dubious_cds_file = split_gff_path + dubious_suffix
out_cleaned_orf_file = split_gff_path + cleaned_suffix + orf_suffix
out_cleaned_cds_file = split_gff_path + cleaned_suffix + cds_suffix
cleaned_gff_file = split_gff_path + cleaned_suffix + '.gff3'
""" Process the GFF to recode multiple CDS's as exons. """
gff_correct(in_gff, cleaned_gff_file)
""" Extract sequences from gff CDS and exon features. """
sequences, dubious_sequences = gff_to_genes(cleaned_gff_file, in_fasta)
""" Write sequences and dubious sequences to fasta files. """
with open(out_cleaned_cds_file, 'w') as handle:
SeqIO.write(sequences, handle, 'fasta')
with open(out_dubious_cds_file, 'w') as handle:
SeqIO.write(dubious_sequences, handle, 'fasta')
translated_sequences = list()
for sequence in sequences:
sequence.seq = sequence.seq.translate()
translated_sequences.append(sequence)
with open(out_cleaned_orf_file, 'w') as handle:
SeqIO.write(translated_sequences, handle, 'fasta')
The predicted genes have now been extracted and the nucleotide and amino acid sequences saved as fasta files.
Genes that the program could not extract properly or are not annotated properly (those with unexpected stop codons) are saved to another file with dubious_suffix
for later follow up.
These dubious sequences are still in the other output files.