In [2]:
import gffutils
import gzip
from Bio import Alphabet, Seq, SeqIO
In [ ]:
!rm -rf gambiae.gff.gz ag.db 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz
In [4]:
!rm -f ag.db
db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
In [5]:
gene_id = 'AGAP004707'
In [6]:
gene = db[gene_id]
In [7]:
print(gene)
print(gene.seqid, gene.strand)
In [8]:
recs = SeqIO.parse(gzip.open('gambiae.fa.gz'), 'fasta')
for rec in recs:
print(rec.description)
if rec.description.split(':')[2] == gene.seqid:
my_seq = rec.seq
break
print(my_seq.alphabet)
In [9]:
def get_sequence(chrom_seq, CDSs, strand):
seq = Seq.Seq('', alphabet=Alphabet.IUPAC.unambiguous_dna)
for CDS in CDSs:
#FRAME???
my_cds = Seq.Seq(str(my_seq[CDS.start - 1: CDS.end]), alphabet=Alphabet.IUPAC.unambiguous_dna)
seq += my_cds
return seq if strand == '+' else seq.reverse_complement()
In [10]:
mRNAs = db.children(gene, featuretype='mRNA')
for mRNA in mRNAs:
print(mRNA.id)
if mRNA.id.endswith('RA'):
break
CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
gene_seq = get_sequence(my_seq, CDSs, gene.strand)
print(len(gene_seq), gene_seq)
prot = gene_seq.translate()
print(len(prot), prot)
In [11]:
reverse_transcript_id = 'AGAP004708-RA'
In [12]:
reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start')
reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')
print(len(reverse_seq), reverse_seq)
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)
In [ ]: