In [2]:
import gffutils
import gzip
from Bio import Alphabet, Seq, SeqIO

Retrieving data


In [ ]:
!rm -rf gambiae.gff.gz ag.db 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz


--2015-06-26 14:46:59--  ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz
           => ‘gambiae.fa.gz’
Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228
Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done.
==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806
==> PASV ... done.    ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done.
Length: 81591806 (78M) (unauthoritative)

HROMOSOMES-PEST.Aga  67%[=============>        ]  52.57M  3.08MB/s   eta 12s   

In [4]:
!rm -f ag.db

db = gffutils.create_db('gambiae.gff.gz', 'ag.db')

Getting a gene


In [5]:
gene_id = 'AGAP004707'

In [6]:
gene = db[gene_id]

In [7]:
print(gene)
print(gene.seqid, gene.strand)


2L	VectorBase	gene	2358158	2431617	.	+	.	ID=AGAP004707;biotype=protein_coding
('2L', '+')

In [8]:
recs = SeqIO.parse(gzip.open('gambiae.fa.gz'), 'fasta')
for rec in recs:
    print(rec.description)
    if rec.description.split(':')[2] == gene.seqid:
        my_seq = rec.seq
        break
print(my_seq.alphabet)


chromosome:AgamP3:2L:1:49364325:1 chromosome 2L
SingleLetterAlphabet()

In [9]:
def get_sequence(chrom_seq, CDSs, strand):
    seq = Seq.Seq('', alphabet=Alphabet.IUPAC.unambiguous_dna)
    for CDS in CDSs:
        #FRAME???
        my_cds = Seq.Seq(str(my_seq[CDS.start - 1: CDS.end]), alphabet=Alphabet.IUPAC.unambiguous_dna)
        seq += my_cds
    return seq if strand == '+' else seq.reverse_complement()

In [10]:
mRNAs = db.children(gene, featuretype='mRNA')
for mRNA in mRNAs:
    print(mRNA.id)
    if mRNA.id.endswith('RA'):
        break

CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
gene_seq = get_sequence(my_seq, CDSs, gene.strand)

print(len(gene_seq), gene_seq)
prot = gene_seq.translate()
print(len(prot), prot)


AGAP004707-RA
(6357, Seq('ATGACCGAAGACTCCGATTCGATATCTGAGGAAGAACGTAGTTTGTTCCGTCCT...TGA', IUPACUnambiguousDNA()))
(2119, Seq('MTEDSDSISEEERSLFRPFTRESLQAIEARIADEEAKQRELERKRAEGEDEDEG...DV*', HasStopCodon(IUPACProtein(), '*')))

Reverse strand


In [11]:
reverse_transcript_id = 'AGAP004708-RA'

In [12]:
reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start')
reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')

print(len(reverse_seq), reverse_seq)
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)


(1992, Seq('ATGGCTGACTTCGATAGTGCCACTAAATGTATCAGAAACATTGAAAAAGAAATT...TAA', IUPACUnambiguousDNA()))
(664, Seq('MADFDSATKCIRNIEKEILLLQSEVLKTREGLGLEDDNVELKKLMEENTRLKHR...KI*', HasStopCodon(IUPACProtein(), '*')))

In [ ]: