In [1]:
!rm -rf gambiae.g?f.gz ag.db 2>/dev/null
#!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gtfgz -O gambiae.gtf.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz


--2015-02-18 14:00:45--  http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz
Resolving www.vectorbase.org (www.vectorbase.org)... 129.74.255.228
Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz [following]
--2015-02-18 14:00:45--  https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz
Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz [following]
--2015-02-18 14:00:47--  https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz
Reusing existing connection to www.vectorbase.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 2702601 (2.6M) [application/x-gzip]
Saving to: ‘gambiae.gff.gz’

100%[======================================>] 2,702,601   1.46MB/s   in 1.8s   

2015-02-18 14:00:49 (1.46 MB/s) - ‘gambiae.gff.gz’ saved [2702601/2702601]


In [1]:
import gffutils
import sqlite3

In [3]:
!rm -f ag.db

In [2]:
try:
    db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
except sqlite3.OperationalError:
    db = gffutils.FeatureDB('ag.db')

In [3]:
print(list(db.featuretypes()))
for feat_type in db.featuretypes():
    print(feat_type, db.count_features_of_type(feat_type))


['CDS', 'RNase_P_RNA', 'SRP_RNA', 'contig', 'exon', 'five_prime_UTR', 'gene', 'mRNA', 'miRNA', 'misc_RNA', 'pseudogene', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'tRNA_pseudogene', 'three_prime_UTR']
('CDS', 62408)
('RNase_P_RNA', 1)
('SRP_RNA', 3)
('contig', 8)
('exon', 66485)
('five_prime_UTR', 10520)
('gene', 13624)
('mRNA', 14697)
('miRNA', 187)
('misc_RNA', 10)
('pseudogene', 5)
('rRNA', 53)
('snRNA', 38)
('snoRNA', 12)
('tRNA', 463)
('tRNA_pseudogene', 9)
('three_prime_UTR', 7281)

In [4]:
for contig in db.features_of_type('contig'):
    print(contig)


2L	VectorBase	contig	1	49364325	.	.	.	translation_table=1;topology=linear;ID=2L;molecule_type=dsDNA;localization=chromosomal
3R	VectorBase	contig	1	53200684	.	.	.	translation_table=1;topology=linear;ID=3R;molecule_type=dsDNA;localization=chromosomal
UNKN	VectorBase	contig	1	42389979	.	.	.	translation_table=1;topology=linear;ID=UNKN;molecule_type=dsDNA;localization=chromosomal
X	VectorBase	contig	1	24393108	.	.	.	translation_table=1;topology=linear;ID=X;molecule_type=dsDNA;localization=chromosomal
Y_unplaced	VectorBase	contig	1	237045	.	.	.	translation_table=1;topology=linear;ID=Y_unplaced;molecule_type=dsDNA;localization=chromosomal
Mt	VectorBase	contig	1	15363	.	.	.	translation_table=1;topology=linear;ID=Mt;molecule_type=dsDNA;localization=chromosomal
2R	VectorBase	contig	1	61545105	.	.	.	translation_table=1;topology=linear;ID=2R;molecule_type=dsDNA;localization=chromosomal
3L	VectorBase	contig	1	41963435	.	.	.	translation_table=1;topology=linear;ID=3L;molecule_type=dsDNA;localization=chromosomal

In [5]:
from collections import defaultdict
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
for contig in db.features_of_type('contig'):
    cnt = 0
    for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):
        cnt += 1
        span = abs(gene.start - gene.end) # strand
        if span > max_span:
            max_span = span
            max_span_gene = gene
        my_mRNAs = list(db.children(gene, featuretype='mRNA'))
        num_mRNAs[len(my_mRNAs)] += 1
        if len(my_mRNAs) == 0:
            exon_check = [gene]
        else:
            exon_check = my_mRNAs
        for check in exon_check:
            num_exons[len(my_exons)] += 1
            if len(my_exons) > max_exons:
                max_exons = len(my_exons)
                max_exons_gene = gene
    print('contig %s, number of genes %d' % (contig.seqid, cnt))
print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))
print('Max span: %s (%d)' % (max_span_gene.id, max_span))
print(num_mRNAs)
print(num_exons)


contig 2L, number of genes 3105
contig 3R, number of genes 2763
contig UNKN, number of genes 567
contig X, number of genes 1097
contig Y_unplaced, number of genes 0
contig Mt, number of genes 37
contig 2R, number of genes 3834
contig 3L, number of genes 2221
Max number of exons: AGAP001660 (67)
Max span: AGAP006656 (365621)
defaultdict(<type 'int'>, {0: 781, 1: 11595, 2: 910, 3: 211, 4: 74, 5: 27, 6: 9, 7: 5, 8: 4, 9: 1, 10: 1, 11: 3, 12: 1, 13: 1, 20: 1})
defaultdict(<type 'int'>, {1: 2019, 2: 3359, 3: 2838, 4: 2091, 5: 1411, 6: 1039, 7: 718, 8: 454, 9: 419, 10: 298, 11: 202, 12: 159, 13: 106, 14: 65, 15: 65, 16: 45, 17: 53, 18: 22, 19: 28, 20: 19, 21: 9, 22: 7, 23: 6, 24: 6, 25: 9, 26: 3, 27: 2, 28: 5, 29: 4, 30: 5, 31: 5, 32: 1, 33: 1, 34: 1, 42: 1, 49: 1, 50: 1, 67: 1})

In [ ]: