In [1]:
!rm -rf gambiae.g?f.gz ag.db 2>/dev/null
#!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gtfgz -O gambiae.gtf.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz
In [1]:
import gffutils
import sqlite3
In [3]:
!rm -f ag.db
In [2]:
try:
db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
except sqlite3.OperationalError:
db = gffutils.FeatureDB('ag.db')
In [3]:
print(list(db.featuretypes()))
for feat_type in db.featuretypes():
print(feat_type, db.count_features_of_type(feat_type))
In [4]:
for contig in db.features_of_type('contig'):
print(contig)
In [5]:
from collections import defaultdict
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
for contig in db.features_of_type('contig'):
cnt = 0
for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):
cnt += 1
span = abs(gene.start - gene.end) # strand
if span > max_span:
max_span = span
max_span_gene = gene
my_mRNAs = list(db.children(gene, featuretype='mRNA'))
num_mRNAs[len(my_mRNAs)] += 1
if len(my_mRNAs) == 0:
exon_check = [gene]
else:
exon_check = my_mRNAs
for check in exon_check:
num_exons[len(my_exons)] += 1
if len(my_exons) > max_exons:
max_exons = len(my_exons)
max_exons_gene = gene
print('contig %s, number of genes %d' % (contig.seqid, cnt))
print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))
print('Max span: %s (%d)' % (max_span_gene.id, max_span))
print(num_mRNAs)
print(num_exons)
In [ ]: