In [78]:
import os.path as op
import gffutils
# http://bacteria.ensembl.org/info/website/ftp/index.html
# ftp://ftp.ensemblgenomes.org/pub/release-32/bacteria//gtf/bacteria_0_collection/mycobacterium_tuberculosis_h37rv/
fn = op.expanduser('~/Dropbox/data/ensembl/Mycobacterium_tuberculosis_h37rv.ASM19595v2.32.gtf.gz')
db_name = op.expanduser('~/Dropbox/data/ensembl/Mycobacterium_tuberculosis_h37rv.ASM19595v2.32.db')
In [79]:
#db = gffutils.create_db(fn, dbfn=db_name, force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
In [80]:
db = gffutils.FeatureDB(db_name, keep_order=True)
In [81]:
def extract_annotations(db):
'''
Extract gene annotations from the gff db
Paramters
---------
db: gffutils.FeatureDB
The database containing annotations database
Returns:
--------
gene_annotations: [{'gene_name': ...}]
Annotations containing the gene starts, ends and exons
'''
annotations = []
for gene in db.features_of_type('gene', order_by='start'):
#print("gene:", gene)
transcripts = db.children(gene, featuretype='transcript')
transcript_objs = []
gene_type = ''
try:
gene_type = gene['gene_biotype'][0]
except Exception as ex:
pass
for transcript in transcripts:
#print("children:", list(db.children(transcript)))
transcript_obj = None
try:
start_codon = next(db.children(transcript, featuretype='start_codon'))
end_codon = next(db.children(transcript, featuretype='stop_codon'))
transcript_obj = {
'start_codon': { 'start': start_codon.start,
'end': start_codon.end,
'chrom': start_codon.chrom,
},
'end_codon': { 'start': end_codon.start,
'end': end_codon.end,
'chrom': end_codon.chrom,
}
}
except Exception as ex:
pass
if transcript_obj is not None:
exons = db.children(transcript, featuretype='exon')
exon_objs = []
for exon in exons:
exon_objs += [{
'start': exon.start,
'end': exon.end,
'chrom': exon.chrom,
}]
transcript_obj['exons'] = exon_objs
cdss = db.children(transcript, featuretype='CDS')
cds_objs = []
for cds in cdss:
cds_objs += [{
'start': cds.start,
'end': cds.end,
'chrom': cds.chrom,
}]
transcript_obj['CDSs'] = cds_objs
transcript_objs += [transcript_obj]
gene_name = ''
try:
gene_name = gene['gene_name'][0]
except Exception as ex:
pass
try:
gene_id = gene['gene_id'][0]
except Exception as ex:
pass
annotations += [{
'start': gene.start,
'end': gene.end,
'chrom': gene.chrom,
'gene_name': gene_name,
'gene_type': gene_type,
'ensembl_id': gene_id,
'transcripts': transcript_objs,
}]
return annotations
annotations_h37rv = extract_annotations(db)
In [82]:
for a in annotations_h37rv[:10]:
print(a['ensembl_id'], a['gene_name'], a['gene_type'], a['start'])
In [75]:
%%time
db_name = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.db')
db = gffutils.FeatureDB(db_name, keep_order=True)
annotations_mm10 = extract_annotations(db)
for a in annotations_mm10[:10]:
print(a['ensembl_id'], a['gene_name'], a['gene_type'], a['start'])
In [76]:
annotations_mm10 = annotations
In [ ]: