In [78]:
import os.path as op
import gffutils

# http://bacteria.ensembl.org/info/website/ftp/index.html
# ftp://ftp.ensemblgenomes.org/pub/release-32/bacteria//gtf/bacteria_0_collection/mycobacterium_tuberculosis_h37rv/

fn = op.expanduser('~/Dropbox/data/ensembl/Mycobacterium_tuberculosis_h37rv.ASM19595v2.32.gtf.gz')
db_name = op.expanduser('~/Dropbox/data/ensembl/Mycobacterium_tuberculosis_h37rv.ASM19595v2.32.db')

In [79]:
#db = gffutils.create_db(fn, dbfn=db_name, force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)

In [80]:
db = gffutils.FeatureDB(db_name, keep_order=True)

In [81]:
def extract_annotations(db):
    '''
    Extract gene annotations from the gff db
    
    Paramters
    ---------
    db: gffutils.FeatureDB
        The database containing annotations database
    
    Returns:
    --------
    gene_annotations: [{'gene_name': ...}]
        Annotations containing the gene starts, ends and exons
    '''
    annotations = []

    for gene in db.features_of_type('gene', order_by='start'):
        #print("gene:", gene)
        transcripts = db.children(gene, featuretype='transcript')
        transcript_objs = []

        gene_type = ''

        try:
            gene_type = gene['gene_biotype'][0]
        except Exception as ex:
            pass

        for transcript in transcripts:
            #print("children:", list(db.children(transcript)))

            transcript_obj = None
            
            try:
                start_codon = next(db.children(transcript, featuretype='start_codon'))
                end_codon = next(db.children(transcript, featuretype='stop_codon'))

                transcript_obj = {
                    'start_codon': { 'start': start_codon.start, 
                                    'end': start_codon.end,
                                   'chrom': start_codon.chrom,
                                   },
                    'end_codon': { 'start': end_codon.start, 
                                  'end': end_codon.end,
                                 'chrom': end_codon.chrom,
                                 }
                    }
            except Exception as ex:
                pass

            
            if transcript_obj is not None:
                exons = db.children(transcript, featuretype='exon')
                exon_objs = []
                for exon in exons:
                    exon_objs += [{
                        'start': exon.start,
                        'end': exon.end,
                        'chrom': exon.chrom,
                    }]
                transcript_obj['exons'] = exon_objs

                cdss = db.children(transcript, featuretype='CDS')
                cds_objs = []
                for cds in cdss:
                    cds_objs += [{
                        'start': cds.start,
                        'end': cds.end,
                        'chrom': cds.chrom,
                    }]
                transcript_obj['CDSs'] = cds_objs

                transcript_objs += [transcript_obj]

        gene_name = ''

        try:
            gene_name = gene['gene_name'][0]
        except Exception as ex:
            pass

        try:
            gene_id = gene['gene_id'][0]
        except Exception as ex:
            pass

        annotations += [{
            'start': gene.start,
            'end': gene.end,
            'chrom': gene.chrom,
            'gene_name': gene_name,
            'gene_type': gene_type,
            'ensembl_id': gene_id,
            'transcripts': transcript_objs,  
        }]

    return annotations

annotations_h37rv = extract_annotations(db)

In [82]:
for a in annotations_h37rv[:10]:
    print(a['ensembl_id'], a['gene_name'], a['gene_type'], a['start'])


Rv0001 dnaA protein_coding 1
Rv0002 dnaN protein_coding 2052
Rv0003 recF protein_coding 3280
Rv0004  protein_coding 4434
Rv0005 gyrB protein_coding 5240
Rv0006 gyrA protein_coding 7302
Rv0007  protein_coding 9914
EBG00000313329 ileT tRNA 10887
EBG00000313365 alaT tRNA 11112
Rv0008c  protein_coding 11874

In [75]:
%%time

db_name = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.db')
db = gffutils.FeatureDB(db_name, keep_order=True)
annotations_mm10 = extract_annotations(db)

for a in annotations_mm10[:10]:
    print(a['ensembl_id'], a['gene_name'], a['gene_type'], a['start'])


ENSMUSG00000064336 mt-Tf Mt_tRNA 1
ENSMUSG00000095742 CAAA01147332.1 protein_coding 66
ENSMUSG00000064337 mt-Rnr1 Mt_rRNA 70
ENSMUSG00000094121 Ccl21b protein_coding 394
ENSMUSG00000064338 mt-Tv Mt_tRNA 1025
ENSMUSG00000064339 mt-Rnr2 Mt_rRNA 1094
ENSMUSG00000093828 Ccl27a protein_coding 2012
ENSMUSG00000064340 mt-Tl1 Mt_tRNA 2676
ENSMUSG00000064341 mt-Nd1 protein_coding 2751
ENSMUSG00000096730 Vmn2r122 protein_coding 3536
CPU times: user 4min 2s, sys: 41.5 s, total: 4min 44s
Wall time: 6min 27s

In [76]:
annotations_mm10 = annotations

In [ ]: