In [1]:
import os.path as op
import gffutils

fn = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.gtf.gz')
db_name = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.db')

In [2]:
op.exists(fn)


Out[2]:
True

In [3]:
#db = gffutils.create_db(fn, dbfn='test.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)

In [5]:
db = gffutils.FeatureDB(db_name, keep_order=True)

In [6]:
counter = 0

for gene in db.features_of_type('gene', order_by='start'):
    #print("gene:", gene)
    transcripts = db.children(gene, featuretype='transcript')
    if len(list(transcripts)) > 1:
        print('gene', gene)
        counter += 1
        if counter > 0:
            break


gene JH584294.1	ensembl	gene	2012	3786	.	+	.	gene_id "ENSMUSG00000093828"; gene_version "7"; gene_name "Ccl27a"; gene_source "ensembl"; gene_biotype "protein_coding";

In [7]:
gene = db['ENSMUSG00000094836']
print(gene['gene_name'], gene.chrom, gene.start, gene.end)
print(list(db.children(gene, featuretype='transcript')))


['AC123873.3'] JH584299.1 4975 7356
[<Feature transcript (JH584299.1:4975-7356[+]) at 0x1039dfe48>]

In [11]:
transcript = list(db.children(gene, featuretype='transcript'))[0]
list(db.children(transcript))


Out[11]:
[<Feature CDS (JH584299.1:4975-5243[+]) at 0x1042f4518>,
 <Feature CDS (JH584299.1:5735-6325[+]) at 0x1042f4470>,
 <Feature CDS (JH584299.1:7137-7353[+]) at 0x1042f44e0>,
 <Feature transcript (JH584299.1:4975-7356[+]) at 0x1042f4710>,
 <Feature exon (JH584299.1:4975-5243[+]) at 0x1042f48d0>,
 <Feature exon (JH584299.1:5735-6325[+]) at 0x1042f4ac8>,
 <Feature exon (JH584299.1:7137-7356[+]) at 0x1042f4cc0>,
 <Feature start_codon (JH584299.1:4975-4977[+]) at 0x1042f4ef0>,
 <Feature stop_codon (JH584299.1:7354-7356[+]) at 0x1066a6128>]

In [8]:
print(list(db.children(gene, featuretype='CDS')))


[<Feature CDS (JH584299.1:4975-5243[+]) at 0x1066ed438>, <Feature CDS (JH584299.1:5735-6325[+]) at 0x1066ed2b0>, <Feature CDS (JH584299.1:7137-7353[+]) at 0x1066ed400>]

In [58]:
gene = db['ENSMUSG00000093828']
print(gene['gene_name'], gene.chrom, gene.start, gene.end)
print(list(db.children(gene, featuretype='transcript')))


['Ccl27a'] JH584294.1 2012 3786
[<Feature transcript (JH584294.1:2012-3786[+]) at 0x10550c6a0>, <Feature transcript (JH584294.1:3094-3786[+]) at 0x108c81978>]

In [69]:
def extract_transcripts(db, gene_id):
    '''
    Get a list of all transcripts and then output their transcripts
    for this gene_id
    
    Parameters:
    -----------
    db: gffutils database
        The database containing all of the annotations for this assembly
    gene_id: string
        The ensembl ID for the gene
        
    Returns:
    --------
        transcripts: [{'exon_starts': [], 'exon_ends': []},...]
    '''
    gene = db[gene_id]
    transcripts = []
    
    for transcript in list(db.children(gene, featuretype='transcript')):
        #print('transcript:', transcript, transcript.strand)
        transcript_data = {'exon_starts': [], 'exon_ends': [], 'strand': transcript.strand}
        for exon in list(db.children(transcript, featuretype='exon')):
            transcript_data['exon_starts'] += [exon.start]
            transcript_data['exon_ends'] += [exon.end]
            
        transcripts += [transcript_data]
    return transcripts
        
with open(op.expanduser('/Users/pete/Dropbox/paper-data/Bonev-et-al-2017/GSM2533843_ES_rep1.txt'), 'r') as f:
    f.readline()
    for line in f:
        parts = line.strip().split()
        print('parts:', parts)
        
        transcripts = extract_transcripts(db, gene_id=parts[0])
        print('transcripts:', transcripts)
        break


parts: ['ENSMUSG00000000001', '42.0474674420546', 'Gnai3']
transcripts: [{'exon_starts': [108145888, 108123795, 108123542, 108118301, 108115763, 108112473, 108111935, 108109403, 108107280], 'exon_ends': [108146146, 108123837, 108123683, 108118458, 108115891, 108112602, 108112088, 108109612, 108109316], 'strand': '-'}]

In [ ]:


In [ ]: