In [1]:
import os.path as op
import gffutils
fn = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.gtf.gz')
db_name = op.expanduser('~/Dropbox/data/ensembl/Mus_musculus.GRCm38.91.db')
In [2]:
op.exists(fn)
Out[2]:
In [3]:
#db = gffutils.create_db(fn, dbfn='test.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
In [5]:
db = gffutils.FeatureDB(db_name, keep_order=True)
In [6]:
counter = 0
for gene in db.features_of_type('gene', order_by='start'):
#print("gene:", gene)
transcripts = db.children(gene, featuretype='transcript')
if len(list(transcripts)) > 1:
print('gene', gene)
counter += 1
if counter > 0:
break
In [7]:
gene = db['ENSMUSG00000094836']
print(gene['gene_name'], gene.chrom, gene.start, gene.end)
print(list(db.children(gene, featuretype='transcript')))
In [11]:
transcript = list(db.children(gene, featuretype='transcript'))[0]
list(db.children(transcript))
Out[11]:
In [8]:
print(list(db.children(gene, featuretype='CDS')))
In [58]:
gene = db['ENSMUSG00000093828']
print(gene['gene_name'], gene.chrom, gene.start, gene.end)
print(list(db.children(gene, featuretype='transcript')))
In [69]:
def extract_transcripts(db, gene_id):
'''
Get a list of all transcripts and then output their transcripts
for this gene_id
Parameters:
-----------
db: gffutils database
The database containing all of the annotations for this assembly
gene_id: string
The ensembl ID for the gene
Returns:
--------
transcripts: [{'exon_starts': [], 'exon_ends': []},...]
'''
gene = db[gene_id]
transcripts = []
for transcript in list(db.children(gene, featuretype='transcript')):
#print('transcript:', transcript, transcript.strand)
transcript_data = {'exon_starts': [], 'exon_ends': [], 'strand': transcript.strand}
for exon in list(db.children(transcript, featuretype='exon')):
transcript_data['exon_starts'] += [exon.start]
transcript_data['exon_ends'] += [exon.end]
transcripts += [transcript_data]
return transcripts
with open(op.expanduser('/Users/pete/Dropbox/paper-data/Bonev-et-al-2017/GSM2533843_ES_rep1.txt'), 'r') as f:
f.readline()
for line in f:
parts = line.strip().split()
print('parts:', parts)
transcripts = extract_transcripts(db, gene_id=parts[0])
print('transcripts:', transcripts)
break
In [ ]:
In [ ]: