In [1]:
from collections import defaultdict

import pyGeno.Genome as pgg
import pyGeno.tools.UsefulFunctions as uf
In [2]:
# Notebook parameters: the genome to open and the name of the gene to look up in it.
genome_name, gene_name = 'GRCh38.98', 'POMP'
In [3]:
%%time
ref = pgg.Genome(name=genome_name)
gene = ref.get(pgg.Gene, name=gene_name, gen=False)[0] # gen=False returns list, not generator
In [4]:
# Report the strand recorded for the gene.
strand = gene.strand
print('Strand:', strand)
In [5]:
%%time
for transcript in gene.get(pgg.Transcript):
print(transcript.id)
for exon in transcript.get(pgg.Exon, gen=True):
print(" >", exon.id)
In [6]:
# Group the gene's exons by whether they carry coding sequence:
#   exon_dict['CDS'][exon id]    -> list of transcript ids, for exons where hasCDS() is true
#   exon_dict['NotCDS'][exon id] -> list of transcript ids, for all other exons
exon_dict = {'CDS': defaultdict(list), 'NotCDS': defaultdict(list)}
for exon in gene.get(pgg.Exon, gen=True):
    category = 'CDS' if exon.hasCDS() else 'NotCDS'
    exon_dict[category][exon.id].append(exon.transcript.id)
# Last expression of the cell: display the resulting mapping.
exon_dict
Out[6]:
In [7]:
# choose a coding exon: take the first exon id recorded under 'CDS'
coding_exon_ids = list(exon_dict['CDS'])
if not coding_exon_ids:
    # Fail with a readable message instead of a bare "list index out of range".
    raise IndexError('no exon with a CDS was recorded for this gene; nothing to choose from')
exon_id = coding_exon_ids[0]
# Fetch the exon object for that id from the gene.
exon = gene.get(pgg.Exon, id=exon_id, gen=False)[0]
In [8]:
# Print the exon's three regions in order (UTR5, CDS, UTR3), a blank line,
# then the exon's full sequence.
for label, attr in (('UTR5:', 'UTR5'), ('CDS:', 'CDS'), ('UTR3:', 'UTR3')):
    print(label, getattr(exon, attr))
print()
print('sequence:', exon.sequence)

# Sanity check: the three regions, concatenated in order, must rebuild the full sequence.
reassembled = ''.join(exon.UTR5 + exon.CDS + exon.UTR3)
assert exon.sequence == reassembled
In [9]:
# Run pyGeno's six-frame translation helper on the exon's CDS and display the result.
six_frame_translations = uf.translateDNA_6Frames(exon.CDS)
six_frame_translations
Out[9]:
In [10]:
# Display the sequence of the protein attached to this exon's transcript.
protein = exon.transcript.protein
protein.sequence
Out[10]: