In [1]:
import pyGeno.Genome as pgg
import pyGeno.tools.UsefulFunctions as uf

Initialize genome and select gene


In [2]:
genome_name = 'GRCh38.98'
gene_name = 'POMP'

In [3]:
%%time
ref = pgg.Genome(name=genome_name)
gene = ref.get(pgg.Gene, name=gene_name, gen=False)[0]  # gen=False returns list, not generator


CPU times: user 1.91 ms, sys: 0 ns, total: 1.91 ms
Wall time: 15.8 ms

In [4]:
print('Strand:', gene.strand)


Strand: +

From transcripts to exons


In [5]:
%%time
for transcript in gene.get(pgg.Transcript):
    print(transcript.id)
    for exon in transcript.get(pgg.Exon, gen=True):
        print("    >", exon.id)


ENST00000460403
    > ENSE00003538396
    > ENSE00001927741
    > ENSE00003641576
    > ENSE00003658469
    > ENSE00003634423
    > ENSE00003676638
    > ENSE00003621088
ENST00000380842
    > ENSE00003667813
    > ENSE00003577317
    > ENSE00003620057
    > ENSE00003521643
    > ENSE00003676638
    > ENSE00003849438
CPU times: user 13.5 ms, sys: 1.94 ms, total: 15.4 ms
Wall time: 48.1 ms

From exons to transcripts


In [6]:
from collections import defaultdict
exon_dict = {'CDS': defaultdict(list), 'NotCDS': defaultdict(list)}
for exon in gene.get(pgg.Exon, gen=True):
    exon_dict['CDS' if exon.hasCDS() else 'NotCDS'][exon.id].append(exon.transcript.id)
exon_dict


Out[6]:
{'CDS': defaultdict(list,
             {'ENSE00003634423': ['ENST00000460403'],
              'ENSE00003676638': ['ENST00000460403', 'ENST00000380842'],
              'ENSE00003621088': ['ENST00000460403'],
              'ENSE00003667813': ['ENST00000380842'],
              'ENSE00003577317': ['ENST00000380842'],
              'ENSE00003620057': ['ENST00000380842'],
              'ENSE00003521643': ['ENST00000380842'],
              'ENSE00003849438': ['ENST00000380842']}),
 'NotCDS': defaultdict(list,
             {'ENSE00003538396': ['ENST00000460403'],
              'ENSE00001927741': ['ENST00000460403'],
              'ENSE00003641576': ['ENST00000460403'],
              'ENSE00003658469': ['ENST00000460403']})}

Choose a coding exon


In [7]:
# choose a coding exon
exon_id = list(exon_dict['CDS'].keys())[0]
exon = gene.get(pgg.Exon, id=exon_id, gen=False)[0]

Peep into the object structure


In [8]:
print('UTR5:', exon.UTR5)
print('CDS:', exon.CDS) 
print('UTR3:', exon.UTR3)
print()
print('sequence:', exon.sequence)

assert exon.sequence == ''.join(exon.UTR5 + exon.CDS + exon.UTR3)


UTR5: TTCCAGCTCAACCAAGATAAA
CDS: ATGAATTTTTCCACACTGAGAAACATTCAGGGTCTATTTGCTCCGCTAAAATTACAGATGGAATTCAAGGCAGTGCAGCAG
UTR3: 

sequence: TTCCAGCTCAACCAAGATAAAATGAATTTTTCCACACTGAGAAACATTCAGGGTCTATTTGCTCCGCTAAAATTACAGATGGAATTCAAGGCAGTGCAGCAG

Translate in 6 frames


In [9]:
uf.translateDNA_6Frames(exon.CDS)


Out[9]:
('MNFSTLRNIQGLFAPLKLQMEFKAVQQ',
 '*IFPH*ETFRVYLLR*NYRWNSRQCS',
 'EFFHTEKHSGSICSAKITDGIQGSAA',
 'LLHCLEFHL*F*RSK*TLNVSQCGKIH',
 'CCTALNSICNFSGANRP*MFLSVEKF',
 'AALP*IPSVILAEQIDPECFSVWKNS')

Easily get the protein sequence corresponding to the transcript containing that exon


In [10]:
exon.transcript.protein.sequence


Out[10]:
'MNFSTLRNIQGLFAPLKLQMEFKAVQQVQRLPFLSSSNLSLDVLRGNDETIGFEDILNDPSQSEVMGEPHLMVEYKLGLL'