In [1]:
from collections import defaultdict, OrderedDict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses

logging.basicConfig(level=logging.INFO)

In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/fasta/Gallus_gallus.Gallus_gallus-5.0.91.dna.toplevel.sizes'

In [3]:
db = gffutils.create_db(gtf, dbfn=gtf_db, disable_infer_genes=True, disable_infer_transcripts=True, merge_strategy='merge', force=True)
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)

In [5]:
for x in db.featuretypes():
    print(x)


CDS
exon
five_prime_utr
gene
start_codon
stop_codon
three_prime_utr
transcript

In [7]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    if len(cds)==0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    first_cds = cds[0]
    last_cds = cds[-1]
    for utr in utrs:
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if strand == '+':
            if utr.stop < first_cds.start:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            else:
                raise RuntimeError('Error with cds')
        elif strand == '-':
            if utr.stop < first_cds.start:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)                
            else:
                raise RuntimeError('Error with cds')    
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create bed from list of regions
    bedtype: 0 or 1
        0-Based or 1-based coordinate of the BED
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                             start,
                                             region.stop,
                                             re.sub('\.\d+', '', region.attributes['gene_id'][0]),
                                             '.',
                                             region.strand)
    # Remove duplicates
    bedstr = '\n'.join(list(OrderedDict.fromkeys(bedstr.split('\n'))))
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged

In [8]:
utr5_bed = ''
utr3_bed = ''



gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))


Out[8]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.cds.bed)>

In [9]:
utr5_region_bed = ''
utr3_region_bed = ''


for gene_id in get_gene_list(gene_dict):
    utr5_regions = []
    utr3_regions = []
    for utr5_region in db.children(gene_id, featuretype='five_prime_utr'):
        utr5_regions.append(utr5_region)
    for utr3_region in db.children(gene_id, featuretype='three_prime_utr'):
        utr3_regions.append(utr3_region)
    merged_utr5_regions = merge_regions(db, utr5_regions)
    renamed_utr5_regions = rename_regions(merged_utr5_regions, gene_id)
    merged_utr3_regions = merge_regions(db, utr3_regions)
    renamed_utr3_regions = rename_regions(merged_utr3_regions, gene_id)
    
    utr5_region_bed += create_bed(renamed_utr5_regions)    
    utr3_region_bed += create_bed(renamed_utr3_regions)


    
utr5_region_bedtool = pybedtools.BedTool(utr5_region_bed, from_string=True)
utr3_region_bedtool = pybedtools.BedTool(utr3_region_bed, from_string=True)

In [10]:
utr5_region_bedtool.remove_invalid().sort().saveas('{}.utr5_region.bed'.format(prefix))
utr3_region_bedtool.remove_invalid().sort().saveas('{}.utr3_region.bed'.format(prefix))


Out[10]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr3_region.bed)>

In [11]:
'{}.utr5_region.bed'.format(prefix)


Out[11]:
'/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr5_region.bed'

In [12]:
!bedSort /home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr3_region.bed /home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr3_region.bed

In [13]:
!bedSort /home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr5_region.bed /home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.utr5_region.bed

In [14]:
tRNA_sites = []
tRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'tRNA' in transcript.attributes['gene_biotype'] or 'Mt_tRNA' in transcript.attributes['transcript_biotype']:
            tRNA_sites.append(transcript)
    #merged_tRNA_sites = merge_regions_nostrand(db, tRNA_sites)
    #renamed_tRNA_sites = rename_regions(merged_tRNA_sites, gene_id)
    tRNA_bed += create_bed(tRNA_sites)

tRNA_bed = '\n'.join(list(OrderedDict.fromkeys(tRNA_bed.split('\n'))))
tRNA_sites_bedtool = pybedtools.BedTool(tRNA_bed, from_string=True)
tRNA_sites_bedtool.remove_invalid().sort().saveas('{}.tRNA_sites.bed'.format(prefix))


Out[14]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.tRNA_sites.bed)>

In [15]:
tRNA_sites_bedtool.to_dataframe()


Out[15]:
chrom start end name score strand
0 MT 5101 5172 ENSGALG00000029193 . -
1 MT 9015 9083 ENSGALG00000038283 . +
2 MT 8260 8329 ENSGALG00000031197 . +
3 MT 6433 6506 ENSGALG00000033139 . -
4 MT 12863 12932 ENSGALG00000042903 . +
5 MT 12999 13070 ENSGALG00000036970 . +
6 MT 12933 12998 ENSGALG00000034022 . +
7 MT 6507 6573 ENSGALG00000034813 . -
8 MT 6361 6430 ENSGALG00000035392 . -
9 MT 16707 16775 ENSGALG00000039249 . -
10 MT 3966 4040 ENSGALG00000040296 . +
11 MT 8123 8258 ENSGALG00000033462 . -
12 MT 5024 5096 ENSGALG00000035975 . +
13 MT 5171 5240 ENSGALG00000038760 . +
14 MT 2272 2345 ENSGALG00000032059 . +
15 MT 6279 6355 ENSGALG00000035685 . +
16 MT 10707 10775 ENSGALG00000038243 . +
17 MT 1227 1296 ENSGALG00000041922 . +
18 MT 11127 11195 ENSGALG00000037369 . +
19 MT 16038 16107 ENSGALG00000032370 . +
20 MT 6572 6643 ENSGALG00000037641 . -
21 MT 16107 16177 ENSGALG00000042677 . -

In [16]:
rRNA_sites = []
rRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'rRNA' in transcript.attributes['gene_biotype']:
            rRNA_sites.append(transcript)
    #renamed_rRNA_sites = rename_regions(rRNA_sites, gene_id)
    rRNA_bed += create_bed(rRNA_sites)
rRNA_bed = '\n'.join(list(OrderedDict.fromkeys(rRNA_bed.split('\n'))))
rRNA_sites_bedtool = pybedtools.BedTool(rRNA_bed, from_string=True)
rRNA_sites_bedtool.remove_invalid().sort().saveas('{}.rRNA_sites.bed'.format(prefix))


Out[16]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.rRNA_sites.bed)>

In [17]:
rRNA_sites_bedtool.to_dataframe()


Out[17]:
chrom start end name score strand
0 AADN04016707.1 564 682 ENSGALG00000031236 . -
1 AADN04003868.1 7975 8128 ENSGALG00000032920 . +
2 AADN04001404.1 14906 15029 ENSGALG00000039272 . +
3 2 28184027 28184138 ENSGALG00000027900 . -
4 AADN04001404.1 2298 2417 ENSGALG00000036591 . +
5 AADN04007760.1 9983 10135 ENSGALG00000037964 . -
6 AADN04017093.1 2714 2866 ENSGALG00000037963 . +
7 AADN04003312.1 248 367 ENSGALG00000032451 . +
8 AADN04001404.1 25686 25804 ENSGALG00000039768 . +
9 AADN04003312.1 12866 12980 ENSGALG00000029463 . +
10 AADN04002199.1 11011 11130 ENSGALG00000035984 . +
11 AADN04003863.1 8576 8729 ENSGALG00000033913 . +
12 AADN04006969.1 8399 8552 ENSGALG00000036940 . -
13 AADN04004372.1 9347 9499 ENSGALG00000029838 . +
14 1 166149900 166150027 ENSGALG00000017789 . -
15 AADN04004449.1 5267 5386 ENSGALG00000030448 . +
16 AADN04005700.1 3890 4045 ENSGALG00000029977 . +
17 1 45543827 45543933 ENSGALG00000028788 . -
18 AADN04002018.1 7780 7899 ENSGALG00000029786 . +
19 9 937970 938092 ENSGALG00000039601 . -
20 AADN04005104.1 12382 12535 ENSGALG00000041531 . -
21 KQ759483.1 693765 693883 ENSGALG00000038956 . -
22 AADN04005797.1 10116 10269 ENSGALG00000029549 . -
23 AADN04002199.1 19365 19484 ENSGALG00000037013 . +
24 AADN04004449.1 11791 11910 ENSGALG00000038288 . +
25 AADN04002018.1 12150 12269 ENSGALG00000038105 . +
26 AADN04000891.1 18394 18513 ENSGALG00000032076 . +
27 AADN04002199.1 163 282 ENSGALG00000037931 . +
28 AADN04003737.1 9352 9505 ENSGALG00000041381 . -
29 AADN04003187.1 10189 10342 ENSGALG00000040262 . -
... ... ... ... ... ... ...
173 AADN04002587.1 11093 11230 ENSGALG00000039667 . +
174 AADN04001089.1 1384 1536 ENSGALG00000029561 . +
175 9 935922 936041 ENSGALG00000037069 . -
176 8 19457840 19457958 ENSGALG00000027857 . -
177 AADN04001404.1 4389 4508 ENSGALG00000043549 . +
178 AADN04001305.1 3010 3162 ENSGALG00000043542 . -
179 AADN04006451.1 7165 7318 ENSGALG00000029496 . +
180 AADN04002018.1 9965 10084 ENSGALG00000031750 . +
181 AADN04005188.1 2081 2199 ENSGALG00000038067 . +
182 AADN04006374.1 11848 11982 ENSGALG00000037550 . -
183 2 135524230 135524346 ENSGALG00000025601 . +
184 AADN04011704.1 3305 3425 ENSGALG00000041250 . +
185 9 944328 944447 ENSGALG00000029609 . -
186 AADN04000891.1 20578 20697 ENSGALG00000036818 . +
187 AADN04003312.1 14520 14639 ENSGALG00000032018 . +
188 AADN04009122.1 8449 8601 ENSGALG00000031378 . -
189 AADN04004293.1 6937 7089 ENSGALG00000042701 . +
190 AADN04011704.1 7432 7552 ENSGALG00000029869 . +
191 AADN04002836.1 1555 1708 ENSGALG00000040165 . +
192 AADN04000891.1 22744 22863 ENSGALG00000029288 . +
193 AADN04005188.1 12747 12866 ENSGALG00000041326 . +
194 AADN04004449.1 9613 9732 ENSGALG00000043663 . +
195 AADN04009588.1 5404 5522 ENSGALG00000043431 . +
196 AADN04004449.1 1369 1488 ENSGALG00000040257 . +
197 AADN04003913.1 12476 12629 ENSGALG00000038196 . -
198 AADN04001404.1 23483 23602 ENSGALG00000031897 . +
199 AADN04003312.1 8596 8715 ENSGALG00000030604 . +
200 AADN04003312.1 2313 2366 ENSGALG00000031407 . +
201 AADN04011388.1 5207 5360 ENSGALG00000035877 . -
202 AADN04007387.1 7607 7756 ENSGALG00000036379 . +

203 rows × 6 columns


In [18]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)


    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))


Out[18]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.stop_codon.bed)>

In [19]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
    tss_sites = []
    polyA_sites = []
    for transcript in db.children(gene_id, featuretype='transcript'):
        start_t = copy.deepcopy(transcript)
        stop_t = copy.deepcopy(transcript)
        
        start_t.stop = start_t.start + 1
        
        stop_t.start = stop_t.stop
        
        if transcript.strand == '-':
            start_t, stop_t = stop_t, start_t
        polyA_sites.append(start_t)
        tss_sites.append(stop_t)
    merged_polyA_sites = merge_regions(db, polyA_sites)
    renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)    
    merged_tss_sites = merge_regions(db, tss_sites)
    renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
    polyA_sites_bed += create_bed(renamed_polyA_sites)    
    tss_sites_bed += create_bed(renamed_tss_sites)

polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))


Out[19]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/annotation/Gallus_gallus.Gallus_gallus-5.0.91.gffutils.tss_sites.bed)>

In [20]:
tss = tsses(db, as_bed6=True)#, merge_overlapping=True)
tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
promoter = tss.slop(l=1000, r=1000, s=True, g=chrsizes)
promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))



ValueErrorTraceback (most recent call last)
<ipython-input-20-c8e3b035a63e> in <module>()
      1 tss = tsses(db, as_bed6=True)#, merge_overlapping=True)
      2 tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
----> 3 promoter = tss.slop(l=1000, r=1000, s=True, g=chrsizes)
      4 promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/bedtool.pyc in decorated(self, *args, **kwargs)
    804             # this calls the actual method in the first place; *result* is
    805             # whatever you get back
--> 806             result = method(self, *args, **kwargs)
    807 
    808             # add appropriate tags

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/bedtool.pyc in wrapped(self, *args, **kwargs)
    286                             check_for_genome = True
    287             if check_for_genome:
--> 288                 kwargs = self.check_genome(**kwargs)
    289 
    290             # For sequence methods, we may need to make a tempfile that will

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/bedtool.pyc in check_genome(self, **kwargs)
   1448 
   1449         if not os.path.exists(kwargs['g']):
-> 1450             raise ValueError('Genome file "%s" does not exist' % (kwargs['g']))
   1451 
   1452         return kwargs

ValueError: Genome file "/home/cmb-panasas2/skchoudh/genomes/gallus_gallus_5.0/fasta/Gallus_gallus.Gallus_gallus-5.0.91.dna.toplevel.sizes" does not exist

In [ ]:
for l in [1000, 2000, 3000, 4000, 5000]:
    promoter = tss.slop(l=l, r=l, s=True, g=chrsizes)
    promoter.remove_invalid().sort().saveas('{}.promoter.{}.bed'.format(prefix, l))

In [ ]: