In [1]:
from collections import defaultdict, OrderedDict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses

logging.basicConfig(level=logging.INFO)

In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.sizes'

In [3]:
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
db = gffutils.create_db(gtf, dbfn=gtf_db, disable_infer_genes=True, disable_infer_transcripts=True, merge_strategy='merge', force=True)
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)

In [5]:
for x in db.featuretypes():
    print(x)


CDS
Selenocysteine
exon
five_prime_utr
gene
start_codon
three_prime_utr
transcript

In [6]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))


    
def create_bed(regions, bedtype='0'):
    '''Create bed from list of regions
    bedtype: 0 or 1
        0-Based or 1-based coordinate of the BED
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                             start,
                                             region.stop,
                                             re.sub('\.\d+', '', region.attributes['gene_id'][0]),
                                             '.',
                                             region.strand)
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged

In [7]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))


Out[7]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils.cds.bed)>

In [8]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)

    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))


Out[8]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils.stop_codon.bed)>

In [9]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
    tss_sites = []
    polyA_sites = []
    for transcript in db.children(gene_id, featuretype='transcript'):
        start_t = copy.deepcopy(transcript)
        stop_t = copy.deepcopy(transcript)
        
        start_t.stop = start_t.start + 1
        
        stop_t.start = stop_t.stop
        
        if transcript.strand == '-':
            start_t, stop_t = stop_t, start_t
        polyA_sites.append(start_t)
        tss_sites.append(stop_t)
    merged_polyA_sites = merge_regions(db, polyA_sites)
    renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)    
    merged_tss_sites = merge_regions(db, tss_sites)
    renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
    polyA_sites_bed += create_bed(renamed_polyA_sites)    
    tss_sites_bed += create_bed(renamed_tss_sites)

polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))


Out[9]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils.tss_sites.bed)>

In [10]:
tRNA_sites = []
tRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'tRNA' in transcript.attributes['gene_biotype'] or 'Mt_tRNA' in transcript.attributes['transcript_biotype']:
            tRNA_sites.append(transcript)
    #merged_tRNA_sites = merge_regions_nostrand(db, tRNA_sites)
    #renamed_tRNA_sites = rename_regions(merged_tRNA_sites, gene_id)
    tRNA_bed += create_bed(tRNA_sites)

tRNA_bed = '\n'.join(list(OrderedDict.fromkeys(tRNA_bed.split('\n'))))

tRNA_sites_bedtool = pybedtools.BedTool(tRNA_bed, from_string=True)
tRNA_sites_bedtool.remove_invalid().sort().saveas('{}.tRNA_sites.bed'.format(prefix))


Out[10]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils.tRNA_sites.bed)>

In [11]:
tRNA_sites_bedtool.to_dataframe()


Out[11]:
chrom start end name score strand
0 2R 6164875 6164948 FBgn0011888 . -
1 2R 6162680 6162753 FBgn0011889 . -
2 2R 6166150 6166224 FBgn0011880 . +
3 2R 13430281 13430355 FBgn0011881 . -
4 2R 13430983 13431057 FBgn0011882 . +
5 2R 13431284 13431358 FBgn0011883 . +
6 2R 13431776 13431850 FBgn0011884 . +
7 2R 13432092 13432166 FBgn0011885 . +
8 2R 6165983 6166056 FBgn0011887 . +
9 3L 1379933 1380005 FBgn0052330 . +
10 3R 18628466 18628538 FBgn0051228 . -
11 3R 18108636 18108708 FBgn0051242 . +
12 3R 17646367 17646440 FBgn0051579 . +
13 2R 19414844 19414917 FBgn0050225 . +
14 X 20025098 20025170 FBgn0052826 . +
15 3L 1925640 1925712 FBgn0052312 . +
16 2R 20657890 20657962 FBgn0050155 . +
17 X 14104648 14104730 FBgn0011981 . +
18 X 14025108 14025190 FBgn0011980 . -
19 X 14117728 14117810 FBgn0011983 . -
20 X 14025565 14025647 FBgn0011982 . -
21 2L 3169581 3169663 FBgn0011985 . +
22 2L 3173002 3173084 FBgn0011984 . -
23 2R 11357563 11357650 FBgn0011987 . -
24 3L 5342083 5342165 FBgn0011986 . +
25 3L 13211550 13211622 FBgn0052128 . +
26 3L 13211213 13211285 FBgn0052129 . +
27 3L 13212364 13212436 FBgn0052126 . +
28 3L 13214108 13214233 FBgn0052127 . +
29 2L 2010868 2010940 FBgn0051940 . -
... ... ... ... ... ... ...
282 2R 9661224 9661296 FBgn0011921 . +
283 3L 14674343 14674415 FBgn0011928 . +
284 2R 19414597 19414679 FBgn0028981 . +
285 3R 6822962 6823035 FBgn0011904 . -
286 3L 16349916 16349989 FBgn0052357 . +
287 2R 13430694 13430815 FBgn0011909 . -
288 2L 8290726 8290799 FBgn0011902 . +
289 3R 12131355 12131427 FBgn0011905 . +
290 3R 17631041 17631114 FBgn0051569 . -
291 3R 17645306 17645379 FBgn0051568 . -
292 2R 14984326 14984408 FBgn0050241 . -
293 2R 16220051 16220123 FBgn0050240 . +
294 3R 18108992 18109064 FBgn0051518 . +
295 mitochondrion_genome 3011 3077 FBgn0013699 . +
296 mitochondrion_genome 12669 12734 FBgn0013698 . -
297 mitochondrion_genome 3767 3838 FBgn0013697 . +
298 mitochondrion_genome 0 65 FBgn0013696 . +
299 mitochondrion_genome 8140 8206 FBgn0013695 . -
300 mitochondrion_genome 5542 5607 FBgn0013694 . +
301 mitochondrion_genome 6343 6408 FBgn0013693 . -
302 mitochondrion_genome 6258 6325 FBgn0013692 . +
303 mitochondrion_genome 3839 3906 FBgn0013691 . +
304 mitochondrion_genome 1321 1383 FBgn0013690 . -
305 2L 8481776 8481848 FBgn0051896 . +
306 2L 8570887 8570960 FBgn0051895 . +
307 2L 8683450 8683522 FBgn0051890 . +
308 2R 21161062 21161134 FBgn0050206 . -
309 2R 19728187 19728259 FBgn0050454 . -
310 3R 14044524 14044606 FBgn0011971 . -
311 X 14103362 14103444 FBgn0011977 . -

312 rows × 6 columns


In [12]:
rRNA_sites = []
rRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'rRNA' in transcript.attributes['gene_biotype']:
            rRNA_sites.append(transcript)
    #renamed_rRNA_sites = rename_regions(rRNA_sites, gene_id)
    rRNA_bed += create_bed(rRNA_sites)
rRNA_bed = '\n'.join(list(OrderedDict.fromkeys(rRNA_bed.split('\n'))))

rRNA_sites_bedtool = pybedtools.BedTool(rRNA_bed, from_string=True)
rRNA_sites_bedtool.remove_invalid().sort().saveas('{}.rRNA_sites.bed'.format(prefix))


Out[12]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gffutils.rRNA_sites.bed)>

In [13]:
rRNA_sites_bedtool. to_dataframe()


Out[13]:
chrom start end name score strand
0 2R 19731042 19731177 FBgn0053448 . -
1 2R 19730669 19730804 FBgn0053449 . -
2 2R 19731767 19731902 FBgn0053446 . -
3 2R 19731408 19731543 FBgn0053447 . -
4 2R 19732549 19732684 FBgn0053444 . -
5 2R 19733287 19733422 FBgn0053442 . -
6 2R 19732925 19733060 FBgn0053443 . -
7 2R 19734036 19734171 FBgn0053440 . -
8 2R 19733670 19733805 FBgn0053441 . -
9 2R 19758361 19758496 FBgn0053374 . -
10 2R 19757992 19758127 FBgn0053375 . -
11 2R 19759841 19759976 FBgn0053370 . -
12 2R 19759100 19759235 FBgn0053372 . -
13 2R 19758734 19758869 FBgn0053373 . -
14 2R 19764288 19764423 FBgn0053358 . -
15 2R 19764647 19764782 FBgn0053357 . -
16 2R 19765774 19765909 FBgn0053354 . -
17 2R 19765408 19765543 FBgn0053355 . -
18 2R 19766143 19766278 FBgn0053353 . -
19 2R 19756509 19756644 FBgn0053379 . -
20 2R 19753557 19753692 FBgn0053387 . -
21 2R 19753926 19754061 FBgn0053386 . -
22 2R 19749142 19749277 FBgn0053399 . -
23 2R 19751726 19751861 FBgn0053392 . -
24 2R 19751357 19751492 FBgn0053393 . -
25 2R 19752095 19752229 FBgn0053391 . -
26 2R 19750250 19750385 FBgn0053396 . -
27 2R 19749877 19750012 FBgn0053397 . -
28 2R 19750988 19751123 FBgn0053394 . -
29 2R 19750619 19750754 FBgn0053395 . -
... ... ... ... ... ... ...
85 2R 19744719 19744854 FBgn0053411 . -
86 2R 19745095 19745230 FBgn0053410 . -
87 X 23278006 23283731 FBgn0267516 . -
88 2R 19747676 19747811 FBgn0053403 . -
89 2R 19748773 19748908 FBgn0053400 . -
90 2R 19748404 19748539 FBgn0053401 . -
91 2R 19746200 19746335 FBgn0053407 . -
92 2R 19729561 19729696 FBgn0053452 . -
93 rDNA 58804 58834 FBgn0267500 . +
94 2R 19763912 19764047 FBgn0053359 . -
95 rDNA 46204 46327 FBgn0267512 . +
96 2R 19738451 19738586 FBgn0053428 . -
97 2R 19741405 19741540 FBgn0053420 . -
98 2R 19740663 19740798 FBgn0053422 . -
99 2R 19740304 19740439 FBgn0053423 . -
100 2R 19739935 19740070 FBgn0053424 . -
101 2R 19739193 19739328 FBgn0053426 . -
102 X 23291699 23291729 FBgn0267524 . +
103 2R 19748042 19748177 FBgn0053402 . -
104 2R 19746573 19746708 FBgn0053406 . -
105 2R 19745827 19745962 FBgn0053408 . -
106 rDNA 55103 59294 FBgn0267506 . +
107 rDNA 66806 74924 FBgn0267507 . +
108 rDNA 70954 74924 FBgn0267504 . +
109 rDNA 42622 49485 FBgn0267505 . +
110 rDNA 70388 70511 FBgn0267502 . +
111 rDNA 70539 70569 FBgn0267503 . +
112 rDNA 67667 69662 FBgn0267501 . +
113 X 23280154 23280277 FBgn0250731 . -
114 2R 19757623 19757758 FBgn0053376 . -

115 rows × 6 columns


In [ ]: