In [1]:
from collections import defaultdict, OrderedDict
import warnings
import gffutils
import pybedtools
import pandas as pd
import copy
import os
import re
from gffutils.pybedtools_integration import tsses
from copy import deepcopy
from collections import OrderedDict, Callable
import errno

def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

class DefaultOrderedDict(OrderedDict):
    # Source: http://stackoverflow.com/a/6190500/562769
    def __init__(self, default_factory=None, *a, **kw):
        if (default_factory is not None and
           not isinstance(default_factory, Callable)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        return type(self), args, None, None, self.items()

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        return type(self)(self.default_factory,
                          copy.deepcopy(self.items()))

    def __repr__(self):
        return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,
                                               OrderedDict.__repr__(self))

In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/annotation/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.38.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/annotation/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.38.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/MG1655/ASM584v2.38/'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.sizes'
mkdir_p(prefix)

In [3]:
db = gffutils.create_db(gtf, dbfn=gtf_db, disable_infer_genes=True, disable_infer_transcripts=True, merge_strategy='merge', force=True)
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)
for x in db.featuretypes():
    print(x)


CDS
exon
gene
start_codon
stop_codon
transcript

In [5]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    if len(cds)==0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    first_cds = cds[0]
    last_cds = cds[-1]
    for utr in utrs:
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if strand == '+':
            if utr.stop < first_cds.start:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            else:
                raise RuntimeError('Error with cds')
        elif strand == '-':
            if utr.stop < first_cds.start:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)                
            else:
                raise RuntimeError('Error with cds')    
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create bed from list of regions
    bedtype: 0 or 1
        0-Based or 1-based coordinate of the BED
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                             start,
                                             region.stop,
                                             re.sub('\.\d+', '', region.attributes['gene_id'][0]),
                                             '.',
                                             region.strand)
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged

In [6]:
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    exon_regions, intron_regions = [], []
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds        
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))
exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))
intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))
cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))


Out[6]:
<BedTool(/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/MG1655/ASM584v2.38/cds.bed.gz)>

In [7]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)

    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))
stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))


Out[7]:
<BedTool(/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/MG1655/ASM584v2.38/stop_codon.bed.gz)>

In [10]:
cds_bedtool.to_dataframe()


Out[10]:
chrom start end name score strand
0 Chromosome 3485416 3485818 b3356 . -
1 Chromosome 3067342 3068080 b2922 . -
2 Chromosome 2645015 2647325 b2519 . -
3 Chromosome 2644435 2644864 b2518 . -
4 Chromosome 3481288 3483199 b3352 . +
5 Chromosome 3483201 3484221 b3353 . +
6 Chromosome 1583764 1583959 b1500 . -
7 Chromosome 2638665 2639283 b2513 . -
8 Chromosome 2637476 2638652 b2512 . -
9 Chromosome 2635886 2637356 b2511 . -
10 Chromosome 2635601 2635814 b2510 . -
11 Chromosome 2643131 2644283 b2517 . -
12 Chromosome 2641833 2642844 b2516 . -
13 Chromosome 2640688 2641804 b2515 . -
14 Chromosome 2639303 2640575 b2514 . -
15 Chromosome 3066279 3067173 b2921 . -
16 Chromosome 3407606 3409055 b3258 . +
17 Chromosome 3071461 3072622 b2926 . -
18 Chromosome 3072674 3073691 b2927 . -
19 Chromosome 3486119 3486749 b3357 . +
20 Chromosome 3068949 3069807 b2924 . -
21 Chromosome 2295585 2296320 b2199 . -
22 Chromosome 2295379 2295586 b2198 . -
23 Chromosome 2294903 2295380 b2197 . -
24 Chromosome 2292963 2294904 b2196 . -
25 Chromosome 2292409 2292964 b2195 . -
26 Chromosome 2291360 2292410 b2194 . -
27 Chromosome 2290499 2291144 b2193 . +
28 Chromosome 2289067 2290081 b2192 . -
29 Chromosome 4218595 4220329 b4016 . +
... ... ... ... ... ... ...
4111 Chromosome 3376784 3377420 b3229 . -
4112 Chromosome 2021087 2021498 b1946 . +
4113 Chromosome 3370349 3371036 b3223 . -
4114 Chromosome 3367826 3368951 b3220 . +
4115 Chromosome 3369016 3369478 b3221 . -
4116 Chromosome 3373700 3374489 b3226 . -
4117 Chromosome 3374868 3376233 b3227 . +
4118 Chromosome 3371086 3372574 b3224 . -
4119 Chromosome 3372685 3373576 b3225 . -
4120 Chromosome 3813931 3814384 b3640 . +
4121 Chromosome 1962974 1965050 b1879 . -
4122 Chromosome 3977524 3978601 b4481 . +
4123 Chromosome 3814493 3815087 b3641 . +
4124 Chromosome 4459902 4460364 b4237 . -
4125 Chromosome 2933043 2933688 b2800 . -
4126 Chromosome 3815129 3815768 b3642 . -
4127 Chromosome 4213682 4214123 b4012 . -
4128 Chromosome 3816675 3817536 b3644 . +
4129 Chromosome 3817759 3818581 b3645 . +
4130 Chromosome 3821427 3822048 b3648 . +
4131 Chromosome 3822105 3822378 b3649 . +
4132 Chromosome 3818873 3819488 b3646 . +
4133 Chromosome 3284169 3284643 b3138 . +
4134 Chromosome 3284684 3285485 b3139 . +
4135 Chromosome 3281975 3283127 b3136 . +
4136 Chromosome 3283142 3284000 b3137 . +
4137 Chromosome 3278913 3280191 b3132 . +
4138 Chromosome 3280216 3280687 b3133 . +
4139 Chromosome 3277336 3277798 b3130 . +
4140 Chromosome 3277858 3278665 b3131 . -

4141 rows × 6 columns


In [ ]: