In [12]:
from collections import defaultdict, OrderedDict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses

logging.basicConfig(level=logging.INFO)

In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/WBcel235/fasta/Caenorhabditis_elegans.WBcel235.dna.toplevel.sizes'

In [3]:
db = gffutils.create_db(gtf, dbfn=gtf_db, disable_infer_genes=True, disable_infer_transcripts=True, merge_strategy='merge', force=True)
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)

In [5]:
for x in db.featuretypes():
    print(x)


CDS
Selenocysteine
exon
five_prime_utr
gene
start_codon
stop_codon
three_prime_utr
transcript

In [7]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    if len(cds)==0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    first_cds = cds[0]
    last_cds = cds[-1]
    for utr in utrs:
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if strand == '+':
            if utr.stop < first_cds.start:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            else:
                raise RuntimeError('Error with cds')
        elif strand == '-':
            if utr.stop < first_cds.start:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)                
            else:
                raise RuntimeError('Error with cds')    
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create bed from list of regions
    bedtype: 0 or 1
        0-Based or 1-based coordinate of the BED
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                             start,
                                             region.stop,
                                             re.sub('\.\d+', '', region.attributes['gene_id'][0]),
                                             '.',
                                             region.strand)
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged

In [8]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))


Out[8]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils.cds.bed)>

In [9]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)

    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))


Out[9]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils.stop_codon.bed)>

In [10]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
    tss_sites = []
    polyA_sites = []
    for transcript in db.children(gene_id, featuretype='transcript'):
        start_t = copy.deepcopy(transcript)
        stop_t = copy.deepcopy(transcript)
        
        start_t.stop = start_t.start + 1
        
        stop_t.start = stop_t.stop
        
        if transcript.strand == '-':
            start_t, stop_t = stop_t, start_t
        polyA_sites.append(start_t)
        tss_sites.append(stop_t)
    merged_polyA_sites = merge_regions(db, polyA_sites)
    renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)    
    merged_tss_sites = merge_regions(db, tss_sites)
    renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
    polyA_sites_bed += create_bed(renamed_polyA_sites)    
    tss_sites_bed += create_bed(renamed_tss_sites)

polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))


Out[10]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils.tss_sites.bed)>

In [13]:
tRNA_sites = []
tRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'tRNA' in transcript.attributes['gene_biotype'] or 'Mt_tRNA' in transcript.attributes['transcript_biotype']:
            tRNA_sites.append(transcript)
    #merged_tRNA_sites = merge_regions_nostrand(db, tRNA_sites)
    #renamed_tRNA_sites = rename_regions(merged_tRNA_sites, gene_id)
    tRNA_bed += create_bed(tRNA_sites)

tRNA_bed = '\n'.join(list(OrderedDict.fromkeys(tRNA_bed.split('\n'))))

tRNA_sites_bedtool = pybedtools.BedTool(tRNA_bed, from_string=True)
tRNA_sites_bedtool.remove_invalid().sort().saveas('{}.tRNA_sites.bed'.format(prefix))


Out[13]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils.tRNA_sites.bed)>

In [14]:
tRNA_sites_bedtool.to_dataframe()


Out[14]:
chrom start end name score strand
0 X 8409295 8409377 WBGene00022981 . -
1 X 8417911 8417993 WBGene00022980 . -
2 X 8820184 8820256 WBGene00023119 . +
3 X 8819680 8819752 WBGene00023118 . +
4 X 410390 410462 WBGene00023117 . +
5 X 3994601 3994678 WBGene00023116 . +
6 IV 3524818 3524891 WBGene00023115 . +
7 II 4565015 4565087 WBGene00023113 . -
8 II 4565388 4565460 WBGene00023112 . +
9 X 3035975 3036047 WBGene00023111 . -
10 X 3036163 3036235 WBGene00023110 . +
11 X 8640793 8640866 WBGene00022989 . -
12 X 8650708 8650781 WBGene00022988 . -
13 X 8638921 8638993 WBGene00022986 . +
14 I 9320177 9320249 WBGene00012377 . +
15 IV 3552188 3552260 WBGene00022894 . -
16 IV 322631 322703 WBGene00023092 . -
17 III 6513936 6514018 WBGene00023093 . -
18 X 8623997 8624069 WBGene00023090 . -
19 X 8623763 8623835 WBGene00023091 . -
20 V 6559756 6559828 WBGene00023096 . +
21 III 6888805 6888877 WBGene00023097 . +
22 IV 1514591 1514663 WBGene00023094 . +
23 V 6551853 6551935 WBGene00023095 . -
24 III 6888609 6888681 WBGene00023098 . -
25 III 6887727 6887800 WBGene00023099 . -
26 IV 13870264 13870336 WBGene00009206 . +
27 IV 5967159 5967232 WBGene00022903 . +
28 X 1589666 1589740 WBGene00022902 . -
29 X 1590079 1590153 WBGene00022901 . -
... ... ... ... ... ... ...
604 IV 15311117 15311199 WBGene00014604 . -
605 IV 15604311 15604383 WBGene00014607 . -
606 IV 15793414 15793487 WBGene00014606 . +
607 III 11554824 11554897 WBGene00014600 . +
608 IV 15318477 15318559 WBGene00014603 . -
609 IV 15235387 15235460 WBGene00014602 . +
610 IV 15933352 15933424 WBGene00014609 . +
611 IV 15932009 15932081 WBGene00014608 . +
612 X 14010252 14010324 WBGene00014274 . +
613 X 11806587 11806660 WBGene00014276 . -
614 III 1164799 1164873 WBGene00023197 . -
615 III 1163353 1163427 WBGene00023196 . -
616 III 1424490 1424562 WBGene00023195 . +
617 X 1764253 1764327 WBGene00023194 . -
618 IV 6481160 6481233 WBGene00023192 . +
619 III 2711710 2711782 WBGene00023190 . +
620 I 946043 946115 WBGene00023199 . -
621 I 1575719 1575792 WBGene00023198 . +
622 MtDNA 13274 13327 WBGene00014473 . +
623 MtDNA 9592 9647 WBGene00014470 . +
624 MtDNA 10347 10401 WBGene00014471 . +
625 X 13307033 13307106 WBGene00014478 . +
626 X 15200816 15200888 WBGene00014515 . -
627 X 15201006 15201078 WBGene00014514 . +
628 II 9773653 9773728 WBGene00014513 . +
629 I 9327636 9327708 WBGene00012653 . +
630 V 17317878 17317962 WBGene00014362 . -
631 X 15241354 15241427 WBGene00014369 . -
632 I 11505550 11505622 WBGene00014363 . +
633 II 9566240 9566313 WBGene00014365 . +

634 rows × 6 columns


In [15]:
rRNA_sites = []
rRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'rRNA' in transcript.attributes['gene_biotype']:
            rRNA_sites.append(transcript)
    #renamed_rRNA_sites = rename_regions(rRNA_sites, gene_id)
    rRNA_bed += create_bed(rRNA_sites)
rRNA_bed = '\n'.join(list(OrderedDict.fromkeys(rRNA_bed.split('\n'))))

rRNA_sites_bedtool = pybedtools.BedTool(rRNA_bed, from_string=True)
rRNA_sites_bedtool.remove_invalid().sort().saveas('{}.rRNA_sites.bed'.format(prefix))


Out[15]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/WBcel235/annotation/Caenorhabditis_elegans.WBcel235.91.gffutils.rRNA_sites.bed)>

In [16]:
rRNA_sites_bedtool. to_dataframe()


Out[16]:
chrom start end name score strand
0 V 17120967 17121086 WBGene00014621 . +
1 MtDNA 897 1593 WBGene00014454 . +
2 V 17120012 17120111 WBGene00077468 . +
3 V 17121943 17122062 WBGene00077469 . +
4 V 17117862 17117981 WBGene00077466 . +
5 V 17118836 17118935 WBGene00077467 . +
6 V 17115902 17116021 WBGene00077465 . +
7 I 15064837 15068346 WBGene00004622 . +
8 V 17428878 17428996 WBGene00189966 . +
9 I 15064300 15064453 WBGene00004567 . +
10 I 15062082 15063836 WBGene00004512 . +
11 I 15069279 15071033 WBGene00004513 . +
12 I 6169083 6169195 WBGene00235197 . +
13 V 17129375 17129494 WBGene00077475 . +
14 V 17128403 17128522 WBGene00077474 . +
15 V 17131337 17131456 WBGene00077477 . +
16 V 17130361 17130480 WBGene00077476 . +
17 V 17123895 17124014 WBGene00077471 . +
18 V 17122919 17123038 WBGene00077470 . +
19 V 17125847 17125966 WBGene00077473 . +
20 V 17124871 17124990 WBGene00077472 . +
21 MtDNA 10402 11354 WBGene00014472 . +

In [ ]: