Table of Contents


In [10]:
from collections import defaultdict, OrderedDict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses

logging.basicConfig(level=logging.INFO)

In [11]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils'
chr_sizes = '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.sizes'

In [12]:
db = gffutils.create_db(gtf, dbfn=gtf_db, disable_infer_genes=True, disable_infer_transcripts=True, merge_strategy='merge', force=True)
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [13]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)

In [14]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    if len(cds)==0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    first_cds = cds[0]
    last_cds = cds[-1]
    for utr in utrs:
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if strand == '+':
            if utr.stop < first_cds.start:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            else:
                raise RuntimeError('Error with cds')
        elif strand == '-':
            if utr.stop < first_cds.start:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)                
            else:
                raise RuntimeError('Error with cds')    
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create bed from list of regions
    bedtype: 0 or 1
        0-Based or 1-based coordinate of the BED
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                             start,
                                             region.stop,
                                             re.sub('\.\d+', '', region.attributes['gene_id'][0]),
                                             '.',
                                             region.strand)
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged

In [15]:
for x in db.featuretypes():
    print x


CDS
Selenocysteine
exon
five_prime_utr
gene
start_codon
stop_codon
three_prime_utr
transcript

In [16]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))


Out[16]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils.cds.bed)>

In [17]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)

    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))


Out[17]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils.stop_codon.bed)>

In [18]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
    tss_sites = []
    polyA_sites = []
    for transcript in db.children(gene_id, featuretype='transcript'):
        start_t = copy.deepcopy(transcript)
        stop_t = copy.deepcopy(transcript)
        
        start_t.stop = start_t.start + 1
        
        stop_t.start = stop_t.stop
        
        if transcript.strand == '-':
            start_t, stop_t = stop_t, start_t
        polyA_sites.append(start_t)
        tss_sites.append(stop_t)
    merged_polyA_sites = merge_regions(db, polyA_sites)
    renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)    
    merged_tss_sites = merge_regions(db, tss_sites)
    renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
    polyA_sites_bed += create_bed(renamed_polyA_sites)    
    tss_sites_bed += create_bed(renamed_tss_sites)

polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))


Out[18]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils.tss_sites.bed)>

TSS


In [19]:
tss = tsses(db, as_bed6=True, merge_overlapping=True)
tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
promoter = tss.slop(l=1000, r=1000, s=True, g=chr_sizes)
promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))



BEDToolsErrorTraceback (most recent call last)
<ipython-input-19-b60aec5fb42f> in <module>()
----> 1 tss = tsses(db, as_bed6=True, merge_overlapping=True)
      2 tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
      3 promoter = tss.slop(l=1000, r=1000, s=True, g=chr_sizes)
      4 promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/gffutils/pybedtools_integration.pyc in tsses(db, merge_overlapping, attrs, attrs_sep, merge_kwargs, as_bed6)
    207                 f[3])
    208 
--> 209         x = x.merge(**_merge_kwargs).each(fix_merge).saveas()
    210 
    211 

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/bedtool.pyc in decorated(self, *args, **kwargs)
    804             # this calls the actual method in the first place; *result* is
    805             # whatever you get back
--> 806             result = method(self, *args, **kwargs)
    807 
    808             # add appropriate tags

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/bedtool.pyc in wrapped(self, *args, **kwargs)
    335             stream = call_bedtools(cmds, tmp, stdin=stdin,
    336                                    check_stderr=check_stderr,
--> 337                                    decode_output=decode_output,
    338                                    )
    339 

/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/pybedtools/helpers.pyc in call_bedtools(cmds, tmpfn, stdin, check_stderr, decode_output, encode_input)
    354                 sys.stderr.write(stderr)
    355             else:
--> 356                 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
    357 
    358 

BEDToolsError: 
Command was:

	bedtools merge -s -o distinct -i /tmp/26742042.hpc-pbs1.hpcc.usc.edu/pybedtools.9mfjCf.tmp -c 4

Error message was:

*****ERROR: Unrecognized parameter: -o *****


*****ERROR: Unrecognized parameter: distinct *****


*****ERROR: Unrecognized parameter: -c *****


*****ERROR: Unrecognized parameter: 4 *****


Tool:    bedtools merge (aka mergeBed)
Version: v2.17.0
Summary: Merges overlapping BED/GFF/VCF entries into a single interval.

Usage:   bedtools merge [OPTIONS] -i <bed/gff/vcf>

Options: 
	-s	Force strandedness.  That is, only merge features
		that are the same strand.
		- By default, merging is done without respect to strand.

	-n	Report the number of BED entries that were merged.
		- Note: "1" is reported if no merging occurred.

	-d	Maximum distance between features allowed for features
		to be merged.
		- Def. 0. That is, overlapping & book-ended features are merged.
		- (INTEGER)

	-nms	Report the names of the merged features separated by semicolons.

	-scores	Report the scores of the merged features. Specify one of 
		the following options for reporting scores:
		  sum, min, max,
		  mean, median, mode, antimode,
		  collapse (i.e., print a semicolon-separated list),
		- (INTEGER)

Notes: 
	(1) All output, regardless of input type (e.g., GFF or VCF)
	    will in BED format with zero-based starts

	(2) The input file (-i) file must be sorted by chrom, then start.


In [20]:
for l in [1000, 2000, 3000, 4000, 5000]:
    promoter = tss.slop(l=l, r=l, s=True, g=chr_sizes)
    promoter.remove_invalid().sort().saveas('{}.promoter.{}.bed'.format(prefix, l))



NameErrorTraceback (most recent call last)
<ipython-input-20-b239d2b794bd> in <module>()
      1 for l in [1000, 2000, 3000, 4000, 5000]:
----> 2     promoter = tss.slop(l=l, r=l, s=True, g=chr_sizes)
      3     promoter.remove_invalid().sort().saveas('{}.promoter.{}.bed'.format(prefix, l))

NameError: name 'tss' is not defined

tRNA


In [21]:
for x in db.featuretypes():
    print(x)


CDS
Selenocysteine
exon
five_prime_utr
gene
start_codon
stop_codon
three_prime_utr
transcript

In [22]:
tRNA_sites = []
tRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'tRNA' in transcript.attributes['gene_biotype'] or 'Mt_tRNA' in transcript.attributes['transcript_biotype']:
            tRNA_sites.append(transcript)
    #merged_tRNA_sites = merge_regions_nostrand(db, tRNA_sites)
    #renamed_tRNA_sites = rename_regions(merged_tRNA_sites, gene_id)
    tRNA_bed += create_bed(tRNA_sites)

tRNA_bed = '\n'.join(list(OrderedDict.fromkeys(tRNA_bed.split('\n'))))

tRNA_sites_bedtool = pybedtools.BedTool(tRNA_bed, from_string=True)
tRNA_sites_bedtool.remove_invalid().sort().saveas('{}.tRNA_sites.bed'.format(prefix))


Out[22]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils.tRNA_sites.bed)>

In [23]:
tRNA_sites_bedtool.to_dataframe()


Out[23]:
chrom start end name score strand
0 MT 12749 12819 ENSDARG00000082123 . +
1 MT 16526 16596 ENSDARG00000081475 . -
2 MT 6282 6354 ENSDARG00000082789 . -
3 MT 6352 6423 ENSDARG00000080128 . -
4 MT 4851 4922 ENSDARG00000080630 . -
5 MT 15232 15301 ENSDARG00000083312 . -
6 MT 6177 6250 ENSDARG00000081938 . -
7 MT 10938 11008 ENSDARG00000080329 . +
8 MT 12822 12895 ENSDARG00000081280 . +
9 MT 6108 6176 ENSDARG00000080401 . -
10 MT 950 1019 ENSDARG00000083480 . +
11 MT 1971 2042 ENSDARG00000081443 . +
12 MT 12680 12750 ENSDARG00000082716 . +
13 MT 6037 6107 ENSDARG00000080718 . +
14 MT 4923 4992 ENSDARG00000082084 . +
15 MT 16448 16520 ENSDARG00000083462 . +
16 MT 4780 4854 ENSDARG00000083118 . +
17 MT 10519 10589 ENSDARG00000081758 . +
18 MT 8048 8117 ENSDARG00000083519 . +
19 MT 8820 8893 ENSDARG00000080151 . +
20 MT 7975 8046 ENSDARG00000081369 . -
21 MT 3725 3802 ENSDARG00000083046 . +

In [24]:
rRNA_sites = []
rRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
    for transcript in db.children(gene_id, featuretype='transcript'):
        if 'rRNA' in transcript.attributes['gene_biotype']:
            rRNA_sites.append(transcript)
    #renamed_rRNA_sites = rename_regions(rRNA_sites, gene_id)
    rRNA_bed += create_bed(rRNA_sites)
rRNA_bed = '\n'.join(list(OrderedDict.fromkeys(rRNA_bed.split('\n'))))

rRNA_sites_bedtool = pybedtools.BedTool(rRNA_bed, from_string=True)
rRNA_sites_bedtool.remove_invalid().sort().saveas('{}.rRNA_sites.bed'.format(prefix))


Out[24]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gffutils.rRNA_sites.bed)>

In [25]:
rRNA_sites_bedtool. to_dataframe()


Out[25]:
chrom start end name score strand
0 6 38379554 38379675 ENSDARG00000085224 . +
1 19 11807153 11807289 ENSDARG00000084173 . -
2 22 35859941 35860076 ENSDARG00000084170 . -
3 20 34245593 34245726 ENSDARG00000084175 . +
4 6 15417114 15417230 ENSDARG00000109159 . +
5 20 51703089 51703206 ENSDARG00000109150 . +
6 13 32261932 32262049 ENSDARG00000109156 . +
7 2 27578031 27578147 ENSDARG00000109157 . -
8 24 9572108 9572224 ENSDARG00000106397 . -
9 6 20081523 20081640 ENSDARG00000106395 . +
10 8 39772539 39772656 ENSDARG00000106398 . +
11 KN150346.1 28638 28756 ENSDARG00000103780 . +
12 24 28955464 28955581 ENSDARG00000107445 . -
13 2 8391279 8391395 ENSDARG00000107448 . +
14 15 29972095 29972212 ENSDARG00000108497 . -
15 12 26048844 26048961 ENSDARG00000108495 . -
16 24 697763 697878 ENSDARG00000106476 . -
17 2 45220795 45220912 ENSDARG00000106470 . -
18 11 704308 704425 ENSDARG00000106471 . +
19 11 21489953 21490070 ENSDARG00000108723 . +
20 6 46747623 46747742 ENSDARG00000101268 . +
21 2 41571946 41572063 ENSDARG00000106746 . -
22 7 68076129 68076247 ENSDARG00000104720 . +
23 23 34042334 34042457 ENSDARG00000080256 . -
24 10 5949243 5949359 ENSDARG00000108637 . -
25 15 1910085 1910206 ENSDARG00000084403 . +
26 6 54074413 54074548 ENSDARG00000083753 . +
27 17 39694095 39694213 ENSDARG00000085416 . -
28 5 31570016 31570133 ENSDARG00000106217 . -
29 23 429443 429559 ENSDARG00000106215 . +
... ... ... ... ... ... ...
1549 11 26510675 26510813 ENSDARG00000083954 . -
1550 10 38432375 38432495 ENSDARG00000082600 . -
1551 4 17073003 17073121 ENSDARG00000085074 . +
1552 24 14206480 14206601 ENSDARG00000081993 . +
1553 17 39039710 39039830 ENSDARG00000084381 . -
1554 23 9683242 9683365 ENSDARG00000085384 . +
1555 25 5946529 5946648 ENSDARG00000085923 . -
1556 17 39478506 39478627 ENSDARG00000085921 . +
1557 22 2615351 2615472 ENSDARG00000084032 . +
1558 6 8535936 8536057 ENSDARG00000083564 . +
1559 10 12375014 12375131 ENSDARG00000106605 . +
1560 22 8235686 8235803 ENSDARG00000106607 . +
1561 21 18621478 18621594 ENSDARG00000107439 . -
1562 14 33304341 33304457 ENSDARG00000108513 . +
1563 15 18215642 18215758 ENSDARG00000108515 . -
1564 18 39605537 39605657 ENSDARG00000100760 . -
1565 6 57266468 57266604 ENSDARG00000101494 . -
1566 25 15532462 15532580 ENSDARG00000101491 . -
1567 9 30485928 30486045 ENSDARG00000107261 . -
1568 14 29012529 29012646 ENSDARG00000107263 . -
1569 12 43991883 43991999 ENSDARG00000107265 . -
1570 19 27221542 27221659 ENSDARG00000107267 . +
1571 6 40669810 40669927 ENSDARG00000107268 . -
1572 5 60410086 60410205 ENSDARG00000081715 . -
1573 25 10026436 10026555 ENSDARG00000086093 . -
1574 13 26436615 26436734 ENSDARG00000080449 . -
1575 7 15952021 15952140 ENSDARG00000084639 . +
1576 8 5011460 5011582 ENSDARG00000083369 . -
1577 6 33741356 33741458 ENSDARG00000082109 . -
1578 12 9610384 9610502 ENSDARG00000082100 . -

1579 rows × 6 columns


In [ ]: