In [1]:
from collections import defaultdict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses
logging.basicConfig(level=logging.INFO)
In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gffutils'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.sizes'
In [4]:
db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge',
disable_infer_transcripts=True,
disable_infer_genes = True,
force=True)
In [9]:
def create_gene_dict(db):
'''
Store each feature line db.all_features() as a dict of dicts
'''
gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for line_no, feature in enumerate(db.all_features()):
gene_ids = feature.attributes['gene_id']
feature_type = feature.featuretype
if feature_type == 'gene':
if len(gene_ids)!=1:
logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
break
else:
gene_id = gene_ids[0]
gene_dict[gene_id]['gene'] = feature
else:
transcript_ids = feature.attributes['transcript_id']
for gene_id in gene_ids:
for transcript_id in transcript_ids:
gene_dict[gene_id][transcript_id][feature_type].append(feature)
return gene_dict
for x in db.featuretypes():
print x
In [10]:
gene_dict = create_gene_dict(db)
In [6]:
def get_gene_list(gene_dict):
return list(set(gene_dict.keys()))
def get_UTR_regions(gene_dict, gene_id, transcript, cds):
if len(cds)==0:
return [], []
utr5_regions = []
utr3_regions = []
utrs = gene_dict[gene_id][transcript]['UTR']
first_cds = cds[0]
last_cds = cds[-1]
for utr in utrs:
## Push all cds at once
## Sort later to remove duplicates
strand = utr.strand
if strand == '+':
if utr.stop < first_cds.start:
utr.feature_type = 'five_prime_UTR'
utr5_regions.append(utr)
elif utr.start > last_cds.stop:
utr.feature_type = 'three_prime_UTR'
utr3_regions.append(utr)
else:
raise RuntimeError('Error with cds')
elif strand == '-':
if utr.stop < first_cds.start:
utr.feature_type = 'three_prime_UTR'
utr3_regions.append(utr)
elif utr.start > last_cds.stop:
utr.feature_type = 'five_prime_UTR'
utr5_regions.append(utr)
else:
raise RuntimeError('Error with cds')
return utr5_regions, utr3_regions
def create_bed(regions, bedtype='0'):
'''Create bed from list of regions
bedtype: 0 or 1
0-Based or 1-based coordinate of the BED
'''
bedstr = ''
for region in regions:
assert len(region.attributes['gene_id']) == 1
## GTF start is 1-based, so shift by one while writing
## to 0-based BED format
if bedtype == '0':
start = region.start - 1
else:
start = region.start
bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
start,
region.stop,
re.sub('\.\d+', '', region.attributes['gene_id'][0]),
'.',
region.strand)
return bedstr
def rename_regions(regions, gene_id):
regions = list(regions)
if len(regions) == 0:
return []
for region in regions:
region.attributes['gene_id'] = gene_id
return regions
def merge_regions(db, regions):
if len(regions) == 0:
return []
merged = db.merge(sorted(list(regions), key=lambda x: x.start))
return merged
def merge_regions_nostrand(db, regions):
if len(regions) == 0:
return []
merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
return merged
In [11]:
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''
gene_list = []
for gene_id in get_gene_list(gene_dict):
gene_list.append(gene_dict[gene_id]['gene'])
exon_regions, intron_regions = [], []
star_codon_regions, stop_codon_regions = [], []
cds_regions = []
for feature in gene_dict[gene_id].keys():
if feature == 'gene':
continue
cds = list(gene_dict[gene_id][feature]['CDS'])
exons = list(gene_dict[gene_id][feature]['exon'])
merged_exons = merge_regions(db, exons)
introns = db.interfeatures(merged_exons)
exon_regions += exons
intron_regions += introns
cds_regions += cds
merged_exons = merge_regions(db, exon_regions)
renamed_exons = rename_regions(merged_exons, gene_id)
merged_introns = merge_regions(db, intron_regions)
renamed_introns = rename_regions(merged_introns, gene_id)
merged_cds = merge_regions(db, cds_regions)
renamed_cds = rename_regions(merged_cds, gene_id)
exon_bed += create_bed(renamed_exons)
intron_bed += create_bed(renamed_introns)
cds_bed += create_bed(renamed_cds)
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)
gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))
Out[11]:
In [ ]: