In [1]:
from collections import defaultdict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses
logging.basicConfig(level=logging.INFO)
In [6]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_Toronto_2012.gff'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_Toronto_2012.gff.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_Toronto_2012.gffutils'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/fasta/BY4741_Toronto_2012.chrom.sizes'
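The chrsizes file is a standard two-column, tab-separated table of chromosome name and length; pybedtools uses it as the genome file for the slop calls further down. If it does not exist yet, one way to generate it is from the genome FASTA, for example with pyfaidx. This is a minimal sketch under that assumption; the FASTA path and the use of pyfaidx are guesses, not part of the notebook.

from pyfaidx import Fasta

# Hypothetical: derive chrom.sizes from the genome FASTA (assumed path)
genome_fasta = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/fasta/BY4741_Toronto_2012.fa'
fasta = Fasta(genome_fasta)
with open(chrsizes, 'w') as fh:
    for name in fasta.keys():
        fh.write('{}\t{}\n'.format(name, len(fasta[name])))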
In [30]:
db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge',
force=True)
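Building the gffutils database is the slow, one-off step: merge_strategy='merge' tells gffutils to merge the attributes of features that share the same ID instead of raising an error on duplicates, and force=True overwrites any existing database file. Afterwards the database is reopened from disk with gffutils.FeatureDB rather than rebuilt.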
In [102]:
def create_gene_dict(db):
    '''
    Store each feature from db.all_features() in a nested dict:
    gene_id -> transcript_id -> featuretype -> list of features.
    '''
gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for line_no, feature in enumerate(db.all_features()):
feature_type = feature.featuretype
if feature_type == 'exon':
            print(feature)
if feature_type == 'contig':
continue
elif feature_type == 'gene':
gene_ids = feature.attributes['ID']
            if len(gene_ids) != 1:
                logging.warning('Found multiple gene ids on line {} in GFF'.format(line_no))
break
else:
gene_id = gene_ids[0]
feature.attributes['gene_id'] = gene_id
gene_dict[gene_id]['gene'] = feature
elif feature_type == 'mRNA':
gene_id = feature.attributes['Parent']
transcript_id = feature.attributes['ID']
assert len(gene_id) == 1
assert len(transcript_id) == 1
feature.attributes['gene_id'] = gene_id
gene_dict[gene_id[0]][transcript_id[0]][feature_type].append(feature)
else:
try:
transcript_id = feature.attributes['Parent']
            except KeyError:
                # Features without a Parent attribute (e.g. chromosome or
                # region entries) cannot be assigned to a transcript; skip them
                continue
#print feature
for gene_id in gene_dict.keys():
assert len(transcript_id) == 1
                ## Insert only where this gene actually has this transcript
if transcript_id[0] in gene_dict[gene_id].keys():
feature.attributes['gene_id'] = gene_id
gene_dict[gene_id][transcript_id[0]][feature_type].append(feature)
return gene_dict
for x in db.featuretypes():
    print(x)
In [103]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)
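The export cells below call several helpers whose definitions are not shown in this section: get_gene_list, merge_regions, merge_regions_nostrand, rename_regions and create_bed. As a rough guide only, here is a hypothetical sketch of what they are assumed to do, inferred from how they are called: list the gene ids in gene_dict, collapse overlapping intervals belonging to one gene (with or without regard to strand), tag the merged intervals with the gene id, and emit 6-column, 0-based BED text. These are stand-ins, not the notebook's actual definitions.

def get_gene_list(gene_dict):
    # Assumed: the genes to export are simply the keys of gene_dict
    return list(gene_dict.keys())

def merge_regions(db, regions):
    # Assumed: collapse overlapping features of one gene into non-redundant
    # intervals; gffutils' FeatureDB.merge expects the features sorted
    regions = sorted(list(regions), key=lambda r: r.start)
    if len(regions) == 0:
        return []
    return list(db.merge(regions))

def merge_regions_nostrand(db, regions):
    # Assumed: same as merge_regions but ignoring strand (older gffutils
    # versions expose an ignore_strand flag on FeatureDB.merge for this)
    regions = sorted(list(regions), key=lambda r: r.start)
    if len(regions) == 0:
        return []
    return list(db.merge(regions, ignore_strand=True))

def rename_regions(regions, gene_id):
    # Assumed: tag every merged interval with the gene it came from so that
    # the BED name column carries the gene_id
    regions = list(regions)
    for region in regions:
        region.attributes['gene_id'] = [gene_id]
    return regions

def create_bed(regions):
    # Assumed: write 6-column BED text; GFF coordinates are 1-based and
    # inclusive, so the start is shifted by one for the 0-based BED start
    bedstr = ''
    for region in regions:
        name = region.attributes['gene_id']
        if isinstance(name, (list, tuple)):
            name = name[0]
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            region.chrom, region.start - 1, region.stop, name, '.', region.strand)
    return bedstr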
In [99]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''
gene_list = []
for gene_id in get_gene_list(gene_dict):
gene_list.append(gene_dict[gene_id]['gene'])
utr5_regions, utr3_regions = [], []
exon_regions, intron_regions = [], []
    start_codon_regions, stop_codon_regions = [], []
cds_regions = []
for feature in gene_dict[gene_id].keys():
if feature == 'gene':
continue
cds = list(gene_dict[gene_id][feature]['CDS'])
exons = list(gene_dict[gene_id][feature]['exon'])
merged_exons = merge_regions(db, exons)
introns = list(gene_dict[gene_id][feature]['intron'])
merged_introns = merge_regions(db, introns)
exon_regions += exons
intron_regions += introns
cds_regions += cds
if exon_regions:
        print(gene_id, exon_regions)
merged_exons = merge_regions(db, exon_regions)
renamed_exons = rename_regions(merged_exons, gene_id)
merged_introns = merge_regions(db, intron_regions)
renamed_introns = rename_regions(merged_introns, gene_id)
merged_cds = merge_regions_nostrand(db, cds_regions)
renamed_cds = rename_regions(merged_cds, gene_id)
exon_bed += create_bed(renamed_exons)
intron_bed += create_bed(renamed_introns)
cds_bed += create_bed(renamed_cds)
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)
gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))
Out[99]:
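As a quick sanity check (not part of the original analysis), the files written above can be read back with pybedtools and inspected:

# Hypothetical check: count and preview the exported CDS intervals
cds_check = pybedtools.BedTool('{}.cds.bed'.format(prefix))
print(cds_check.count())
cds_check.head(5)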
In [ ]:
gene_list
In [14]:
for gene_id in get_gene_list(gene_dict):
start_codons = []
stop_codons = []
for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## GFF coordinates are 1-based and inclusive; collapse the codon to its
        ## first base (the 0-based BED start is handled while converting to BED)
start_codon.stop = start_codon.start
start_codons.append(start_codon)
for stop_codon in db.children(gene_id, featuretype='stop_codon'):
stop_codon.start = stop_codon.stop
stop_codon.stop = stop_codon.stop+1
stop_codons.append(stop_codon)
merged_start_codons = merge_regions(db, start_codons)
renamed_start_codons = rename_regions(merged_start_codons, gene_id)
merged_stop_codons = merge_regions(db, stop_codons)
renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
start_codon_bed += create_bed(renamed_start_codons)
stop_codon_bed += create_bed(renamed_stop_codons)
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))
Out[14]:
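To make the coordinate handling above concrete: a start codon annotated at 1-based GFF positions 100-102 is collapsed to the single base 100, which a 0-based BED conversion (shifting the start by one) writes as the interval 99-100; a stop codon at 200-202 becomes the 1-based interval 202-203, i.e. BED 201-203, covering the last codon base and the base immediately after it. This worked example assumes the BED conversion subtracts one from the GFF start, as hinted by the comments in the cell above.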
In [15]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
tss_sites = []
polyA_sites = []
for transcript in db.children(gene_id, featuretype='transcript'):
start_t = copy.deepcopy(transcript)
stop_t = copy.deepcopy(transcript)
start_t.stop = start_t.start + 1
stop_t.start = stop_t.stop
        # on the minus strand the genomic start is the polyA end and the
        # genomic end is the TSS, so swap the two single-base features
        if transcript.strand == '-':
            start_t, stop_t = stop_t, start_t
        tss_sites.append(start_t)
        polyA_sites.append(stop_t)
merged_polyA_sites = merge_regions(db, polyA_sites)
renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)
merged_tss_sites = merge_regions(db, tss_sites)
renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
polyA_sites_bed += create_bed(renamed_polyA_sites)
tss_sites_bed += create_bed(renamed_tss_sites)
polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))
Out[15]:
In [16]:
tss = tsses(db, as_bed6=True, merge_overlapping=True)
tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
promoter = tss.slop(l=1000, r=1000, s=True, g=chrsizes)
promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))
Out[16]:
In [17]:
for l in [1000, 2000, 3000, 4000, 5000]:
promoter = tss.slop(l=l, r=l, s=True, g=chrsizes)
promoter.remove_invalid().sort().saveas('{}.promoter.{}.bed'.format(prefix, l))
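As a small downstream example (an assumed use, not part of the notebook), the promoter windows can be intersected with the exported gene BED to count how many genes each window overlaps:

# Hypothetical follow-up: count gene overlaps per 1000 bp promoter window
promoters_1000 = pybedtools.BedTool('{}.promoter.1000.bed'.format(prefix))
genes = pybedtools.BedTool('{}.genes.bed'.format(prefix))
promoters_1000.intersect(genes, c=True).head(5)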