In [10]:
import copy
import errno
import logging
import os
import re
import warnings
from collections import defaultdict, OrderedDict
# Callable moved to collections.abc in Python 3.3; the collections alias was
# removed in Python 3.10, so it must be imported from collections.abc.
from collections.abc import Callable
from copy import deepcopy

import gffutils
import pandas as pd
import pybedtools
from gffutils.pybedtools_integration import tsses
def mkdir_p(path):
    """Create directory `path` (and parents) like `mkdir -p`.

    Succeeds silently when the directory already exists; raises OSError
    (FileExistsError) when `path` exists but is not a directory, matching
    the original errno.EEXIST check.
    """
    # Python 3 idiom: exist_ok=True replaces the Python-2-era
    # try/except-EEXIST dance with identical semantics.
    os.makedirs(path, exist_ok=True)
class DefaultOrderedDict(OrderedDict):
    """OrderedDict with defaultdict-style auto-creation of missing keys.

    Source: http://stackoverflow.com/a/6190500/562769
    """

    def __init__(self, default_factory=None, *a, **kw):
        # Fix: use the callable() builtin instead of
        # isinstance(default_factory, collections.Callable) — that alias was
        # removed in Python 3.10.
        if (default_factory is not None and
                not callable(default_factory)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        # Mirror defaultdict: create, store and return the default value.
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        # Pickle support; the fifth element must be an iterator over items.
        if self.default_factory is None:
            args = tuple()
        else:
            args = (self.default_factory,)
        return type(self), args, None, None, iter(self.items())

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        # Shallow copy: values are shared with the original.
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        # Fix: deep-copying the dict_items view directly raises TypeError on
        # Python 3 (views cannot be pickled); materialize it as a list first.
        return type(self)(self.default_factory,
                          copy.deepcopy(list(self.items()), memo))

    def __repr__(self):
        return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,
                                               OrderedDict.__repr__(self))
In [11]:
# Input annotation (Ensembl release 96, Macaca mulatta Mmul_8.0.1) and outputs.
# NOTE(review): absolute cluster paths — not reproducible on another machine.
gtf = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf'
# sqlite database built from the GTF by gffutils (created in a later cell)
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf.db'
# output directory for the derived BED files
prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v96/'
# chromosome sizes file: two columns (name, length)
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
mkdir_p(prefix)
In [3]:
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts:
    gene_dict[gene_id]['gene'] -> the gene Feature itself, and
    gene_dict[gene_id][transcript_id][feature_type] -> list of Features.
    '''
    gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids) != 1:
                # Fix: the original called logging.warning, but `logging` is
                # never imported in this notebook (NameError); `warnings` is.
                warnings.warn('Found multiple gene_ids on line {} in gtf'.format(line_no))
                # NOTE(review): `break` aborts parsing of the ENTIRE file at
                # the first malformed gene record; `continue` (skip just this
                # record) may have been intended — confirm before changing.
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            # Non-gene features (transcript/exon/CDS/UTR/...) are filed under
            # every (gene_id, transcript_id) pair they carry.
            transcript_ids = feature.attributes['transcript_id']
            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict
In [4]:
# One-time (slow) build of the gffutils sqlite database from the GTF.
# force=True overwrites any existing db; transcript/gene inference is disabled
# because the Ensembl GTF already contains explicit gene/transcript records.
db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge', force=True, disable_infer_transcripts=True, disable_infer_genes=True)
In [5]:
# Reopen the prebuilt database and index all features by gene/transcript.
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)
In [6]:
# Sanity check: list the feature types present in this annotation
# (determines which keys exist in gene_dict, e.g. five_prime_utr vs UTR).
for x in db.featuretypes():
    print(x)
In [7]:
def get_gene_list(gene_dict):
    """Return the unique gene ids present in `gene_dict` (order not guaranteed)."""
    return list(set(gene_dict))
def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    """Classify a transcript's 'UTR' features as 5' or 3' relative to its CDS.

    A UTR entirely upstream of the first CDS is 5' on the + strand and 3' on
    the - strand (and vice versa downstream of the last CDS). Each classified
    feature gets its `feature_type` attribute set accordingly. Returns the
    pair (utr5_regions, utr3_regions); both empty when `cds` is empty.
    Raises RuntimeError when a +/- stranded UTR overlaps the CDS span.
    """
    if not cds:
        return [], []
    five_prime, three_prime = [], []
    first_cds, last_cds = cds[0], cds[-1]
    for utr in gene_dict[gene_id][transcript]['UTR']:
        # Features with an unknown strand are silently skipped, as before.
        if utr.strand not in ('+', '-'):
            continue
        upstream = utr.stop < first_cds.start
        downstream = utr.start > last_cds.stop
        if not upstream and not downstream:
            raise RuntimeError('Error with cds')
        # + strand: upstream => 5', downstream => 3'; - strand: mirrored.
        if (utr.strand == '+') == upstream:
            utr.feature_type = 'five_prime_UTR'
            five_prime.append(utr)
        else:
            utr.feature_type = 'three_prime_UTR'
            three_prime.append(utr)
    return five_prime, three_prime
def create_bed(regions, bedtype='0'):
    '''Create a BED-formatted string from a list of GTF-like regions.

    Each region must expose .chrom, .start, .stop, .strand and an
    attributes['gene_id'] list with exactly one entry; any trailing version
    suffix (".N") is stripped from the gene id for the BED name column.

    bedtype: '0' or '1'
        0-based or 1-based start coordinate of the BED output.
    '''
    rows = []
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        # Fix: raw string for the regex — '\.' / '\d' in a plain string are
        # invalid escape sequences (DeprecationWarning in modern Python).
        gene_id = re.sub(r'\.\d+', '', region.attributes['gene_id'][0])
        rows.append('{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                                      start,
                                                      region.stop,
                                                      gene_id,
                                                      '.',
                                                      region.strand))
    # join() avoids quadratic string concatenation over many regions.
    return ''.join(rows)
def rename_regions(regions, gene_id):
    """Stamp `gene_id` onto every region's attributes and return them as a list.

    Accepts any iterable of regions; returns [] for an empty input. The
    regions are mutated in place (attributes['gene_id'] is overwritten).
    """
    region_list = list(regions)
    if not region_list:
        return []
    for region in region_list:
        region.attributes['gene_id'] = gene_id
    return region_list
def merge_regions(db, regions):
    """Merge overlapping regions via db.merge() after sorting by start.

    Returns [] for an empty input without touching the database.
    """
    region_list = list(regions)
    if not region_list:
        return []
    region_list.sort(key=lambda region: region.start)
    return db.merge(region_list)
def merge_regions_nostrand(db, regions):
    """Like merge_regions(), but merges across strands (ignore_strand=True).

    Returns [] for an empty input without touching the database.
    """
    region_list = list(regions)
    if not region_list:
        return []
    region_list.sort(key=lambda region: region.start)
    return db.merge(region_list, ignore_strand=True)
In [8]:
# Build per-gene BED text for each genomic sub-feature class, then write the
# merged, sorted BED files under `prefix`.
# NOTE(review): start_codon_bed / stop_codon_bed are initialised here but only
# filled in the NEXT cell (In [9]) — cross-cell state; keep cells in order.
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''
gene_list = []
for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    # NOTE(review): typo "star_codon_regions"; both codon lists are assigned
    # here but never used in this cell (codons are handled in the next cell).
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    # `feature` iterates transcript ids (the 'gene' key is the gene record).
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        # Introns are the gaps between this transcript's merged exons.
        introns = db.interfeatures(merged_exons)
        #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        # Ensembl GTFs annotate UTR sides explicitly, so the manual
        # classification above is bypassed.
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
    # Merge each feature class across all transcripts of this gene, then
    # re-label the merged intervals with the gene id.
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)
# Remove any CDS overlap from the UTR intervals before writing.
utr5_cds_subtracted = utr5_bedtool.subtract(cds_bedtool)
utr3_cds_subtracted = utr3_bedtool.subtract(cds_bedtool)
utr5_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr5.bed.gz'))
utr3_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr3.bed.gz'))
gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))
exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))
intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))
cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))
Out[8]:
In [9]:
# Collect single-base anchors for start/stop codons per gene and write them.
# NOTE(review): start_codon_bed / stop_codon_bed must already exist — they are
# initialised in the previous cell (cross-cell state).
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## 1 -based stop
        ## 0-based start handled while converting to bed
        # Collapse the codon to its first base (stop := start).
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        # Collapse to the base just past the codon's last position
        # (presumably to anchor downstream of the stop codon — confirm).
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    start_codon_bed += create_bed(renamed_start_codons)
    stop_codon_bed += create_bed(renamed_stop_codons)
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))
stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))
Out[9]:
In [1]:
from collections import defaultdict
import gzip
import pandas as pd
import re
# The eight fixed columns of a GTF line; column 9 (attributes) is parsed
# separately by parse().
GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score',
              'strand', 'frame']
R_SEMICOLON = re.compile(r'\s*;\s*')        # separates attribute entries
R_COMMA = re.compile(r'\s*,\s*')            # splits multi-valued attributes
R_KEYVALUE = re.compile(r'(\s+|\s*=\s*)')   # key/value delimiter (space or '=')
def dataframe(filename):
    """Open an optionally gzipped GTF file and return a pandas.DataFrame.

    Builds one column per attribute key seen anywhere in the file; rows read
    before a key first appears get None for that column.
    """
    # Each column is a list of cell values, keyed by column name.
    columns = defaultdict(list)
    for row_index, record in enumerate(lines(filename)):
        # Backfill None for columns this file has not shown us until now.
        for key in record:
            if key not in columns:
                columns[key] = [None] * row_index
        # Every known column receives a value (None when missing) for this row.
        for key in columns:
            columns[key].append(record.get(key))
    return pd.DataFrame(columns)
def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.

    Comment lines (starting with '#') are skipped; every other line is fed
    through parse().
    """
    # Fix: gzip.open() defaults to binary mode, so iteration yielded bytes and
    # line.startswith('#') raised TypeError for .gz inputs. Open both plain
    # and gzipped files in text mode so downstream code always sees str.
    if filename.endswith('.gz'):
        fh = gzip.open(filename, 'rt')
    else:
        fh = open(filename)
    with fh:
        for line in fh:
            if line.startswith('#'):
                continue
            yield parse(line)
def parse(line):
    """Parse a single GTF line and return a dict.

    The eight fixed columns are mapped through GTF_HEADER; the ninth
    (attributes) column contributes one key per entry, with bare values
    falling back to synthetic keys 'INFO1', 'INFO2', ...
    """
    result = {}
    fields = line.rstrip().split('\t')
    for i, col in enumerate(GTF_HEADER):
        result[col] = _get_value(fields[i])
    # INFO field consists of "key1=value;key2=value;...".
    infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]
    for i, info in enumerate(infos, 1):
        # It should be key="value".
        try:
            # Fix: pass maxsplit by keyword — positional maxsplit to re.split
            # is deprecated (DeprecationWarning as of Python 3.13).
            key, _, value = re.split(R_KEYVALUE, info, maxsplit=1)
        # But sometimes it is just "value".
        except ValueError:
            key = 'INFO{}'.format(i)
            value = info
        # Ignore the field if there is no value.
        if value:
            result[key] = _get_value(value)
    return result
def _get_value(value):
    """Normalize a raw GTF field: strip quotes, split comma lists, map
    empty/'.'/'NA' sentinels to None."""
    if not value:
        return None
    # Strip double and single quotes.
    value = value.strip('"\'')
    # A comma means the field holds a list of values.
    if ',' in value:
        return re.split(R_COMMA, value)
    # These sentinels are equivalent to None.
    if value in ('', '.', 'NA'):
        return None
    return value
def get_seq(row):
    # NOTE(review): `fasta`, `Seq` and `generic_dna` are never defined or
    # imported anywhere in this notebook — presumably a pyfaidx.Fasta handle
    # and Bio.Seq/Bio.Alphabet leftovers from an earlier session. This cell
    # will NameError on a fresh kernel; TODO restore those definitions.
    # Fetch the sequence for one GTF row, reverse-complemented on '-' strand.
    chrom = row['seqname']
    start = row['start']
    end = row['end']
    strand = row['strand']
    if strand == '+':
        return fasta.get_seq(chrom, start, end)
    else:
        seq = fasta.get_seq(chrom, start, end)
        return str(Seq(seq, generic_dna).reverse_complement())
In [6]:
# NOTE(review): displayed here but assigned only in a LATER cell (In [7]) —
# relies on out-of-order execution and fails under Restart & Run All.
chrom_sizes
Out[6]:
In [2]:
# Parse the full GTF into a flat DataFrame (one row per feature line).
gtf_df = dataframe('/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf')
In [7]:
# Build a chrom -> size lookup from the two-column sizes file.
chrom_sizes = pd.read_table('/home/cmb-06/as/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes',
                            names=['chrom', 'size'])
# Name is reused for a different type here: DataFrame -> dict of pairs.
chrom_sizes = dict(chrom_sizes.values.tolist())
In [8]:
def is_within_chrom_size(chrom, end):
    """Return 1 when `end` does not run past the recorded length of `chrom`,
    else 0. Reads the module-level `chrom_sizes` dict."""
    chrom_length = int(chrom_sizes[str(chrom)])
    return int(int(end) <= chrom_length)
# Flag every GTF record whose end coordinate exceeds its chromosome's length.
gtf_df['is_within_chrom_size'] = gtf_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1)
In [9]:
# Display the offending records (end coordinate beyond the chromosome).
gtf_df[gtf_df['is_within_chrom_size']!=1]
Out[9]:
In [12]:
# Reopen the gffutils database (needed after a kernel restart).
db = gffutils.FeatureDB(gtf_db, keep_order=True)
In [14]:
# Write a corrected GTF, clamping each feature's end to its chromosome length
# so downstream tools do not choke on out-of-bounds coordinates.
with open('/home/cmb-06/as/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96_fixed.gtf', 'w') as fout:
    for feature in db.all_features():
        max_size = chrom_sizes[feature.chrom]
        # If end is longer than the chromosome
        if feature.end > max_size:
            feature.end = max_size
        fout.write(str(feature) + '\n')
In [ ]: