In [10]:
from collections import defaultdict, OrderedDict
import warnings
import gffutils
import pybedtools
import pandas as pd
import copy
import os
import re
from gffutils.pybedtools_integration import tsses
from copy import deepcopy
from collections.abc import Callable
import errno

def mkdir_p(path):
    try:
        os.makedirs(path)
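        # Note: on Python >= 3.2, os.makedirs(path, exist_ok=True) would replace this try/except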
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
            
class DefaultOrderedDict(OrderedDict):
    # Source: http://stackoverflow.com/a/6190500/562769
    def __init__(self, default_factory=None, *a, **kw):
        if (default_factory is not None and
           not isinstance(default_factory, Callable)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        return type(self), args, None, None, self.items()

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        # list() is needed because a dict items view cannot be deep-copied directly
        return type(self)(self.default_factory,
                          copy.deepcopy(list(self.items())))

    def __repr__(self):
        return 'DefaultOrderedDict(%s, %s)' % (self.default_factory,
                                               OrderedDict.__repr__(self))
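
# DefaultOrderedDict behaves like defaultdict while preserving insertion order,
# which keeps the per-gene/per-transcript grouping built below deterministic.
# A minimal (hypothetical) usage sketch:
#   d = DefaultOrderedDict(list)
#   d['gene_a'].append('exon1')   # missing keys are created via the factory
#   list(d.keys())                # -> ['gene_a'], in insertion order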
gtf = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v94/'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
mkdir_p(prefix)

In [11]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v96/'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
mkdir_p(prefix)

In [3]:
def create_gene_dict(db):
    '''
    Store each feature from db.all_features() in a nested dict keyed by
    gene_id, transcript_id and feature type.
    '''
    gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                warnings.warn('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict
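
The nested structure built by create_gene_dict is keyed as gene_id → transcript_id → feature type → list of features, with the special key 'gene' holding the gene feature itself. A minimal sketch of how it is accessed later in this notebook (the gene ID below is a placeholder):

gene_id = 'ENSMMUG00000000001'  # hypothetical ID
gene_feature = gene_dict[gene_id]['gene']
for transcript_id in gene_dict[gene_id]:
    if transcript_id == 'gene':
        continue
    exons = gene_dict[gene_id][transcript_id]['exon']
    cds = gene_dict[gene_id][transcript_id]['CDS']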

In [4]:
db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge', force=True, disable_infer_transcripts=True, disable_infer_genes=True)

In [5]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)

In [6]:
for x in db.featuretypes():
    print(x)


CDS
exon
five_prime_utr
gene
start_codon
stop_codon
three_prime_utr
transcript

In [7]:
def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    '''Split generic 'UTR' features into 5' and 3' UTR lists based on their
    position relative to the transcript's first and last CDS.
    (Not used below, since this GTF already annotates five_prime_utr/three_prime_utr.)
    '''
    if len(cds) == 0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    first_cds = cds[0]
    last_cds = cds[-1]
    for utr in utrs:
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if strand == '+':
            if utr.stop < first_cds.start:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            else:
                raise RuntimeError('Error with cds')
        elif strand == '-':
            if utr.stop < first_cds.start:
                utr.feature_type = 'three_prime_UTR'
                utr3_regions.append(utr)
            elif utr.start > last_cds.stop:
                utr.feature_type = 'five_prime_UTR'
                utr5_regions.append(utr)                
            else:
                raise RuntimeError('Error with cds')    
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create a BED6 string from a list of regions.

    bedtype: '0' or '1'
        Write 0-based (BED standard) or 1-based start coordinates.
    '''
    bedstr = ''
    for region in regions:
        assert len(region.attributes['gene_id']) == 1
        ## GTF start is 1-based, so shift by one while writing 
        ## to 0-based BED format
        if bedtype == '0':
            start = region.start - 1
        else:
            start = region.start
        bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                                    start,
                                                    region.stop,
                                                    re.sub(r'\.\d+', '', region.attributes['gene_id'][0]),
                                                    '.',
                                                    region.strand)
    return bedstr

def rename_regions(regions, gene_id):
    regions = list(regions)
    if len(regions) == 0:
        return []
    for region in regions:
        region.attributes['gene_id'] = gene_id
    return regions

def merge_regions(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start))
    return merged

def merge_regions_nostrand(db, regions):
    if len(regions) == 0:
        return []
    merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
    return merged
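
For reference, a small hypothetical example of the 1-based GTF to 0-based BED shift done in create_bed: a GTF exon spanning 1..10 on chromosome 1 becomes the half-open BED interval 0..10.

# GTF (1-based, inclusive):  1  ensembl  exon  1  10  .  +  .  gene_id "GENE1";
# BED6 (0-based, half-open): 1    0    10    GENE1    .    +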

In [8]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

for gene_id in get_gene_list(gene_dict):
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    start_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        introns = db.interfeatures(merged_exons)
        #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])
        utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

utr5_cds_subtracted = utr5_bedtool.subtract(cds_bedtool)
utr3_cds_subtracted = utr3_bedtool.subtract(cds_bedtool)
utr5_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr5.bed.gz'))
utr3_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr3.bed.gz'))
gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))
exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))
intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))
cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))


Out[8]:
<BedTool(/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v96/cds.bed.gz)>
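
The subtract step removes any bases that the merged per-gene UTR intervals share with CDS intervals (such overlaps can arise when transcripts of the same gene have different CDS boundaries), so the saved utr5/utr3 BEDs exclude coding bases. A minimal pybedtools sketch of the same operation on toy intervals (coordinates are made up; requires a working bedtools binary):

import pybedtools
utr = pybedtools.BedTool('1\t0\t100\tGENE1\t.\t+\n', from_string=True)
cds = pybedtools.BedTool('1\t50\t100\tGENE1\t.\t+\n', from_string=True)
print(utr.subtract(cds))  # -> 1  0  50  GENE1  .  +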

In [9]:
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        ## Collapse the start codon to its first base (1-based);
        ## the shift to a 0-based start happens in create_bed
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
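        ## Shift the stop codon to its last base plus the base after it (1-based)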
        stop_codon.start = stop_codon.stop
        stop_codon.stop = stop_codon.stop+1
        stop_codons.append(stop_codon)
    merged_start_codons = merge_regions(db, start_codons)
    renamed_start_codons = rename_regions(merged_start_codons, gene_id)
    merged_stop_codons = merge_regions(db, stop_codons)
    renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
    
    start_codon_bed += create_bed(renamed_start_codons)    
    stop_codon_bed += create_bed(renamed_stop_codons)

    
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))
stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))


Out[9]:
<BedTool(/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v96/stop_codon.bed.gz)>

Check that GTF regions do not overshoot the FASTA


In [1]:
from collections import defaultdict
import gzip
import pandas as pd
import re


GTF_HEADER  = ['seqname', 'source', 'feature', 'start', 'end', 'score',
               'strand', 'frame']
R_SEMICOLON = re.compile(r'\s*;\s*')
R_COMMA     = re.compile(r'\s*,\s*')
R_KEYVALUE  = re.compile(r'(\s+|\s*=\s*)')


def dataframe(filename):
    """Open an optionally gzipped GTF file and return a pandas.DataFrame.
    """
    # Each column is a list stored as a value in this dict.
    result = defaultdict(list)

    for i, line in enumerate(lines(filename)):
        for key in line.keys():
            # This key has not been seen yet, so set it to None for all
            # previous lines.
            if key not in result:
                result[key] = [None] * i

        # Ensure this row has some value for each column.
        for key in result.keys():
            result[key].append(line.get(key, None))

    return pd.DataFrame(result)


def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.
    """
    fn_open = gzip.open if filename.endswith('.gz') else open

    # Open in text mode so gzipped files also yield str lines under Python 3
    with fn_open(filename, 'rt') as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            else:
                yield parse(line)


def parse(line):
    """Parse a single GTF line and return a dict.
    """
    result = {}

    fields = line.rstrip().split('\t')

    for i, col in enumerate(GTF_HEADER):
        result[col] = _get_value(fields[i])

    # The attribute field is 'key1 "value1"; key2 "value2"; ...' in GTF
    # (or 'key1=value1;key2=value2;...' in GFF); split on semicolons first.
    infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]

    for i, info in enumerate(infos, 1):
        # It should be key="value".
        try:
            key, _, value = re.split(R_KEYVALUE, info, 1)
        # But sometimes it is just "value".
        except ValueError:
            key = 'INFO{}'.format(i)
            value = info
        # Ignore the field if there is no value.
        if value:
            result[key] = _get_value(value)

    return result


def _get_value(value):
    if not value:
        return None

    # Strip double and single quotes.
    value = value.strip('"\'')

    # Return a list if the value has a comma.
    if ',' in value:
        value = re.split(R_COMMA, value)
    # These values are equivalent to None.
    elif value in ['', '.', 'NA']:
        return None

    return value

def get_seq(row):
    # Assumes a global `fasta` handle exposing get_seq() (e.g. pyfaidx.Fasta) and
    # Biopython's Seq; neither is imported in this notebook and get_seq is not
    # called below. See the sketch after this cell.
    chrom = row['seqname']
    start = row['start']
    end = row['end']
    strand = row['strand']
    
    if strand == '+':
        return fasta.get_seq(chrom, start, end)
    else:
        seq = fasta.get_seq(chrom, start, end)
        return str(Seq(seq, generic_dna).reverse_complement())
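
A minimal sketch of the setup get_seq assumes: a global `fasta` handle exposing get_seq() (e.g. pyfaidx.Fasta) and Biopython's Seq. Neither is imported above, and get_seq is not called in this notebook; the FASTA path below is a placeholder.

from pyfaidx import Fasta
from Bio.Seq import Seq

fasta = Fasta('/path/to/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.fa')
# On Biopython >= 1.78 the generic_dna alphabet is gone; the reverse complement is simply:
# str(Seq(str(seq)).reverse_complement())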

In [6]:
chrom_sizes


Out[6]:
chrom size
0 1 225584828
1 2 204787373
2 3 185818997
3 4 172585720
4 5 190429646
5 6 180051392
6 7 169600520
7 8 144306982
8 9 129882849
9 10 92844088
10 11 133663169
11 12 125506784
12 13 108979918
13 14 127894412
14 15 111343173
15 16 77216781
16 17 95684472
17 18 70235451
18 19 53671032
19 20 74971481
20 X 149150640
21 Y 11753682
22 MT 16564
23 KQ739485.1 919841
24 KQ739657.1 805525
25 KQ732613.1 643885
26 KQ733369.1 610816
27 KQ742746.1 602698
28 KQ732984.1 576867
29 KQ734207.1 516943
... ... ...
284698 JSUE03175605.1 202
284699 JSUE03263931.1 201
284700 JSUE03160683.1 200
284701 JSUE03246639.1 200
284702 JSUE03096016.1 199
284703 JSUE03334604.1 198
284704 JSUE03056228.1 196
284705 JSUE03246815.1 194
284706 JSUE03263583.1 193
284707 JSUE03174083.1 192
284708 JSUE03297290.1 191
284709 JSUE03092936.1 188
284710 JSUE03155852.1 186
284711 JSUE03055955.1 186
284712 JSUE03279751.1 184
284713 JSUE03318858.1 179
284714 JSUE03238914.1 179
284715 JSUE03154903.1 179
284716 JSUE03136600.1 174
284717 JSUE03107000.1 173
284718 JSUE03174048.1 170
284719 JSUE03272611.1 163
284720 JSUE03140296.1 157
284721 JSUE03165938.1 155
284722 JSUE03289583.1 147
284723 JSUE03178651.1 141
284724 JSUE03100647.1 134
284725 JSUE03333562.1 134
284726 JSUE03109914.1 130
284727 JSUE03199803.1 116

284728 rows × 2 columns


In [2]:
gtf_df = dataframe('/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf')

In [7]:
chrom_sizes = pd.read_csv('/home/cmb-06/as/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes',
                          sep='\t', names=['chrom', 'size'])
chrom_sizes = dict(chrom_sizes.values.tolist())

In [8]:
def is_within_chrom_size(chrom, end):
    is_true = int(int(chrom_sizes[str(chrom)]) >= int(end))
    #print(chrom, end,int(chrom_sizes[str(chrom)]), is_true)
    return is_true

gtf_df['is_within_chrom_size'] = gtf_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1)

In [9]:
gtf_df[gtf_df['is_within_chrom_size']!=1]


Out[9]:
seqname source feature start end score strand frame gene_id gene_version ... transcript_version transcript_name transcript_source transcript_biotype exon_number exon_id exon_version protein_id protein_version is_within_chrom_size
1164394 JSUE03241461.1 ensembl gene 861 1038 None - None ENSMMUG00000044518 1 ... None None None None None None None None None 0
1164395 JSUE03241461.1 ensembl transcript 861 1038 None - None ENSMMUG00000044518 1 ... 1 RF01241-201 ensembl snoRNA None None None None None 0
1164396 JSUE03241461.1 ensembl exon 861 1038 None - None ENSMMUG00000044518 1 ... 1 RF01241-201 ensembl snoRNA 1 ENSMMUE00000391984 1 None None 0
1164453 JSUE03318976.1 ensembl gene 916 1039 None + None ENSMMUG00000048552 1 ... None None None None None None None None None 0
1164454 JSUE03318976.1 ensembl transcript 916 1039 None + None ENSMMUG00000048552 1 ... 1 RF00003-201 ensembl snRNA None None None None None 0
1164455 JSUE03318976.1 ensembl exon 916 1039 None + None ENSMMUG00000048552 1 ... 1 RF00003-201 ensembl snRNA 1 ENSMMUE00000384562 1 None None 0
1164628 JSUE03158362.1 ensembl gene 908 995 None + None ENSMMUG00000046452 1 ... None None None None None None None None None 0
1164629 JSUE03158362.1 ensembl transcript 908 995 None + None ENSMMUG00000046452 1 ... 1 RF01518-201 ensembl misc_RNA None None None None None 0
1164630 JSUE03158362.1 ensembl exon 908 995 None + None ENSMMUG00000046452 1 ... 1 RF01518-201 ensembl misc_RNA 1 ENSMMUE00000382989 1 None None 0
1165906 JSUE03156690.1 ensembl gene 546 751 None + None ENSMMUG00000045055 1 ... None None None None None None None None None 0
1165907 JSUE03156690.1 ensembl transcript 546 751 None + None ENSMMUG00000045055 1 ... 1 RF00100-201 ensembl misc_RNA None None None None None 0
1165908 JSUE03156690.1 ensembl exon 546 751 None + None ENSMMUG00000045055 1 ... 1 RF00100-201 ensembl misc_RNA 1 ENSMMUE00000406663 1 None None 0
1166392 JSUE03144147.1 ensembl gene 207 526 None + None ENSMMUG00000039097 1 ... None None None None None None None None None 0
1166393 JSUE03144147.1 ensembl transcript 207 526 None + None ENSMMUG00000039097 1 ... 1 RF02104-201 ensembl misc_RNA None None None None None 0
1166394 JSUE03144147.1 ensembl exon 207 526 None + None ENSMMUG00000039097 1 ... 1 RF02104-201 ensembl misc_RNA 1 ENSMMUE00000392075 1 None None 0

15 rows × 24 columns


In [12]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)

In [14]:
with open('/home/cmb-06/as/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96_fixed.gtf', 'w') as fout:
    for feature in db.all_features():
        max_size = chrom_sizes[feature.chrom]
        # Clamp any feature whose end overshoots the chromosome/scaffold length
        if feature.end > max_size:
            feature.end = max_size
        fout.write(str(feature) + '\n')
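
A quick re-check (reusing dataframe() and is_within_chrom_size() from above) to confirm that the fixed GTF no longer overshoots any scaffold:

fixed_df = dataframe('/home/cmb-06/as/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96_fixed.gtf')
assert fixed_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1).all()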

In [ ]: