Table of Contents


In [1]:
from collections import defaultdict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses

# Configure the root logger once, at INFO level, so the logging.warning(...)
# calls made by the helper functions below are displayed in the notebook.
logging.basicConfig(level=logging.INFO)

In [2]:
# Input/output locations for the C. albicans SC5314 Assembly 22 annotation.
# The shared directory and release tag are spelled out once; the four names
# used by the rest of the notebook are derived from them.
GENOME_DIR = '/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22'
RELEASE_TAG = 'C_albicans_SC5314_version_A22-s07-m01-r50'
ANNOTATION_STEM = GENOME_DIR + '/annotation/' + RELEASE_TAG + '_features.encode'

# GTF annotation that is loaded into gffutils.
gencode_gtf = ANNOTATION_STEM + '.gtf'
# On-disk gffutils database built from the GTF.
gencode_gtf_db = gencode_gtf + '.db'
# Common prefix of every BED file written by this notebook.
prefix = ANNOTATION_STEM + '.gffutils'
# Chromosome sizes file for the same assembly.
chrsizes = GENOME_DIR + '/fasta_v50/' + RELEASE_TAG + '_chromosomes_clean_records.sizes'

In [3]:
def create_gene_dict(db):
    '''
    Index every feature yielded by db.all_features() as a dict of dicts.

    Layout of the returned mapping:
        gene_dict[gene_id]['gene']                        -> the gene feature
        gene_dict[gene_id][transcript_id][featuretype]    -> list of features

    Parameters
    ----------
    db : object exposing ``all_features()``
        Each yielded feature must provide ``featuretype`` and an
        ``attributes`` mapping whose 'gene_id' (and, for non-gene features,
        'transcript_id') values are sequences of ids.

    Returns
    -------
    defaultdict
        Three-level nested defaultdict; missing keys are created on access.

    Notes
    -----
    A 'gene' record that does not carry exactly one gene_id is logged and
    skipped. Processing then continues with the next feature, so a single
    malformed record no longer truncates the whole index.
    '''
    gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids) != 1:
                # line_no is the position in the feature stream, which the
                # message reports as the gtf line.
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                continue
            gene_dict[gene_ids[0]]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']
            # A feature may list several gene/transcript ids; register it
            # under every combination.
            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
# Build the gffutils database file at gencode_gtf_db from the GTF.
# NOTE(review): force=True presumably overwrites an existing database file on
# every run, and the warning printed by this cell suggests passing
# `disable_infer_genes` to speed up creation -- confirm before changing.
db = gffutils.create_db(gencode_gtf, dbfn=gencode_gtf_db, keep_order=True,
                        merge_strategy='merge', force=True)

# Re-open the database from the file just written and index all of its
# features by gene id / transcript id / feature type.
db = gffutils.FeatureDB(gencode_gtf_db)
gene_dict = create_gene_dict(db)


/home/cmb-panasas2/skchoudh/software_frozen/anaconda27/lib/python2.7/site-packages/gffutils/create.py:730: UserWarning: It appears you have a gene feature in your GTF file. You may want to use the `disable_infer_genes` option to speed up database creation
  "It appears you have a gene feature in your GTF "

In [5]:
def get_gene_list(gene_dict):
    '''Return the keys (gene ids) of gene_dict as a list, in unspecified order.'''
    unique_gene_ids = set(gene_dict)
    return list(unique_gene_ids)

def get_UTR_regions(gene_dict, gene_id, transcript, cds):
    '''
    Split the generic 'UTR' features of one transcript into 5' and 3' UTRs.

    A UTR lying entirely before the first CDS coordinate is 5' on the '+'
    strand and 3' on the '-' strand; a UTR lying entirely after the last CDS
    coordinate is the opposite. UTRs that touch or overlap the CDS span
    cannot be classified: they are reported through logging.warning and left
    out of both lists. UTRs whose strand is neither '+' nor '-' are ignored.

    Parameters
    ----------
    gene_dict : mapping
        gene_dict[gene_id][transcript]['UTR'] is the list of UTR features.
    gene_id, transcript : str
        Keys selecting the transcript inside gene_dict.
    cds : list
        CDS features of the same transcript (objects with start/stop).
        They do not need to be sorted.

    Returns
    -------
    (list, list)
        (five_prime_utrs, three_prime_utrs); both empty when cds is empty.
    '''
    if len(cds) == 0:
        return [], []
    utr5_regions = []
    utr3_regions = []
    utrs = gene_dict[gene_id][transcript]['UTR']
    # Take the outermost CDS by coordinate instead of trusting list order:
    # for an already sorted list these are cds[0] and cds[-1].
    first_cds = min(cds, key=lambda feature: feature.start)
    last_cds = max(cds, key=lambda feature: feature.stop)
    for utr in utrs:
        strand = utr.strand
        if strand not in ('+', '-'):
            continue
        if utr.stop < first_cds.start:
            # Left of the coding span: upstream on '+', downstream on '-'.
            is_five_prime = (strand == '+')
        elif utr.start > last_cds.stop:
            # Right of the coding span: downstream on '+', upstream on '-'.
            is_five_prime = (strand == '-')
        else:
            logging.warning('Error with cds: {}\t {} \t {}'.format(utr, last_cds, first_cds))
            continue
        # NOTE(review): the rest of this file reads the type through
        # `featuretype`; `feature_type` only sets an extra attribute on the
        # object -- confirm which one is intended.
        if is_five_prime:
            utr.feature_type = 'five_prime_UTR'
            utr5_regions.append(utr)
        else:
            utr.feature_type = 'three_prime_UTR'
            utr3_regions.append(utr)
    return utr5_regions, utr3_regions
    
def create_bed(regions, bedtype='0'):
    '''Create a 6-column BED string from a list of regions.

    Each region contributes one tab-separated line:
    chrom, start, stop, gene id, '.', strand.

    regions: iterable of features exposing chrom, start, stop, strand and
        an attributes mapping whose 'gene_id' value holds exactly one id.
    bedtype: 0 or 1 (string or int)
        0-Based or 1-based start coordinate of the BED. With 0 the
        (1-based GTF) start is shifted down by one; any other value keeps
        the start unchanged.

    Every '.<digits>' run is stripped from the gene id before writing.
    Returns '' for an empty input. Raises AssertionError when a region does
    not carry exactly one gene_id.
    '''
    # Compare as text so that bedtype=0 behaves like bedtype='0'.
    zero_based = str(bedtype) == '0'
    bed_lines = []
    for region in regions:
        gene_ids = region.attributes['gene_id']
        assert len(gene_ids) == 1
        ## GTF start is 1-based, so shift by one while writing
        ## to 0-based BED format
        start = region.start - 1 if zero_based else region.start
        bed_lines.append('{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
                                                     start,
                                                     region.stop,
                                                     re.sub(r'\.\d+', '', gene_ids[0]),
                                                     '.',
                                                     region.strand))
    return ''.join(bed_lines)

def rename_regions(regions, gene_id):
    '''
    Materialise regions as a list and set the 'gene_id' attribute of every
    element to gene_id (the elements are modified in place).

    Returns the list, which is empty when regions yields nothing.
    '''
    renamed = [region for region in regions]
    for item in renamed:
        item.attributes['gene_id'] = gene_id
    return renamed

def merge_regions(db, regions):
    '''
    Order regions by start coordinate and pass them to db.merge().

    Returns [] when regions is empty, otherwise whatever db.merge() returns.
    '''
    if not len(regions):
        return []
    by_start = list(regions)
    by_start.sort(key=lambda feature: feature.start)
    return db.merge(by_start)

def merge_regions_nostrand(db, regions):
    '''
    Order regions by start coordinate and pass them to db.merge() with
    ignore_strand=True.

    Returns [] when regions is empty, otherwise whatever db.merge() returns.
    '''
    if not len(regions):
        return []
    by_start = list(regions)
    by_start.sort(key=lambda feature: feature.start)
    return db.merge(by_start, ignore_strand=True)

In [6]:
# Accumulators for the BED text of each feature class. Only the UTR, exon,
# intron and CDS strings are filled by the loop below; gene_bed is rebuilt
# after the loop, and start_codon_bed / stop_codon_bed are never populated
# in this cell.
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''

gene_list = []

# One pass per gene: collect the features of all its transcripts, merge the
# overlapping intervals, relabel them with the gene id and append the
# resulting BED lines.
for gene_id in get_gene_list(gene_dict):
    # NOTE(review): gene_dict is a defaultdict, so a gene_id without a 'gene'
    # record would append an empty placeholder here instead of a feature.
    gene_list.append(gene_dict[gene_id]['gene'])
    
    utr5_regions, utr3_regions = [], []
    exon_regions, intron_regions = [], []
    # NOTE(review): 'star_codon_regions' (sic) and stop_codon_regions are
    # never used below.
    star_codon_regions, stop_codon_regions = [], []
    cds_regions = []
    
    # The keys of gene_dict[gene_id] are 'gene' plus the transcript ids, so
    # `feature` holds a transcript id inside this loop.
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        exons = list(gene_dict[gene_id][feature]['exon'])
        merged_exons = merge_regions(db, exons)
        # presumably the gaps between consecutive merged exons, i.e. the
        # introns of this transcript -- confirm against gffutils docs
        introns = db.interfeatures(merged_exons)
        utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
        utr5_regions += utr5_region
        utr3_regions += utr3_region
        exon_regions += exons
        intron_regions += introns
        cds_regions += cds
        
    # Collapse each feature class across all transcripts of the gene and tag
    # the merged intervals with the gene id, which create_bed() requires.
    merged_utr5 = merge_regions(db, utr5_regions)
    renamed_utr5 = rename_regions(merged_utr5, gene_id)
    
    merged_utr3 = merge_regions(db, utr3_regions)
    renamed_utr3 = rename_regions(merged_utr3, gene_id)
    
    merged_exons = merge_regions(db, exon_regions)
    renamed_exons = rename_regions(merged_exons, gene_id)
    
    merged_introns = merge_regions(db, intron_regions)
    renamed_introns = rename_regions(merged_introns, gene_id)
    
    merged_cds = merge_regions(db, cds_regions)
    renamed_cds = rename_regions(merged_cds, gene_id)
    
    # create_bed() is called with its default bedtype, so starts are written
    # 0-based.
    utr3_bed += create_bed(renamed_utr3)
    utr5_bed += create_bed(renamed_utr5)
    exon_bed += create_bed(renamed_exons)
    intron_bed += create_bed(renamed_introns)
    cds_bed += create_bed(renamed_cds)
    
    
# Wrap every BED string in a pybedtools.BedTool.
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)

# Filter invalid records, sort, and write one BED file per feature class
# next to the annotation ('<prefix>.<class>.bed').
gene_bedtool.remove_invalid().sort().saveas('{}.genes.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
exon_bedtool.remove_invalid().sort().saveas('{}.exon.bed'.format(prefix))
intron_bedtool.remove_invalid().sort().saveas('{}.intron.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds.bed'.format(prefix))


Error with cds: Ca22chr6B_C_albicans_SC5314	CGD	UTR	356825	356899	.	+	1	gene_status "NOVEL"; level "1"; transcript_name "C6_01700W_B-T"; transcript_status "NOVEL"; gene_id "C6_01700W_B"; exon_id "C6_01700W_B-T"; exon_number "1"; ccdsid "CAL0000192067"; transcript_id "C6_01700W_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "ACC1";	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	357342	357733	.	+	1	gene_status "NOVEL"; level "1"; transcript_name "C6_01700W_B-T"; transcript_status "NOVEL"; gene_id "C6_01700W_B"; exon_id "C6_01700W_B-T"; exon_number "2"; ccdsid "CAL0000192067"; transcript_id "C6_01700W_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "ACC1"; 	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	356899	356899	.	+	1	gene_status "NOVEL"; level "1"; transcript_name "C6_01700W_B-T"; transcript_status "NOVEL"; gene_id "C6_01700W_B"; exon_id "C6_01700W_B-T"; exon_number "1"; ccdsid "CAL0000192067"; transcript_id "C6_01700W_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "ACC1";
Error with cds: Ca22chr6A_C_albicans_SC5314	CGD	UTR	356871	356945	.	+	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C6_01700W_A"; exon_id "C6_01700W_A-T"; transcript_type "protein_coding"; transcript_name "C6_01700W_A-T"; ccdsid "CAL0000179592"; transcript_id "C6_01700W_A-T"; gene_type "protein_coding"; protein_id "orf19.3415"; gene_name "ACC1";	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	357388	357779	.	+	1	gene_status "KNOWN"; exon_number "2"; level "1"; transcript_status "KNOWN"; gene_id "C6_01700W_A"; exon_id "C6_01700W_A-T"; transcript_type "protein_coding"; transcript_name "C6_01700W_A-T"; ccdsid "CAL0000179592"; transcript_id "C6_01700W_A-T"; gene_type "protein_coding"; protein_id "orf19.3415"; gene_name "ACC1"; 	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	356945	356945	.	+	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C6_01700W_A"; exon_id "C6_01700W_A-T"; transcript_type "protein_coding"; transcript_name "C6_01700W_A-T"; ccdsid "CAL0000179592"; transcript_id "C6_01700W_A-T"; gene_type "protein_coding"; protein_id "orf19.3415"; gene_name "ACC1";
Error with cds: Ca22chr1A_C_albicans_SC5314	CGD	UTR	888024	888186	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C1_04290C_A"; exon_id "C1_04290C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04290C_A-T"; ccdsid "CAL0000197968"; transcript_id "C1_04290C_A-T"; gene_type "protein_coding"; protein_id "orf19.1064,orf19.8666"; gene_name "C1_14550C_A";	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	889117	889117	.	-	1	gene_status "KNOWN"; exon_number "2"; level "1"; transcript_status "KNOWN"; gene_id "C1_04290C_A"; exon_id "C1_04290C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04290C_A-T"; ccdsid "CAL0000197968"; transcript_id "C1_04290C_A-T"; gene_type "protein_coding"; protein_id "orf19.1064,orf19.8666"; gene_name "C1_14550C_A"; 	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	886013	888039	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C1_04290C_A"; exon_id "C1_04290C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04290C_A-T"; ccdsid "CAL0000197968"; transcript_id "C1_04290C_A-T"; gene_type "protein_coding"; protein_id "orf19.1064,orf19.8666"; gene_name "C1_14550C_A";
Error with cds: Ca22chr1B_C_albicans_SC5314	CGD	UTR	888063	888226	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04290C_B-T"; transcript_status "NOVEL"; gene_id "C1_04290C_B"; exon_id "C1_04290C_B-T"; exon_number "1"; ccdsid "CAL0000189728"; transcript_id "C1_04290C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C4_03050C_B";	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	889157	889157	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04290C_B-T"; transcript_status "NOVEL"; gene_id "C1_04290C_B"; exon_id "C1_04290C_B-T"; exon_number "2"; ccdsid "CAL0000189728"; transcript_id "C1_04290C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C4_03050C_B"; 	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	886052	888078	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04290C_B-T"; transcript_status "NOVEL"; gene_id "C1_04290C_B"; exon_id "C1_04290C_B-T"; exon_number "1"; ccdsid "CAL0000189728"; transcript_id "C1_04290C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C4_03050C_B";
Error with cds: Ca22chr3A_C_albicans_SC5314	CGD	UTR	214757	214758	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C3_01040C_A"; exon_id "C3_01040C_A-T"; transcript_type "protein_coding"; transcript_name "C3_01040C_A-T"; ccdsid "CAL0000184047"; transcript_id "C3_01040C_A-T"; gene_type "protein_coding"; protein_id "orf19.2508,orf19.10044"; gene_name "C3_07550C_B";	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	214829	215352	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C3_01040C_A"; exon_id "C3_01040C_A-T"; transcript_type "protein_coding"; transcript_name "C3_01040C_A-T"; ccdsid "CAL0000184047"; transcript_id "C3_01040C_A-T"; gene_type "protein_coding"; protein_id "orf19.2508,orf19.10044"; gene_name "C3_07550C_B"; 	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	213601	214756	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C3_01040C_A"; exon_id "C3_01040C_A-T"; transcript_type "protein_coding"; transcript_name "C3_01040C_A-T"; ccdsid "CAL0000184047"; transcript_id "C3_01040C_A-T"; gene_type "protein_coding"; protein_id "orf19.2508,orf19.10044"; gene_name "C3_07550C_B";
Error with cds: Ca22chr1A_C_albicans_SC5314	CGD	UTR	917169	917311	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3";	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	917227	917263	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3"; 	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	916440	917161	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3";
Error with cds: Ca22chr1A_C_albicans_SC5314	CGD	UTR	917262	917263	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3";	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	917227	917263	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3"; 	 Ca22chr1A_C_albicans_SC5314	CGD	CDS	916440	917161	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C1_04430C_A"; exon_id "C1_04430C_A-T"; transcript_type "protein_coding"; transcript_name "C1_04430C_A-T"; ccdsid "CAL0000178260"; transcript_id "C1_04430C_A-T"; gene_type "protein_coding"; protein_id "orf19.5194"; gene_name "END3";
Error with cds: Ca22chr1B_C_albicans_SC5314	CGD	UTR	917204	917346	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "2"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A";	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	917262	917298	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "2"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A"; 	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	916475	917196	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "1"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A";
Error with cds: Ca22chr1B_C_albicans_SC5314	CGD	UTR	917297	917298	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "1"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A";	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	917262	917298	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "2"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A"; 	 Ca22chr1B_C_albicans_SC5314	CGD	CDS	916475	917196	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C1_04430C_B-T"; transcript_status "NOVEL"; gene_id "C1_04430C_B"; exon_id "C1_04430C_B-T"; exon_number "1"; ccdsid "CAL0000197740"; transcript_id "C1_04430C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_04430C_A";
Error with cds: Ca22chrRA_C_albicans_SC5314	CGD	UTR	1809336	1809367	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "CR_08360C_A"; exon_id "CR_08360C_A-T"; transcript_type "protein_coding"; transcript_name "CR_08360C_A-T"; ccdsid "CAL0000175968"; transcript_id "CR_08360C_A-T"; gene_type "protein_coding"; protein_id "orf19.6403"; gene_name "RPF2";	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	1809337	1809337	.	-	1	gene_status "KNOWN"; exon_number "2"; level "1"; transcript_status "KNOWN"; gene_id "CR_08360C_A"; exon_id "CR_08360C_A-T"; transcript_type "protein_coding"; transcript_name "CR_08360C_A-T"; ccdsid "CAL0000175968"; transcript_id "CR_08360C_A-T"; gene_type "protein_coding"; protein_id "orf19.6403"; gene_name "RPF2"; 	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	1808669	1808991	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "CR_08360C_A"; exon_id "CR_08360C_A-T"; transcript_type "protein_coding"; transcript_name "CR_08360C_A-T"; ccdsid "CAL0000175968"; transcript_id "CR_08360C_A-T"; gene_type "protein_coding"; protein_id "orf19.6403"; gene_name "RPF2";
Error with cds: Ca22chrRB_C_albicans_SC5314	CGD	UTR	1808798	1808829	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_08360C_B-T"; transcript_status "NOVEL"; gene_id "CR_08360C_B"; exon_id "CR_08360C_B-T"; exon_number "1"; ccdsid "CAL0000195616"; transcript_id "CR_08360C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "CR_09040W_B";	 Ca22chrRB_C_albicans_SC5314	CGD	CDS	1808799	1808799	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_08360C_B-T"; transcript_status "NOVEL"; gene_id "CR_08360C_B"; exon_id "CR_08360C_B-T"; exon_number "2"; ccdsid "CAL0000195616"; transcript_id "CR_08360C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "CR_09040W_B"; 	 Ca22chrRB_C_albicans_SC5314	CGD	CDS	1808131	1808453	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_08360C_B-T"; transcript_status "NOVEL"; gene_id "CR_08360C_B"; exon_id "CR_08360C_B-T"; exon_number "1"; ccdsid "CAL0000195616"; transcript_id "CR_08360C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "CR_09040W_B";
Error with cds: Ca22chrRA_C_albicans_SC5314	CGD	UTR	743076	743077	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "CR_03310C_A"; exon_id "CR_03310C_A-T"; transcript_type "protein_coding"; transcript_name "CR_03310C_A-T"; ccdsid "CAL0000182324"; transcript_id "CR_03310C_A-T"; gene_type "protein_coding"; protein_id "orf19.2393,orf19.9929"; gene_name "SAP6";	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	743039	743076	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "CR_03310C_A"; exon_id "CR_03310C_A-T"; transcript_type "protein_coding"; transcript_name "CR_03310C_A-T"; ccdsid "CAL0000182324"; transcript_id "CR_03310C_A-T"; gene_type "protein_coding"; protein_id "orf19.2393,orf19.9929"; gene_name "SAP6"; 	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	742233	742936	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "CR_03310C_A"; exon_id "CR_03310C_A-T"; transcript_type "protein_coding"; transcript_name "CR_03310C_A-T"; ccdsid "CAL0000182324"; transcript_id "CR_03310C_A-T"; gene_type "protein_coding"; protein_id "orf19.2393,orf19.9929"; gene_name "SAP6";
Error with cds: Ca22chr3A_C_albicans_SC5314	CGD	UTR	596445	597173	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44"; 	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";
Error with cds: Ca22chr3A_C_albicans_SC5314	CGD	UTR	596444	597127	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44"; 	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";
Error with cds: Ca22chr3A_C_albicans_SC5314	CGD	UTR	598208	598571	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44"; 	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	595555	598209	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_02650C_B-T"; transcript_status "NOVEL"; gene_id "C5_02650C_B"; exon_id "C5_02650C_B-T"; exon_number "1"; ccdsid "CAL0000195171"; transcript_id "C5_02650C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "PGA44";
Error with cds: Ca22chrRA_C_albicans_SC5314	CGD	UTR	596355	597083	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B"; 	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";
Error with cds: Ca22chrRA_C_albicans_SC5314	CGD	UTR	596354	597037	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B"; 	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";
Error with cds: Ca22chrRA_C_albicans_SC5314	CGD	UTR	598118	598478	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B"; 	 Ca22chrRA_C_albicans_SC5314	CGD	CDS	595465	598119	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C5_02650C_A"; exon_id "C5_02650C_A-T"; transcript_type "protein_coding"; transcript_name "C5_02650C_A-T"; ccdsid "CAL0000182191"; transcript_id "C5_02650C_A-T"; gene_type "protein_coding"; protein_id "orf19.4282,orf19.11757,orf19.11758,orf19.4281"; gene_name "C5_02650C_B";
Error with cds: Ca22chr3B_C_albicans_SC5314	CGD	UTR	940780	941259	.	-	.	gene_status "NOVEL"; level "1"; transcript_name "C3_04500C_B-T"; transcript_status "NOVEL"; gene_id "C3_04500C_B"; exon_id "C3_04500C_B-T"; exon_number "1"; ccdsid "CAL0000197042"; transcript_id "C3_04500C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_05660C_B";	 Ca22chr3B_C_albicans_SC5314	CGD	CDS	942263	942264	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C3_04500C_B-T"; transcript_status "NOVEL"; gene_id "C3_04500C_B"; exon_id "C3_04500C_B-T"; exon_number "2"; ccdsid "CAL0000197042"; transcript_id "C3_04500C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_05660C_B"; 	 Ca22chr3B_C_albicans_SC5314	CGD	CDS	941259	941826	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C3_04500C_B-T"; transcript_status "NOVEL"; gene_id "C3_04500C_B"; exon_id "C3_04500C_B-T"; exon_number "1"; ccdsid "CAL0000197042"; transcript_id "C3_04500C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C1_05660C_B";
Error with cds: Ca22chr3A_C_albicans_SC5314	CGD	UTR	940802	941281	.	-	.	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C3_04500C_A"; exon_id "C3_04500C_A-T"; transcript_type "protein_coding"; transcript_name "C3_04500C_A-T"; ccdsid "CAL0000180894"; transcript_id "C3_04500C_A-T"; gene_type "protein_coding"; protein_id "orf19.5904,orf19.13325"; gene_name "C1_05660C_B";	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	942285	942286	.	-	1	gene_status "KNOWN"; exon_number "2"; level "1"; transcript_status "KNOWN"; gene_id "C3_04500C_A"; exon_id "C3_04500C_A-T"; transcript_type "protein_coding"; transcript_name "C3_04500C_A-T"; ccdsid "CAL0000180894"; transcript_id "C3_04500C_A-T"; gene_type "protein_coding"; protein_id "orf19.5904,orf19.13325"; gene_name "C1_05660C_B"; 	 Ca22chr3A_C_albicans_SC5314	CGD	CDS	941281	941848	.	-	1	gene_status "KNOWN"; exon_number "1"; level "1"; transcript_status "KNOWN"; gene_id "C3_04500C_A"; exon_id "C3_04500C_A-T"; transcript_type "protein_coding"; transcript_name "C3_04500C_A-T"; ccdsid "CAL0000180894"; transcript_id "C3_04500C_A-T"; gene_type "protein_coding"; protein_id "orf19.5904,orf19.13325"; gene_name "C1_05660C_B";
Error with cds: Ca22chrRB_C_albicans_SC5314	CGD	UTR	743014	743015	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_03310C_B-T"; transcript_status "NOVEL"; gene_id "CR_03310C_B"; exon_id "CR_03310C_B-T"; exon_number "2"; ccdsid "CAL0000201512"; transcript_id "CR_03310C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "MSH2";	 Ca22chrRB_C_albicans_SC5314	CGD	CDS	742977	743014	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_03310C_B-T"; transcript_status "NOVEL"; gene_id "CR_03310C_B"; exon_id "CR_03310C_B-T"; exon_number "2"; ccdsid "CAL0000201512"; transcript_id "CR_03310C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "MSH2"; 	 Ca22chrRB_C_albicans_SC5314	CGD	CDS	742171	742874	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "CR_03310C_B-T"; transcript_status "NOVEL"; gene_id "CR_03310C_B"; exon_id "CR_03310C_B-T"; exon_number "1"; ccdsid "CAL0000201512"; transcript_id "CR_03310C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "MSH2";
Error with cds: Ca22chr6A_C_albicans_SC5314	CGD	UTR	492795	492878	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11";	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	492986	493362	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11"; 	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	491419	492877	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11";
Error with cds: Ca22chr6A_C_albicans_SC5314	CGD	UTR	492794	492877	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11";	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	492986	493362	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11"; 	 Ca22chr6A_C_albicans_SC5314	CGD	CDS	491419	492877	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C6_02350C_A"; exon_id "C6_02350C_A-T"; transcript_type "protein_coding"; transcript_name "C6_02350C_A-T"; ccdsid "CAL0000183350"; transcript_id "C6_02350C_A-T"; gene_type "protein_coding"; protein_id "orf19.3477,orf19.10981,orf19.10983,orf19.3479"; gene_name "GCD11";
Error with cds: Ca22chr6B_C_albicans_SC5314	CGD	UTR	492744	492827	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "2"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11";	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	492936	493312	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "2"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11"; 	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	491368	492826	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "1"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11";
Error with cds: Ca22chr6B_C_albicans_SC5314	CGD	UTR	492743	492826	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "1"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11";	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	492936	493312	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "2"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11"; 	 Ca22chr6B_C_albicans_SC5314	CGD	CDS	491368	492826	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C6_02350C_B-T"; transcript_status "NOVEL"; gene_id "C6_02350C_B"; exon_id "C6_02350C_B-T"; exon_number "1"; ccdsid "CAL0000195438"; transcript_id "C6_02350C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "GCD11";
Error with cds: Ca22chr5A_C_albicans_SC5314	CGD	UTR	997749	997817	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C5_04590C_A"; exon_id "C5_04590C_A-T"; transcript_type "protein_coding"; transcript_name "C5_04590C_A-T"; ccdsid "CAL0000185522"; transcript_id "C5_04590C_A-T"; gene_type "protein_coding"; protein_id "orf19.3942"; gene_name "CDC39";	 Ca22chr5A_C_albicans_SC5314	CGD	CDS	997749	997750	.	-	1	gene_status "NOVEL"; exon_number "2"; level "1"; transcript_status "NOVEL"; gene_id "C5_04590C_A"; exon_id "C5_04590C_A-T"; transcript_type "protein_coding"; transcript_name "C5_04590C_A-T"; ccdsid "CAL0000185522"; transcript_id "C5_04590C_A-T"; gene_type "protein_coding"; protein_id "orf19.3942"; gene_name "CDC39"; 	 Ca22chr5A_C_albicans_SC5314	CGD	CDS	997102	997375	.	-	1	gene_status "NOVEL"; exon_number "1"; level "1"; transcript_status "NOVEL"; gene_id "C5_04590C_A"; exon_id "C5_04590C_A-T"; transcript_type "protein_coding"; transcript_name "C5_04590C_A-T"; ccdsid "CAL0000185522"; transcript_id "C5_04590C_A-T"; gene_type "protein_coding"; protein_id "orf19.3942"; gene_name "CDC39";
Error with cds: Ca22chr5B_C_albicans_SC5314	CGD	UTR	997872	997940	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_04590C_B-T"; transcript_status "NOVEL"; gene_id "C5_04590C_B"; exon_id "C5_04590C_B-T"; exon_number "1"; ccdsid "CAL0000199046"; transcript_id "C5_04590C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C6_00560W_B";	 Ca22chr5B_C_albicans_SC5314	CGD	CDS	997872	997873	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_04590C_B-T"; transcript_status "NOVEL"; gene_id "C5_04590C_B"; exon_id "C5_04590C_B-T"; exon_number "2"; ccdsid "CAL0000199046"; transcript_id "C5_04590C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C6_00560W_B"; 	 Ca22chr5B_C_albicans_SC5314	CGD	CDS	997227	997500	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C5_04590C_B-T"; transcript_status "NOVEL"; gene_id "C5_04590C_B"; exon_id "C5_04590C_B-T"; exon_number "1"; ccdsid "CAL0000199046"; transcript_id "C5_04590C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C6_00560W_B";
Error with cds: Ca22chr3B_C_albicans_SC5314	CGD	UTR	214764	214765	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C3_01040C_B-T"; transcript_status "NOVEL"; gene_id "C3_01040C_B"; exon_id "C3_01040C_B-T"; exon_number "2"; ccdsid "CAL0000191253"; transcript_id "C3_01040C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C3_07550C_B";	 Ca22chr3B_C_albicans_SC5314	CGD	CDS	214836	215359	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C3_01040C_B-T"; transcript_status "NOVEL"; gene_id "C3_01040C_B"; exon_id "C3_01040C_B-T"; exon_number "2"; ccdsid "CAL0000191253"; transcript_id "C3_01040C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C3_07550C_B"; 	 Ca22chr3B_C_albicans_SC5314	CGD	CDS	213608	214763	.	-	1	gene_status "NOVEL"; level "1"; transcript_name "C3_01040C_B-T"; transcript_status "NOVEL"; gene_id "C3_01040C_B"; exon_id "C3_01040C_B-T"; exon_number "1"; ccdsid "CAL0000191253"; transcript_id "C3_01040C_B-T"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "C3_07550C_B";
Out[6]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22/annotation/C_albicans_SC5314_version_A22-s07-m01-r50_features.encode.gffutils.cds.bed)>

In [7]:
# Build BED files holding the start-codon and stop-codon positions of each gene.
# The accumulators are (re)initialised here so the cell works on a fresh kernel
# and can be re-run without appending duplicate records to an earlier result.
start_codon_bed = ''
stop_codon_bed = ''
for gene_id in get_gene_list(gene_dict):
    start_codons = []
    stop_codons = []
    # NOTE(review): the same endpoint is kept regardless of feature.strand --
    # confirm this is the intended position for '-' strand genes.
    for start_codon in db.children(gene_id, featuretype='start_codon'):
        # Collapse the codon onto its 1-based start coordinate; per the original
        # note, the 0-based BED start is handled while converting to bed.
        start_codon.stop = start_codon.start
        start_codons.append(start_codon)
    for stop_codon in db.children(gene_id, featuretype='stop_codon'):
        # Keep the codon's end coordinate and the position immediately after it.
        codon_end = stop_codon.stop
        stop_codon.start = codon_end
        stop_codon.stop = codon_end + 1
        stop_codons.append(stop_codon)

    # Merge, rename (using the gene id) and serialise each set of regions.
    renamed_start_codons = rename_regions(merge_regions(db, start_codons), gene_id)
    renamed_stop_codons = rename_regions(merge_regions(db, stop_codons), gene_id)
    start_codon_bed += create_bed(renamed_start_codons)
    stop_codon_bed += create_bed(renamed_stop_codons)


start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))


Out[7]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22/annotation/C_albicans_SC5314_version_A22-s07-m01-r50_features.encode.gffutils.stop_codon.bed)>

In [8]:
## TSS and polyA sites
# For every transcript, collapse a copy onto each of its two ends and record
# the 5' end as a TSS and the 3' end as a polyA site.
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
    tss_sites = []
    polyA_sites = []
    for transcript in db.children(gene_id, featuretype='transcript'):
        left_end = copy.deepcopy(transcript)
        right_end = copy.deepcopy(transcript)

        # NOTE(review): the left end spans start..start+1 while the right end
        # spans stop..stop, so the two sites differ in width -- confirm intended.
        left_end.stop = left_end.start + 1
        right_end.start = right_end.stop

        # The 5' end is the left end on '+' and the right end on '-'.
        if transcript.strand == '-':
            five_prime, three_prime = right_end, left_end
        else:
            five_prime, three_prime = left_end, right_end

        # Previously the 5' end was appended to polyA_sites and the 3' end to
        # tss_sites, which swapped the contents of the two output files.
        tss_sites.append(five_prime)
        polyA_sites.append(three_prime)

    # Merge, rename (using the gene id) and serialise each set of sites.
    renamed_polyA_sites = rename_regions(merge_regions(db, polyA_sites), gene_id)
    renamed_tss_sites = rename_regions(merge_regions(db, tss_sites), gene_id)
    polyA_sites_bed += create_bed(renamed_polyA_sites)
    tss_sites_bed += create_bed(renamed_tss_sites)

polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))


Out[8]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22/annotation/C_albicans_SC5314_version_A22-s07-m01-r50_features.encode.gffutils.tss_sites.bed)>

TSS


In [9]:
# TSS records from gffutils' pybedtools integration, as BED6, without merging
# overlapping entries; `tss` is reused by the promoter-window cell below.
tss = tsses(db, as_bed6=True, merge_overlapping=False)
tss_temp_path = '{}.tss_temp.bed'.format(prefix)
tss.remove_invalid().sort().saveas(tss_temp_path)

# Extend each TSS by 1000 bp on both sides (strand-aware, bounded by `chrsizes`).
promoter = tss.slop(l=1000, r=1000, s=True, g=chrsizes)
promoter_path = '{}.promoter.1000.bed'.format(prefix)
promoter.remove_invalid().sort().saveas(promoter_path)


Out[9]:
<BedTool(/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22/annotation/C_albicans_SC5314_version_A22-s07-m01-r50_features.encode.gffutils.promoter.1000.bed)>

In [10]:
# Write one promoter BED file per flank size: each TSS extended by `flank` bp
# on both sides (strand-aware, bounded by `chrsizes`).
for flank in (1000, 2000, 3000, 4000, 5000):
    promoter = tss.slop(l=flank, r=flank, s=True, g=chrsizes)
    promoter_sorted = promoter.remove_invalid().sort()
    promoter_sorted.saveas('{}.promoter.{}.bed'.format(prefix, flank))

In [11]:
# List every feature type stored in the annotation database, one per line.
for featuretype in db.featuretypes():
    print(featuretype)


CDS
UTR
exon
gene
start_codon
stop_codon
transcript

In [12]:
# Load the chromosome sizes as a table indexed by chromosome name; it is used
# below to validate feature coordinates.
# (A leftover loop that visited every transcript only to `pass` was removed.)
# NOTE(review): this path differs from `chrsizes` defined at the top of the
# notebook (/home/cmb-06/as vs /home/cmb-panasas2) -- confirm both point to the
# same file.
chrom_sizes = pd.read_table('/home/cmb-06/as/skchoudh/genomes/C_albicans_SC5314/Assembly22/fasta_v50/C_albicans_SC5314_version_A22-s07-m01-r50_chromosomes_clean_records.sizes', names=['chrom', 'size']).set_index('chrom')
chrom_sizes


Out[12]:
size
chrom
Ca22chr1A_C_albicans_SC5314 3188341
Ca22chr1B_C_albicans_SC5314 3188396
Ca22chr2A_C_albicans_SC5314 2231883
Ca22chr2B_C_albicans_SC5314 2231750
Ca22chr3A_C_albicans_SC5314 1799298
Ca22chr3B_C_albicans_SC5314 1799271
Ca22chr4A_C_albicans_SC5314 1603259
Ca22chr4B_C_albicans_SC5314 1603311
Ca22chr5A_C_albicans_SC5314 1190869
Ca22chr5B_C_albicans_SC5314 1190991
Ca22chr6A_C_albicans_SC5314 1033292
Ca22chr6B_C_albicans_SC5314 1033212
Ca22chr7A_C_albicans_SC5314 949580
Ca22chr7B_C_albicans_SC5314 949611
Ca22chrM_C_albicans_SC5314 40420
Ca22chrRA_C_albicans_SC5314 2286237
Ca22chrRB_C_albicans_SC5314 2285697

In [13]:
def clean_gff(db, chrom_sizes, gffout):
    '''
    Write the features of `db` that lie within their chromosome to `gffout`.

    A feature is kept when its start is at least 1 and its stop does not
    exceed the size of its chromosome; every kept feature is written as one
    line, using the feature's string representation.

    Parameters
    ----------
    db : object
         Provides all_features(); each feature exposes chrom, start and stop
    chrom_sizes : pandas.DataFrame
                  Indexed by chromosome name, with a 'size' column
    gffout : str
             Path of the output file (overwritten if it exists)

    Raises
    ------
    KeyError
        If a feature's chromosome is not present in `chrom_sizes`
    '''
    # Resolve the sizes once instead of doing a pandas lookup per feature.
    size_by_chrom = chrom_sizes['size'].to_dict()

    # Check that the coordinates are within the chromosome's boundaries.
    with open(gffout, 'w') as f:
        for feature in db.all_features():
            chrom_size = size_by_chrom[feature.chrom]
            if feature.start >= 1 and feature.stop <= chrom_size:
                f.write('{}\n'.format(feature))
# Write the cleaned annotation next to the original GTF.
cleaned_gtf = '/home/cmb-panasas2/skchoudh/genomes/C_albicans_SC5314/Assembly22/annotation/C_albicans_SC5314_version_A22-s07-m01-r50_features.encode.cleaned.gtf'
clean_gff(db, chrom_sizes, cleaned_gtf)

In [14]:
# Sanity check on the first feature only: show its stop coordinate next to the
# size of its chromosome. print() is used in call form so the cell runs on both
# Python 2 and Python 3, matching the print(x) call used earlier.
for feature in db.all_features():
    print(feature.stop)
    print(chrom_sizes.loc[feature.chrom]['size'])
    break


434200
1799298

In [ ]:


In [ ]: