In [1]:
from collections import OrderedDict,deque
import re

In [2]:
abinitio_filename = "TcIV-X10825.v2.masked-AbInitio.gff"
hints_filename = "TcIV-X10825.v2.masked-AugustusHints.gff"
merged_filename = "merged.gff"

Constants


In [3]:
#GFF column names
CTG_ID = 0
ENTRY_TYPE = 2 #gene, transcript, start_codon, stop_codon, CDS
START_POS = 3
STOP_POS = 4
ATTRIBS = 8

#loaded gff genes
GEN_INTERVAL = 0
GFF_LINES = 1

Functions


In [4]:
def extractID(line):
    id_match = re.search(r"ID=(.+?)(;|\n|$)",line)
    gene_id = id_match.group(1) if id_match else None
    return gene_id

def extractParent(line):
    parent_id_match = re.search(r"Parent=(.+?)(\..+?)?(;|\n|$)",line)
    gene_id = parent_id_match.group(1) if parent_id_match else None
    return gene_id

In [5]:
x ="a"
x.startswith


Out[5]:
<function startswith>

In [6]:
def loadgff(filename):
    data = OrderedDict()

    current_ctg = ""
    current_gene_id = ""
    current_gene_info = ""
    current_interval = None
    capture_prot_seq = False
    with open(filename,"r") as fh:
        for line in fh:
            if line[0] != "#" : #ignore comment lines
                line_fields = line.rstrip("\n").split("\t")
                
                if line_fields[ENTRY_TYPE] == "gene":
                    if current_gene_id != "":
                        #Initialize list for each contig
                        if current_ctg not in data:
                            data[current_ctg] = deque()
                        data[current_ctg].append( (current_interval, current_gene_info)   )
                    
                    current_ctg = line_fields[CTG_ID]
                    current_gene_id = extractID(line_fields[ATTRIBS])
                    current_interval = ( int(line_fields[START_POS]) , int(line_fields[STOP_POS])  )
                    current_gene_info = line
                else: #Accumulate under gene
                    parent_id = extractParent(line_fields[ATTRIBS])
                    assert( current_gene_id == parent_id)
                    current_gene_info += line
            elif line.startswith("# protein sequence"):
                capture_prot_seq = True
                current_gene_info += line
            elif capture_prot_seq:
                current_gene_info += line
                capture_prot_seq = not line.endswith("]\n")
            elif line.rstrip("\n") == "# command line:":
                break

    return data

In [7]:
def fullyContains(container, contained):
    return (container[0] <= contained[0]) and (contained[1] <= container[1])

def no_overlap(coord1,coord2):
    return coord1[1] < coord2[0] or coord2[1] < coord1[0]

def pct_overlap(main_annot,secondary_annot):
    ovlp_pct = 0
    #if there is some overlap
    if not(main_annot[1] < secondary_annot[0] or secondary_annot[1] < main_annot[0] ):
        ovlp_size = min(main_annot[1],secondary_annot[1]) - max(main_annot[0],secondary_annot[0])
        #Overlap pct defined as the % of the main annotation length covered by the secondary annotation
        ovlp_pct = (ovlp_size*100)/(main_annot[1]-main_annot[0])      
    return ovlp_pct

Defines the merging rules for both files

Rules are as follows:

If there is no overlap:
    Write the "gene" with the lower coordinates
If there is overlap:
    if the "hints" fully contains the "ab initio"
        Write hints and discard ab initio
    if "ab initio" fully contains hints
        Write "ab initio" only if hints covers less than 65% of the length of "ab initio"
        Write "hints" otherwise
    if there is a partial overlap:

In [8]:
#Defines the merging rules for both files

def mergeGenes(abinit_genes, hint_genes, merged_fh):
    abinit_coord, abinit_gff = abinit_genes.popleft()
    hint_coord, hint_gff = hint_genes.popleft()

    while abinit_genes and hint_genes:
        if no_overlap(hint_coord, abinit_coord):
            if hint_coord[0] < abinit_coord[0]:
                merged_fh.write(hint_gff)
                hint_coord, hint_gff = hint_genes.popleft()
            else:
                merged_fh.write(abInitioDataDecorator(abinit_gff))
                abinit_coord, abinit_gff = abinit_genes.popleft()

        else: #If there is some overlap
            if fullyContains(hint_coord,abinit_coord): #If hint interval contains the ab initio
                #Keep hints, discard ab initio
                merged_fh.write(hint_gff) #Do I want to write this out? 
                abinit_coord, abinit_gff = abinit_genes.popleft()
                hint_coord, hint_gff = hint_genes.popleft()
            
            elif fullyContains(abinit_coord,hint_coord): #If the ab initio interval contains completely the hints interval
                #If the hits interval spans less the 65% of the ab initio interval - Use ab initio
                if pct_overlap(abinit_coord,hint_coord ) < 65:
                    merged_fh.write(abInitioDataDecorator(abinit_gff) )
                else: #If it is covered more than 65% , then keep the hits 
                    merged_fh.write(hint_gff)
                abinit_coord, abinit_gff = abinit_genes.popleft()
                hint_coord, hint_gff = hint_genes.popleft()
                
            else: #Overlap is only partial
                #If overlap spans less than 10% of the length of hint, consider them different
                if pct_overlap(hint_coord, abinit_coord) <= 10:
                    if hint_coord[0] < abinit_coord[0]:
                        merged_fh.write(hint_gff)
                        hint_coord, hint_gff = hint_genes.popleft()
                    else:
                        merged_fh.write(abInitioDataDecorator(abinit_gff))
                        abinit_coord, abinit_gff = abinit_genes.popleft()
                else: #If overlap covers more than 10% of hint, discard ab_init
                    print "Discarded ab-initio, for more than 10% overlap"
                    print abinit_coord
                    abinit_coord, abinit_gff = abinit_genes.popleft()            
                
def writeWholeContig(genes, merged_fh,is_abinitio):
    while genes:
        _,data = genes.pop()
        data = data if not is_abinitio else abInitioDataDecorator(data)
        merged_fh.write(data)

In [9]:
def extractCtgNumber(ctg_id, ctg_prefix="TcIV-"):
    assert ctg_id.startswith(ctg_prefix)
    return int(ctg_id[ len(ctg_prefix):] )

def abInitioDataDecorator( ab_init_data):
    return re.sub(r"\t(.*)\n",r"\t\g<1>;src=abinit\n",ab_init_data)

Actual processing!

Load data from Ab Initio annotation and Augustus Hints into memory


In [10]:
#load info into memory
abinitio_data = loadgff(abinitio_filename)
hints_data = loadgff(hints_filename)

Merge both samples giving priority to hints


In [11]:
current_abinitio = None
current_hints = None
with open("merged.gff","w") as merged_fh:
    abinit_ctg, abinit_genes  = abinitio_data.popitem(False)
    hint_ctg, hint_genes = hints_data.popitem(False)
    while abinitio_data and hints_data:
        if abinit_ctg == hint_ctg:
            mergeGenes(abinit_genes,hint_genes, merged_fh)
            abinit_ctg, abinit_genes  = abinitio_data.popitem(False)
            hint_ctg, hint_genes = hints_data.popitem(False)
        elif extractCtgNumber(abinit_ctg) < extractCtgNumber(hint_ctg):
            #If there is only ab initio annotatio for this contig
            writeWholeContig(abinit_genes, merged_fh,True)
            abinit_ctg, abinit_genes  = abinitio_data.popitem(False)
        else:
            #If there is only hint annotation for this contig
            writeWholeContig(hint_genes, merged_fh,is_abinitio= False)
            hint_ctg, hint_genes = hints_data.popitem(False)
            
    assert (len(abinitio_data) == 0) or (len(hints_data) == 0 )
    #Write remaining annotation 
    while abinitio_data:
        writeWholeContig(abinit_genes, merged_fh,True)
        abinit_ctg, abinit_genes  = abinitio_data.popitem(False)
    
    while hints_data:
        writeWholeContig(hint_genes, merged_fh,False)
        hint_ctg, hint_genes = hints_data.popitem(False)


Discarded ab-initio, for more than 10% overlap
(18848, 19186)
Discarded ab-initio, for more than 10% overlap
(159286, 160239)
Discarded ab-initio, for more than 10% overlap
(44908, 45636)
Discarded ab-initio, for more than 10% overlap
(30648, 31361)
Discarded ab-initio, for more than 10% overlap
(10550, 10960)
Discarded ab-initio, for more than 10% overlap
(40638, 41000)
Discarded ab-initio, for more than 10% overlap
(19, 729)
Discarded ab-initio, for more than 10% overlap
(3321, 3890)
Discarded ab-initio, for more than 10% overlap
(28891, 29595)
Discarded ab-initio, for more than 10% overlap
(7692, 8312)
Discarded ab-initio, for more than 10% overlap
(10093, 10806)
Discarded ab-initio, for more than 10% overlap
(8338, 8820)
Discarded ab-initio, for more than 10% overlap
(14525, 14989)
Discarded ab-initio, for more than 10% overlap
(42, 1409)
Discarded ab-initio, for more than 10% overlap
(3337, 3810)
Discarded ab-initio, for more than 10% overlap
(2551, 3264)
Discarded ab-initio, for more than 10% overlap
(2517, 3962)