In [1]:
from collections import OrderedDict,deque
import re
In [2]:
abinitio_filename = "TcIV-X10825.v2.masked-AbInitio.gff"
hints_filename = "TcIV-X10825.v2.masked-AugustusHints.gff"
merged_filename = "merged.gff"
In [3]:
#GFF column names
CTG_ID = 0
ENTRY_TYPE = 2 #gene, transcript, start_codon, stop_codon, CDS
START_POS = 3
STOP_POS = 4
ATTRIBS = 8
#loaded gff genes
GEN_INTERVAL = 0
GFF_LINES = 1
In [4]:
def extractID(line):
id_match = re.search(r"ID=(.+?)(;|\n|$)",line)
gene_id = id_match.group(1) if id_match else None
return gene_id
def extractParent(line):
parent_id_match = re.search(r"Parent=(.+?)(\..+?)?(;|\n|$)",line)
gene_id = parent_id_match.group(1) if parent_id_match else None
return gene_id
In [5]:
x ="a"
x.startswith
Out[5]:
In [6]:
def loadgff(filename):
data = OrderedDict()
current_ctg = ""
current_gene_id = ""
current_gene_info = ""
current_interval = None
capture_prot_seq = False
with open(filename,"r") as fh:
for line in fh:
if line[0] != "#" : #ignore comment lines
line_fields = line.rstrip("\n").split("\t")
if line_fields[ENTRY_TYPE] == "gene":
if current_gene_id != "":
#Initialize list for each contig
if current_ctg not in data:
data[current_ctg] = deque()
data[current_ctg].append( (current_interval, current_gene_info) )
current_ctg = line_fields[CTG_ID]
current_gene_id = extractID(line_fields[ATTRIBS])
current_interval = ( int(line_fields[START_POS]) , int(line_fields[STOP_POS]) )
current_gene_info = line
else: #Accumulate under gene
parent_id = extractParent(line_fields[ATTRIBS])
assert( current_gene_id == parent_id)
current_gene_info += line
elif line.startswith("# protein sequence"):
capture_prot_seq = True
current_gene_info += line
elif capture_prot_seq:
current_gene_info += line
capture_prot_seq = not line.endswith("]\n")
elif line.rstrip("\n") == "# command line:":
break
return data
In [7]:
def fullyContains(container, contained):
return (container[0] <= contained[0]) and (contained[1] <= container[1])
def no_overlap(coord1,coord2):
return coord1[1] < coord2[0] or coord2[1] < coord1[0]
def pct_overlap(main_annot,secondary_annot):
ovlp_pct = 0
#if there is some overlap
if not(main_annot[1] < secondary_annot[0] or secondary_annot[1] < main_annot[0] ):
ovlp_size = min(main_annot[1],secondary_annot[1]) - max(main_annot[0],secondary_annot[0])
#Overlap pct defined as the % of the main annotation length covered by the secondary annotation
ovlp_pct = (ovlp_size*100)/(main_annot[1]-main_annot[0])
return ovlp_pct
Defines the merging rules for both files
Rules are as follows:
If there is no overlap:
Write the "gene" with the lower coordinates
If there is overlap:
if the "hints" fully contains the "ab initio"
Write hints and discard ab initio
if "ab initio" fully contains hints
Write "ab initio" only if hints covers less than 65% of the length of "ab initio"
Write "hints" otherwise
if there is a partial overlap:
In [8]:
#Defines the merging rules for both files
def mergeGenes(abinit_genes, hint_genes, merged_fh):
abinit_coord, abinit_gff = abinit_genes.popleft()
hint_coord, hint_gff = hint_genes.popleft()
while abinit_genes and hint_genes:
if no_overlap(hint_coord, abinit_coord):
if hint_coord[0] < abinit_coord[0]:
merged_fh.write(hint_gff)
hint_coord, hint_gff = hint_genes.popleft()
else:
merged_fh.write(abInitioDataDecorator(abinit_gff))
abinit_coord, abinit_gff = abinit_genes.popleft()
else: #If there is some overlap
if fullyContains(hint_coord,abinit_coord): #If hint interval contains the ab initio
#Keep hints, discard ab initio
merged_fh.write(hint_gff) #Do I want to write this out?
abinit_coord, abinit_gff = abinit_genes.popleft()
hint_coord, hint_gff = hint_genes.popleft()
elif fullyContains(abinit_coord,hint_coord): #If the ab initio interval contains completely the hints interval
#If the hits interval spans less the 65% of the ab initio interval - Use ab initio
if pct_overlap(abinit_coord,hint_coord ) < 65:
merged_fh.write(abInitioDataDecorator(abinit_gff) )
else: #If it is covered more than 65% , then keep the hits
merged_fh.write(hint_gff)
abinit_coord, abinit_gff = abinit_genes.popleft()
hint_coord, hint_gff = hint_genes.popleft()
else: #Overlap is only partial
#If overlap spans less than 10% of the length of hint, consider them different
if pct_overlap(hint_coord, abinit_coord) <= 10:
if hint_coord[0] < abinit_coord[0]:
merged_fh.write(hint_gff)
hint_coord, hint_gff = hint_genes.popleft()
else:
merged_fh.write(abInitioDataDecorator(abinit_gff))
abinit_coord, abinit_gff = abinit_genes.popleft()
else: #If overlap covers more than 10% of hint, discard ab_init
print "Discarded ab-initio, for more than 10% overlap"
print abinit_coord
abinit_coord, abinit_gff = abinit_genes.popleft()
def writeWholeContig(genes, merged_fh,is_abinitio):
while genes:
_,data = genes.pop()
data = data if not is_abinitio else abInitioDataDecorator(data)
merged_fh.write(data)
In [9]:
def extractCtgNumber(ctg_id, ctg_prefix="TcIV-"):
assert ctg_id.startswith(ctg_prefix)
return int(ctg_id[ len(ctg_prefix):] )
def abInitioDataDecorator( ab_init_data):
return re.sub(r"\t(.*)\n",r"\t\g<1>;src=abinit\n",ab_init_data)
In [10]:
#load info into memory
abinitio_data = loadgff(abinitio_filename)
hints_data = loadgff(hints_filename)
In [11]:
current_abinitio = None
current_hints = None
with open("merged.gff","w") as merged_fh:
abinit_ctg, abinit_genes = abinitio_data.popitem(False)
hint_ctg, hint_genes = hints_data.popitem(False)
while abinitio_data and hints_data:
if abinit_ctg == hint_ctg:
mergeGenes(abinit_genes,hint_genes, merged_fh)
abinit_ctg, abinit_genes = abinitio_data.popitem(False)
hint_ctg, hint_genes = hints_data.popitem(False)
elif extractCtgNumber(abinit_ctg) < extractCtgNumber(hint_ctg):
#If there is only ab initio annotatio for this contig
writeWholeContig(abinit_genes, merged_fh,True)
abinit_ctg, abinit_genes = abinitio_data.popitem(False)
else:
#If there is only hint annotation for this contig
writeWholeContig(hint_genes, merged_fh,is_abinitio= False)
hint_ctg, hint_genes = hints_data.popitem(False)
assert (len(abinitio_data) == 0) or (len(hints_data) == 0 )
#Write remaining annotation
while abinitio_data:
writeWholeContig(abinit_genes, merged_fh,True)
abinit_ctg, abinit_genes = abinitio_data.popitem(False)
while hints_data:
writeWholeContig(hint_genes, merged_fh,False)
hint_ctg, hint_genes = hints_data.popitem(False)