In [1]:
from collections import defaultdict
import re

In [2]:
#interprot_annot= defaultdict(list)
interprot_annot = {}
GENE_ID = 0
ANNOT_TOOL = 1
TAGS = 8

In [3]:
def extract_signature_desc(tag_line):
    signature_desc = ""
    tag_fields = tag_line.split(";")
    for field in tag_fields:
        if "signature_desc" in field:
            value_start = field.find("=")
            signature_desc = field[value_start+1:]
    return signature_desc

In [4]:
#Extract the signature description from Interpro annotation using 
with open("TcIV-X10825.v2-AugustusHintsProt-InterproScan.gff3","r") as interprot_fh:
    for line in interprot_fh:
        if line[0] != "#" : #ignore comment lines
            line_fields = line.rstrip("\n").split("\t")
            
            #Extract from Pfam lines the
            if line_fields[ANNOT_TOOL].lower() == "pfam":
                signature_desc = extract_signature_desc(line_fields[TAGS])
                #Remove the .t1 to the gene_id
                dot_position = line_fields[GENE_ID].rfind(".")
                if dot_position != -1:
                    line_fields[GENE_ID] = line_fields[GENE_ID][:dot_position]
                interprot_annot[  line_fields[GENE_ID] ] = signature_desc
        elif line.rstrip("\n") == "##FASTA":
            break

In [5]:
put_prot_id = 0
def getSignatureDescription(gene_id, signature_descriptions,parent=False):
    global put_prot_id
    return signature_descriptions[gene_id] if gene_id in signature_descriptions else "putative_protein"+str(put_prot_id)

In [6]:
#Substitutes the orignal ID, or parent ID for the signature_desc from interprot pfam if available
def processAugLine(line, signature_descriptions):
    global put_prot_id
    new_line = line
    if "\tgene" not in line:
        id_match = re.search( r"ID=(.+?)(\..*?)?(;|$)", line)
        if id_match:
            gene_id = id_match.group(1)
            new_gene_id = getSignatureDescription(gene_id,signature_descriptions)
            new_line = re.sub(r"ID=(.+?)(\..+?)(;|\n)", "ID='"+new_gene_id+r"'\g<3>",line)
        if "\ttranscript" not in line:
            parent_match = re.search( r"Parent=(.+?)(\..*?)?(;|$)", line)
            if parent_match:
                parent_id = parent_match.group(1)
                new_parent_id = getSignatureDescription(parent_id,signature_descriptions)
                new_line = re.sub(r"Parent=(.+?)(;|\n)", "Parent='"+new_parent_id+r"'\g<2>",new_line)
    else:
        #New protein, increase counter
        put_prot_id += 1
    return new_line

In [7]:
#putative_prot_counter = 0
with open("merged.gff","r") as aug_fh , open("annotated_merged.gff","w") as out_fh:
    for line in aug_fh:
        if line[0] == "#":
            out_fh.write(line)
        else:
            out_fh.write(processAugLine(line, interprot_annot))