In [1]:
from collections import defaultdict
import re
In [2]:
#interprot_annot= defaultdict(list)
interprot_annot = {}
GENE_ID = 0
ANNOT_TOOL = 1
TAGS = 8
In [3]:
def extract_signature_desc(tag_line):
signature_desc = ""
tag_fields = tag_line.split(";")
for field in tag_fields:
if "signature_desc" in field:
value_start = field.find("=")
signature_desc = field[value_start+1:]
return signature_desc
In [4]:
#Extract the signature description from Interpro annotation using
with open("TcIV-X10825.v2-AugustusHintsProt-InterproScan.gff3","r") as interprot_fh:
for line in interprot_fh:
if line[0] != "#" : #ignore comment lines
line_fields = line.rstrip("\n").split("\t")
#Extract from Pfam lines the
if line_fields[ANNOT_TOOL].lower() == "pfam":
signature_desc = extract_signature_desc(line_fields[TAGS])
#Remove the .t1 to the gene_id
dot_position = line_fields[GENE_ID].rfind(".")
if dot_position != -1:
line_fields[GENE_ID] = line_fields[GENE_ID][:dot_position]
interprot_annot[ line_fields[GENE_ID] ] = signature_desc
elif line.rstrip("\n") == "##FASTA":
break
In [5]:
put_prot_id = 0
def getSignatureDescription(gene_id, signature_descriptions,parent=False):
global put_prot_id
return signature_descriptions[gene_id] if gene_id in signature_descriptions else "putative_protein"+str(put_prot_id)
In [6]:
#Substitutes the orignal ID, or parent ID for the signature_desc from interprot pfam if available
def processAugLine(line, signature_descriptions):
global put_prot_id
new_line = line
if "\tgene" not in line:
id_match = re.search( r"ID=(.+?)(\..*?)?(;|$)", line)
if id_match:
gene_id = id_match.group(1)
new_gene_id = getSignatureDescription(gene_id,signature_descriptions)
new_line = re.sub(r"ID=(.+?)(\..+?)(;|\n)", "ID='"+new_gene_id+r"'\g<3>",line)
if "\ttranscript" not in line:
parent_match = re.search( r"Parent=(.+?)(\..*?)?(;|$)", line)
if parent_match:
parent_id = parent_match.group(1)
new_parent_id = getSignatureDescription(parent_id,signature_descriptions)
new_line = re.sub(r"Parent=(.+?)(;|\n)", "Parent='"+new_parent_id+r"'\g<2>",new_line)
else:
#New protein, increase counter
put_prot_id += 1
return new_line
In [7]:
#putative_prot_counter = 0
with open("merged.gff","r") as aug_fh , open("annotated_merged.gff","w") as out_fh:
for line in aug_fh:
if line[0] == "#":
out_fh.write(line)
else:
out_fh.write(processAugLine(line, interprot_annot))