In [1]:
# Imports
from GffParser import GffParser
from Bio import SeqIO
from gzip import open as gopen
from time import time
start_time = time()
# Open the gff file and store only transcript informations
print ("Parse the gff file")
gp = GffParser(gff_file="./gencode.v23.primary_assembly.annotation.gff3.gz", fusion=False, offset=0, features=["transcript"])
print ("Found {} valid transcript".format(gp.valid_features))
# Flaten the collection of features in all chromosomes in a simple transcript_id access dictionnary
print ("Flaten the transcript feature dictionnary")
transcript_dict = {}
for seq in gp.gff_dict.values():
for feature in seq.features:
transcript_dict[feature.attributes['ID'][0]] = feature
# Process the fasta file containing the transcript sequences
print ("Parse the fasta file and write a new one")
with gopen ("./gencode.v23.transcripts.fa.gz", "r") as fin, gopen ("./gencode.v23.transcripts_full.fa.gz", "w") as fout:
for n, sequence in enumerate (SeqIO.parse(fin, 'fasta')):
# Extract the transcript id from the fasta sequence name
ID = sequence.id.split("|")[0]
# Find the correspondance in the parsed gff transcripts
gff_line = str(transcript_dict[ID]).replace("\t", "|").replace(" ", "_")
# Write a fasta file with the new header line in a new file
fout.write (">{}\n{}\n".format(gff_line, str(sequence.seq)))
print ("Wrote {} sequences".format(n))
print ("\nDone in {}s".format(round(time()-start_time, 3)))