In [1]:

    
# Imports
from GffParser import GffParser
from Bio import SeqIO
from gzip import open as gopen
from time import time

start_time = time()

# Open the gff file and store only transcript informations
print ("Parse the gff file")
gp = GffParser(gff_file="./gencode.v23.primary_assembly.annotation.gff3.gz", fusion=False, offset=0, features=["transcript"])
print ("Found {} valid transcript".format(gp.valid_features))

# Flaten the collection of features in all chromosomes in a simple transcript_id access dictionnary
print ("Flaten the transcript feature dictionnary")
transcript_dict = {}
for seq in gp.gff_dict.values():
    for feature in seq.features:
        transcript_dict[feature.attributes['ID'][0]] = feature

# Process the fasta file containing the transcript sequences
print ("Parse the fasta file and write a new one")
with gopen ("./gencode.v23.transcripts.fa.gz", "r") as fin, gopen ("./gencode.v23.transcripts_full.fa.gz", "w") as fout:
    for n, sequence in enumerate (SeqIO.parse(fin, 'fasta')):
        
        # Extract the transcript id from the fasta sequence name
        ID = sequence.id.split("|")[0]
        
        # Find the correspondance in the parsed gff transcripts
        gff_line = str(transcript_dict[ID]).replace("\t", "|").replace(" ", "_")
        
        # Write a fasta file with the new header line in a new file 
        fout.write (">{}\n{}\n".format(gff_line, str(sequence.seq)))

print ("Wrote {} sequences".format(n))
        
print ("\nDone in {}s".format(round(time()-start_time, 3)))









    



Parse the gff file
  100000 features parsed
  200000 features parsed
  300000 features parsed
  400000 features parsed
  500000 features parsed
  600000 features parsed
  700000 features parsed
  800000 features parsed
  900000 features parsed
  1000000 features parsed
  1100000 features parsed
  1200000 features parsed
  1300000 features parsed
  1400000 features parsed
  1500000 features parsed
  1600000 features parsed
  1700000 features parsed
  1800000 features parsed
  1900000 features parsed
  2000000 features parsed
  2100000 features parsed
  2200000 features parsed
  2300000 features parsed
  2400000 features parsed
  2500000 features parsed
Found 198798 valid transcript
Flaten the transcript feature dictionnary
Parse the fasta file and write a new one
Wrote 198618 sequences

Done in 124.937s