In [7]:
import pandas as pd
# Load the raw annotation into a DataFrame for inspection. The file has no
# header row, so the nine column names are supplied explicitly.
gff_columns = ['sequence', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
gff_path = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_JRIS00000000.gff'
df = pd.read_table(gff_path, sep=' ', names=gff_columns)
In [8]:
## Row 11348 (1-based) onwards are sequences.
## The slice that would drop those rows is commented out below, so this
## assignment is currently a no-op and `df` still contains every row.
df = df #.iloc[:11348-1]
In [29]:
def _gtf_attributes(attribute, undef_count):
    """Build the GTF attribute text for one annotation record.

    Parameters
    ----------
    attribute : str
        First ';'-separated entry of the attributes column: either the
        literal 'UNDEF' or eight comma-separated fields (gene_id, genome,
        chromRef, startRef, endRef, gene_name, evalue, percent_identity).
    undef_count : int
        Running number used to label the next 'UNDEF' record.

    Returns
    -------
    tuple of (str, int)
        The attribute text (without the trailing ';') and the counter to use
        for the next record.

    Raises
    ------
    ValueError
        If a non-'UNDEF' entry does not contain exactly eight fields.
    """
    if attribute == 'UNDEF':
        # The closing quote must come before any ';' separator; placing the
        # ';' inside the quotes would make it part of the gene_name value.
        text = 'gene_id "UNDEF-{}"; transcript_id "UNDEF-{}-T"; gene_name "UNDEF-{}"'\
            .format(undef_count, undef_count, undef_count)
        return text, undef_count + 1
    # Strict 8-way unpacking so a malformed entry fails loudly.
    gene_id, _genome, _chrom_ref, _start_ref, _end_ref, gene_name, _evalue, _percent_identity = \
        attribute.split(',')
    text = 'gene_id "{}"; transcript_id "{}-T"; gene_name "{}"'\
        .format(gene_id, gene_id, gene_name)
    return text, undef_count


# Convert the annotation into GTF text (`lines_to_write`) and collect
# "<contig>\t<length>" lines (`contigs_info`) from the 'contig' records.
gtf_header = """##description: modified gtf from BY4741_JRIS00000000.gff
##provider: saketkc
##format: gtf
##date: 2017-10-27
"""
gtf_records = []
contig_records = []
undef_count = 1
with open('/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_JRIS00000000.gff') as fh:
    for line in fh:
        record = line.strip()
        if record.startswith('##FASTA'):
            # Everything after this directive is sequence data, not annotation.
            break
        if not record or record.startswith('#'):
            # Blank lines and comment/directive lines carry no record.
            continue
        # NOTE(review): fields are split on a single space, matching how the
        # file is read elsewhere in this notebook -- confirm the delimiter.
        chrom, source, feature, start, end, score, strand, frame, attributes = record.split(' ')
        start = int(start)
        end = int(end)
        if feature == 'gene':
            # Genes are emitted as exon features in the output.
            feature = 'exon'
        if feature == 'contig':
            # Contig records only contribute a name/length pair (1-based,
            # inclusive coordinates) and are not written to the GTF.
            contig_records.append('{}\t{}\n'.format(chrom, end - start + 1))
            continue
        ## Process attributes
        ## The attributes are separated by ; indicating multiple blast hits
        ## We deal with them by assigning to the hit with max percent identity
        ## which is always the first one
        attribute = attributes.split(';')[0]
        mod_attribute, undef_count = _gtf_attributes(attribute, undef_count)
        gtf_records.append(
            '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{};\n'
            .format(chrom, source, feature, start, end, score, strand, frame, mod_attribute))

# Join once at the end instead of growing a string inside the loop.
lines_to_write = gtf_header + ''.join(gtf_records)
contigs_info = ''.join(contig_records)
In [31]:
# Persist the assembled GTF text next to the source annotation.
modified_gtf_path = '/home/cmb-panasas2/skchoudh/genomes/S_cerevisiae_BY4741/annotation/BY4741_JRIS00000000.modified.gtf'
with open(modified_gtf_path, 'w') as gtf_handle:
    gtf_handle.write(lines_to_write)