In [5]:
%pylab inline
import pandas as pd
import pyfaidx
import os
In [2]:
five_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs.fsa'
three_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs.fsa'
In [4]:
five_prime_idx = pyfaidx.Fasta(five_prime_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_fasta)
In [6]:
five_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs_cleaned.fsa'
three_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs_cleaned.fsa'
In [9]:
lines = []
with open(five_prime_fasta) as fh:
for line in fh:
if line[0]!="#":
lines.append(line)
with open(five_prime_cleaned_fasta, 'w') as fh:
fh.writelines(lines)
In [10]:
lines = []
with open(three_prime_fasta) as fh:
for line in fh:
if line[0]!="#":
lines.append(line)
with open(three_prime_cleaned_fasta, 'w') as fh:
fh.writelines(lines)
In [11]:
five_prime_idx = pyfaidx.Fasta(five_prime_cleaned_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_cleaned_fasta)
In [67]:
from Bio import SeqIO
five_prime_records = []
for record in SeqIO.parse(five_prime_cleaned_fasta, "fasta"):
id = record.id
description = record.description.replace(id, '').strip()
chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
chrom = chrom_start_stop.split(':')[0].replace('chr', '')
start,stop = chrom_start_stop.split('-')
stop = int(stop)
start = start.split(':')[1]
start = int(start)
strand = description.split('strand=')[1][0]
five_pad = description.split("5'pad")[1].split(" ")[0]
three_pad = description.split("3'pad")[1].split(" ")[0]
gene_name = id.split('_')[4]
source = ('_').join(id.split('_')[8:10])
five_prime_records.append((chrom, source, 'five_prime_utr', start, stop, '.', strand, '.', 'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
five_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.', 'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
five_prime_records_df = pd.DataFrame(five_prime_records).drop_duplicates()
In [68]:
from Bio import SeqIO
three_prime_records = []
for record in SeqIO.parse(three_prime_cleaned_fasta, "fasta"):
id = record.id
description = record.description.replace(id, '').strip()
chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
chrom = chrom_start_stop.split(':')[0].replace('chr', '')
start,stop = chrom_start_stop.split('-')
stop = int(stop)
start = start.split(':')[1]
start = int(start)
strand = description.split('strand=')[1][0]
three_pad = description.split("5'pad")[1].split(" ")[0]
three_pad = description.split("3'pad")[1].split(" ")[0]
gene_name = id.split('_')[4]
source = ('_').join(id.split('_')[8:10])
three_prime_records.append((chrom, source, 'three_prime_utr', start, stop, '.', strand, '.', 'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
three_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.', 'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
three_prime_records_df = pd.DataFrame(three_prime_records).drop_duplicates()
In [69]:
three_prime_records_df
Out[69]:
In [77]:
os.path.basename(original_gtf)
Out[77]:
In [70]:
original_gtf = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.gtf'
original_gtf_lines = []
with open(original_gtf) as fh:
for line in fh:
original_gtf_lines.append(line)
In [71]:
original_gtf_df = pd.read_csv(original_gtf, sep="\t", skiprows=5, header=None)
original_gtf_df.head()
Out[71]:
In [72]:
gtf_df_combined = pd.concat([original_gtf_df, five_prime_records_df, three_prime_records_df])
In [73]:
gtf_df_combined = gtf_df_combined.sort_values(by=[0, 3,4,2, 6])
In [74]:
gtf_df_combined.to_csv("/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.combined.UTR5UTR3.gtf", sep="\t", index=False, header=False)
In [75]:
gtf_df_combined.head()
Out[75]:
In [ ]: