In [5]:
%pylab inline
import pandas as pd
import pyfaidx
import os


Populating the interactive namespace from numpy and matplotlib

In [2]:
five_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs.fsa'
three_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs.fsa'

In [4]:
five_prime_idx = pyfaidx.Fasta(five_prime_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_fasta)


---------------------------------------------------------------------------
FastaIndexingError                        Traceback (most recent call last)
<ipython-input-4-fed57beb4bd2> in <module>
----> 1 five_prime_idx = pyfaidx.Fasta(five_prime_fasta)
      2 three_prime_idx = pyfaidx.Fasta(three_prime_fasta)

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in __init__(self, filename, default_seq, key_function, as_raw, strict_bounds, read_ahead, mutable, split_char, filt_function, one_based_attributes, read_long_names, duplicate_action, sequence_always_upper, rebuild, build_index)
    994             sequence_always_upper=sequence_always_upper,
    995             rebuild=rebuild,
--> 996             build_index=build_index)
    997         self.keys = self.faidx.index.keys
    998         if not self.mutable:

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in __init__(self, filename, default_seq, key_function, as_raw, strict_bounds, read_ahead, mutable, split_char, duplicate_action, filt_function, one_based_attributes, read_long_names, sequence_always_upper, rebuild, build_index)
    421                             self.indexname, self.filename), RuntimeWarning)
    422                 elif build_index:
--> 423                     self.build_index()
    424                     self.read_fai()
    425                 else:

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in build_index(self)
    592                     % self.indexname)
    593             elif isinstance(e, FastaIndexingError):
--> 594                 raise e
    595 
    596     def write_fai(self):

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in build_index(self)
    535                                     "Inconsistent line found in >{0} at "
    536                                     "line {1:n}.".format(
--> 537                                         rname, bad_lines[0][0] + 1))
    538                             blen = None
    539                             rlen = 0

FastaIndexingError: Line length of fasta file is not consistent! Inconsistent line found in >sacCer3_ct_UserTrack_3545_YAL055W_SC_Transcript_00009455_Miura_2006 at line 21.

Remove random # lines from fasta


In [6]:
five_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs_cleaned.fsa'
three_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs_cleaned.fsa'

In [9]:
lines = []
with open(five_prime_fasta) as fh:
    for line in fh:
        if line[0]!="#":
            lines.append(line)
with open(five_prime_cleaned_fasta, 'w') as fh:
    fh.writelines(lines)

In [10]:
lines = []
with open(three_prime_fasta) as fh:
    for line in fh:
        if line[0]!="#":
            lines.append(line)
with open(three_prime_cleaned_fasta, 'w') as fh:
    fh.writelines(lines)

In [11]:
five_prime_idx = pyfaidx.Fasta(five_prime_cleaned_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_cleaned_fasta)

In [67]:
from Bio import SeqIO
five_prime_records = []
for record in SeqIO.parse(five_prime_cleaned_fasta, "fasta"):
    id = record.id
    description = record.description.replace(id, '').strip()
    chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
    chrom = chrom_start_stop.split(':')[0].replace('chr', '')
    start,stop = chrom_start_stop.split('-')
    stop = int(stop)
    start = start.split(':')[1]
    start = int(start)
    strand = description.split('strand=')[1][0]
    five_pad = description.split("5'pad")[1].split(" ")[0]
    three_pad = description.split("3'pad")[1].split(" ")[0]
    
    gene_name = id.split('_')[4]
    source = ('_').join(id.split('_')[8:10])
    five_prime_records.append((chrom, source, 'five_prime_utr', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
    five_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
five_prime_records_df = pd.DataFrame(five_prime_records).drop_duplicates()

In [68]:
from Bio import SeqIO
three_prime_records = []
for record in SeqIO.parse(three_prime_cleaned_fasta, "fasta"):
    id = record.id
    description = record.description.replace(id, '').strip()
    chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
    chrom = chrom_start_stop.split(':')[0].replace('chr', '')
    start,stop = chrom_start_stop.split('-')
    stop = int(stop)
    start = start.split(':')[1]
    start = int(start)
    strand = description.split('strand=')[1][0]
    three_pad = description.split("5'pad")[1].split(" ")[0]
    three_pad = description.split("3'pad")[1].split(" ")[0]
    
    gene_name = id.split('_')[4]
    source = ('_').join(id.split('_')[8:10])
    three_prime_records.append((chrom, source, 'three_prime_utr', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
    three_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
three_prime_records_df = pd.DataFrame(three_prime_records).drop_duplicates()

In [69]:
three_prime_records_df


Out[69]:
0 1 2 3 4 5 6 7 8
0 I Nagalakshmi_2008 three_prime_utr 36304 36420 . + . gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
1 I Nagalakshmi_2008 exon 36304 36420 . + . gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
4 I Yassour_2009 three_prime_utr 36304 36408 . + . gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
5 I Yassour_2009 exon 36304 36408 . + . gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
6 I Xu_2009 three_prime_utr 36304 36393 . + . gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
... ... ... ... ... ... ... ... ... ...
35795 XIII Nagalakshmi_2008 exon 905836 905904 . + . gene_id "YMR316W"; transcript_id "YMR316W_mRNA...
35798 XIII Miura_2006 three_prime_utr 910966 911061 . - . gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35799 XIII Miura_2006 exon 910966 911061 . - . gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35800 XIII Xu_2009 three_prime_utr 910966 911061 . - . gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35801 XIII Xu_2009 exon 910966 911061 . - . gene_id "YMR318C"; transcript_id "YMR318C_mRNA...

15856 rows × 9 columns


In [77]:
os.path.basename(original_gtf)


Out[77]:
'Saccharomyces_cerevisiae.R64-1-1.96.gtf'

Merge the GTF


In [70]:
original_gtf = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.gtf'
original_gtf_lines = []
with open(original_gtf) as fh:
    for line in fh:
        original_gtf_lines.append(line)

In [71]:
original_gtf_df = pd.read_csv(original_gtf, sep="\t", skiprows=5, header=None)
original_gtf_df.head()


Out[71]:
0 1 2 3 4 5 6 7 8
0 IV sgd gene 1802 2953 . + . gene_id "YDL248W"; gene_source "sgd"; gene_bio...
1 IV sgd transcript 1802 2953 . + . gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
2 IV sgd exon 1802 2953 . + . gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
3 IV sgd CDS 1802 2950 . + 0 gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
4 IV sgd start_codon 1802 1804 . + 0 gene_id "YDL248W"; transcript_id "YDL248W_mRNA...

In [72]:
gtf_df_combined = pd.concat([original_gtf_df, five_prime_records_df, three_prime_records_df])

In [73]:
gtf_df_combined = gtf_df_combined.sort_values(by=[0, 3,4,2, 6])

In [74]:
gtf_df_combined.to_csv("/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.combined.UTR5UTR3.gtf", sep="\t", index=False, header=False)

In [75]:
gtf_df_combined.head()


Out[75]:
0 1 2 3 4 5 6 7 8
40567 I sgd start_codon 335 337 . + 0 gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40566 I sgd CDS 335 646 . + 0 gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40565 I sgd exon 335 649 . + . gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40563 I sgd gene 335 649 . + . gene_id "YAL069W"; gene_source "sgd"; gene_bio...
40564 I sgd transcript 335 649 . + . gene_id "YAL069W"; transcript_id "YAL069W_mRNA...

In [ ]: