notebook.community



In [5]:

    
%pylab inline
import pandas as pd
import pyfaidx
import os









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
five_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs.fsa'
three_prime_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs.fsa'



In [4]:

    
five_prime_idx = pyfaidx.Fasta(five_prime_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_fasta)









    



---------------------------------------------------------------------------
FastaIndexingError                        Traceback (most recent call last)
<ipython-input-4-fed57beb4bd2> in <module>
----> 1 five_prime_idx = pyfaidx.Fasta(five_prime_fasta)
      2 three_prime_idx = pyfaidx.Fasta(three_prime_fasta)

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in __init__(self, filename, default_seq, key_function, as_raw, strict_bounds, read_ahead, mutable, split_char, filt_function, one_based_attributes, read_long_names, duplicate_action, sequence_always_upper, rebuild, build_index)
    994             sequence_always_upper=sequence_always_upper,
    995             rebuild=rebuild,
--> 996             build_index=build_index)
    997         self.keys = self.faidx.index.keys
    998         if not self.mutable:

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in __init__(self, filename, default_seq, key_function, as_raw, strict_bounds, read_ahead, mutable, split_char, duplicate_action, filt_function, one_based_attributes, read_long_names, sequence_always_upper, rebuild, build_index)
    421                             self.indexname, self.filename), RuntimeWarning)
    422                 elif build_index:
--> 423                     self.build_index()
    424                     self.read_fai()
    425                 else:

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in build_index(self)
    592                     % self.indexname)
    593             elif isinstance(e, FastaIndexingError):
--> 594                 raise e
    595 
    596     def write_fai(self):

/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/pyfaidx/__init__.py in build_index(self)
    535                                     "Inconsistent line found in >{0} at "
    536                                     "line {1:n}.".format(
--> 537                                         rname, bad_lines[0][0] + 1))
    538                             blen = None
    539                             rlen = 0

FastaIndexingError: Line length of fasta file is not consistent! Inconsistent line found in >sacCer3_ct_UserTrack_3545_YAL055W_SC_Transcript_00009455_Miura_2006 at line 21.

Remove random # lines from fasta



In [6]:

    
five_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_5prime_UTRs_cleaned.fsa'
three_prime_cleaned_fasta = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/SGD_all_ORFs/SGD_all_ORFs_3prime_UTRs_cleaned.fsa'



In [9]:

    
lines = []
with open(five_prime_fasta) as fh:
    for line in fh:
        if line[0]!="#":
            lines.append(line)
with open(five_prime_cleaned_fasta, 'w') as fh:
    fh.writelines(lines)



In [10]:

    
lines = []
with open(three_prime_fasta) as fh:
    for line in fh:
        if line[0]!="#":
            lines.append(line)
with open(three_prime_cleaned_fasta, 'w') as fh:
    fh.writelines(lines)



In [11]:

    
five_prime_idx = pyfaidx.Fasta(five_prime_cleaned_fasta)
three_prime_idx = pyfaidx.Fasta(three_prime_cleaned_fasta)



In [67]:

    
from Bio import SeqIO
five_prime_records = []
for record in SeqIO.parse(five_prime_cleaned_fasta, "fasta"):
    id = record.id
    description = record.description.replace(id, '').strip()
    chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
    chrom = chrom_start_stop.split(':')[0].replace('chr', '')
    start,stop = chrom_start_stop.split('-')
    stop = int(stop)
    start = start.split(':')[1]
    start = int(start)
    strand = description.split('strand=')[1][0]
    five_pad = description.split("5'pad")[1].split(" ")[0]
    three_pad = description.split("3'pad")[1].split(" ")[0]
    
    gene_name = id.split('_')[4]
    source = ('_').join(id.split('_')[8:10])
    five_prime_records.append((chrom, source, 'five_prime_utr', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
    five_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
five_prime_records_df = pd.DataFrame(five_prime_records).drop_duplicates()



In [68]:

    
from Bio import SeqIO
three_prime_records = []
for record in SeqIO.parse(three_prime_cleaned_fasta, "fasta"):
    id = record.id
    description = record.description.replace(id, '').strip()
    chrom_start_stop = description.split(" 5'pad")[0].replace('range=', '')
    chrom = chrom_start_stop.split(':')[0].replace('chr', '')
    start,stop = chrom_start_stop.split('-')
    stop = int(stop)
    start = start.split(':')[1]
    start = int(start)
    strand = description.split('strand=')[1][0]
    three_pad = description.split("5'pad")[1].split(" ")[0]
    three_pad = description.split("3'pad")[1].split(" ")[0]
    
    gene_name = id.split('_')[4]
    source = ('_').join(id.split('_')[8:10])
    three_prime_records.append((chrom, source, 'three_prime_utr', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
    three_prime_records.append((chrom, source, 'exon', start, stop, '.', strand, '.',  'gene_id "{}"; transcript_id "{}_mRNA"; gene_source "{}"; gene_biotype "protein_coding"; transcript_biotype "protein_coding";'.format(gene_name, gene_name, source)))
three_prime_records_df = pd.DataFrame(three_prime_records).drop_duplicates()



In [69]:

    
three_prime_records_df









    Out[69]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
    
  
  
    
      0
      I
      Nagalakshmi_2008
      three_prime_utr
      36304
      36420
      .
      +
      .
      gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
    
    
      1
      I
      Nagalakshmi_2008
      exon
      36304
      36420
      .
      +
      .
      gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
    
    
      4
      I
      Yassour_2009
      three_prime_utr
      36304
      36408
      .
      +
      .
      gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
    
    
      5
      I
      Yassour_2009
      exon
      36304
      36408
      .
      +
      .
      gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
    
    
      6
      I
      Xu_2009
      three_prime_utr
      36304
      36393
      .
      +
      .
      gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      35795
      XIII
      Nagalakshmi_2008
      exon
      905836
      905904
      .
      +
      .
      gene_id "YMR316W"; transcript_id "YMR316W_mRNA...
    
    
      35798
      XIII
      Miura_2006
      three_prime_utr
      910966
      911061
      .
      -
      .
      gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
    
    
      35799
      XIII
      Miura_2006
      exon
      910966
      911061
      .
      -
      .
      gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
    
    
      35800
      XIII
      Xu_2009
      three_prime_utr
      910966
      911061
      .
      -
      .
      gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
    
    
      35801
      XIII
      Xu_2009
      exon
      910966
      911061
      .
      -
      .
      gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
    
  

15856 rows × 9 columns



In [77]:

    
os.path.basename(original_gtf)









    Out[77]:





'Saccharomyces_cerevisiae.R64-1-1.96.gtf'

Merge the GTF



In [70]:

    
original_gtf = '/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.gtf'
original_gtf_lines = []
with open(original_gtf) as fh:
    for line in fh:
        original_gtf_lines.append(line)



In [71]:

    
original_gtf_df = pd.read_csv(original_gtf, sep="\t", skiprows=5, header=None)
original_gtf_df.head()









    Out[71]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
    
  
  
    
      0
      IV
      sgd
      gene
      1802
      2953
      .
      +
      .
      gene_id "YDL248W"; gene_source "sgd"; gene_bio...
    
    
      1
      IV
      sgd
      transcript
      1802
      2953
      .
      +
      .
      gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
    
    
      2
      IV
      sgd
      exon
      1802
      2953
      .
      +
      .
      gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
    
    
      3
      IV
      sgd
      CDS
      1802
      2950
      .
      +
      0
      gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
    
    
      4
      IV
      sgd
      start_codon
      1802
      1804
      .
      +
      0
      gene_id "YDL248W"; transcript_id "YDL248W_mRNA...



In [72]:

    
gtf_df_combined = pd.concat([original_gtf_df, five_prime_records_df, three_prime_records_df])



In [73]:

    
gtf_df_combined = gtf_df_combined.sort_values(by=[0, 3,4,2, 6])



In [74]:

    
gtf_df_combined.to_csv("/panfs/qcb-panasas/skchoudh/genomes/R64-1-1/annotation/Saccharomyces_cerevisiae.R64-1-1.96.combined.UTR5UTR3.gtf", sep="\t", index=False, header=False)



In [75]:

    
gtf_df_combined.head()









    Out[75]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
    
  
  
    
      40567
      I
      sgd
      start_codon
      335
      337
      .
      +
      0
      gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
    
    
      40566
      I
      sgd
      CDS
      335
      646
      .
      +
      0
      gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
    
    
      40565
      I
      sgd
      exon
      335
      649
      .
      +
      .
      gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
    
    
      40563
      I
      sgd
      gene
      335
      649
      .
      +
      .
      gene_id "YAL069W"; gene_source "sgd"; gene_bio...
    
    
      40564
      I
      sgd
      transcript
      335
      649
      .
      +
      .
      gene_id "YAL069W"; transcript_id "YAL069W_mRNA...



In [ ]:

	0	1	2	3	4	5	6	7	8
0	I	Nagalakshmi_2008	three_prime_utr	36304	36420	.	+	.	gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
1	I	Nagalakshmi_2008	exon	36304	36420	.	+	.	gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
4	I	Yassour_2009	three_prime_utr	36304	36408	.	+	.	gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
5	I	Yassour_2009	exon	36304	36408	.	+	.	gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
6	I	Xu_2009	three_prime_utr	36304	36393	.	+	.	gene_id "YAL060W"; transcript_id "YAL060W_mRNA...
...	...	...	...	...	...	...	...	...	...
35795	XIII	Nagalakshmi_2008	exon	905836	905904	.	+	.	gene_id "YMR316W"; transcript_id "YMR316W_mRNA...
35798	XIII	Miura_2006	three_prime_utr	910966	911061	.	-	.	gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35799	XIII	Miura_2006	exon	910966	911061	.	-	.	gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35800	XIII	Xu_2009	three_prime_utr	910966	911061	.	-	.	gene_id "YMR318C"; transcript_id "YMR318C_mRNA...
35801	XIII	Xu_2009	exon	910966	911061	.	-	.	gene_id "YMR318C"; transcript_id "YMR318C_mRNA...

	0	1	2	3	4	5	6	7	8
0	IV	sgd	gene	1802	2953	.	+	.	gene_id "YDL248W"; gene_source "sgd"; gene_bio...
1	IV	sgd	transcript	1802	2953	.	+	.	gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
2	IV	sgd	exon	1802	2953	.	+	.	gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
3	IV	sgd	CDS	1802	2950	.	+	0	gene_id "YDL248W"; transcript_id "YDL248W_mRNA...
4	IV	sgd	start_codon	1802	1804	.	+	0	gene_id "YDL248W"; transcript_id "YDL248W_mRNA...

	0	1	2	3	4	5	6	7	8
40567	I	sgd	start_codon	335	337	.	+	0	gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40566	I	sgd	CDS	335	646	.	+	0	gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40565	I	sgd	exon	335	649	.	+	.	gene_id "YAL069W"; transcript_id "YAL069W_mRNA...
40563	I	sgd	gene	335	649	.	+	.	gene_id "YAL069W"; gene_source "sgd"; gene_bio...
40564	I	sgd	transcript	335	649	.	+	.	gene_id "YAL069W"; transcript_id "YAL069W_mRNA...