New tasks:

  • make a function/object that read a fasta file from disk and (header, seq) pairs +
  • ex from:

    AB003409.1/96-167 GGGCCCAUAGCUCAGUGGUAGAGUGCCUCCUUUGCAAGGAGGAUGCCCUGGGUUCGAAUC comment CCAGUGGGUCCA AB009835.1/1-71 CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU

AJGHDJHASGDJAS khsk skdjfhskdj slkshd skhksjdf CACGUAGCAUGCUAGCAUGCUAGCAUGCUAGCUAGCUGAC 276512764523765423764527365427365427542735427 CAUCGUAGCUAGCUAGCUAGCUACG AUCGUAGUAGCUAGCUAGCUAGCUAGC

  • yield: (AB003409.1/96-167, GGGCCCAUAGCUCAGUGGUAGAGUGCCUCCUUUGCAAGGAGGAUGCCCUGGGUUCGAAUCCCAGUGGGUCCA) (AB009835.1/1-71,CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU) (AJGHDJHASGDJAS khsk skdjfhskdj slkshd skhksjdf, CACGUAGCAUGCUAGCAUGCUAGCAUGCUAGCUAGCUGACCAUCGUAGCUAGCUAGCUAGCUACGAUCGUAGUAGCUAGCUAGCUAGCUAGC)

  • separately:

  • make a function that receives in input the list of sequences, and yields structure graphs (use RNAfold)

In [ ]:
%matplotlib inline
import os, sys
import subprocess as sp
from itertools import cycle
import networkx as nx
import re
from eden.util import display

In [ ]:
# read a fasta file separate the head and the sequence
def _readFastaFile(file_path=None):
    head_start = '>'
    head = []
    seq = []
    seq_temps = []
    string_seq = ''        
    
    #for file in os.listdir(path): #open file
    read_file = open(file_path,'r') 
    
    for line in read_file:
        lines = list(line)
            # the read line is the head of the sequence write it in head list
        if lines[0] == head_start:
            line = line.strip('\n')
            line = line.strip(head_start)
            head.append(line)
            seq.append(string_seq)
            seq_temps = []

            # the read line is a sequence write it in a sequence list
            # remove the unwanted charachters and whitespace, tab
        if lines[0] != head_start:
            line = line.strip()
            line = re.sub(r'\ .*?\ ', '', line)
            seq_temps.append(line)
            string_seq= ''.join(seq_temps)
            print ('string_seq', string_seq)
            string_seq = re.sub(r' ', '',string_seq)                
    seq.append(string_seq)
    #to remove empty head or seq
    seq = filter(None, seq)
    head_seq_zip = zip(head, seq)
    print ('Sequences with comments', head_seq_zip)
    return head_seq_zip

In [ ]:
file_path = "/home/alsheikm/GitDir/EeDN_work/fasta/test2"
def _sequeceWrapper(file_path=None):
    #path = "/home/alsheikm/Work/EDeN_examples/fastaFiles/"
    zip_head_seqs = _readFastaFile(file_path)
    print file_path
    return zip_head_seqs
    
def _fold(seq):
    head, seq, struc = _get_sequence_structure(seq)
    #G = self._make_graph(seq, struc)
    return  head, seq, struc

Get the sequence structure


In [ ]:
#call RNAfold to get the sequence structure
def _get_sequence_structure(seqs):
    if mode == 'RNAfold':
        return _rnafold_wrapper(seqs)
    else:
        raise Exception('Not known: %s'% self.mode)
    
def _rnafold_wrapper(sequence):
    head = sequence[0]
    seq = sequence[1].split()[0]
    flags='--noPS'
    cmd = 'echo "%s" | RNAfold %s' % (seq, flags)
    out = sp.check_output(cmd, shell=True)
    #print out
    text = out.strip().split('\n')
    print ('text:', text)
    seq = text[0]
    struc = text[1].split()[0]
    return head, seq, struc

Build the Graph


In [ ]:
#Recognize basepairs and add them to the generated graph
def _make_graph(head, seq, struc):
    print ("Graph title", head)
    open_pran = "("
    close_pran = ")"
    stack_o = []
    stack_c = []
    G = nx.Graph()
    seq_struc_zip = zip(seq, struc)
    #print seq_struc_zip
    for i, k in enumerate(struc):
        G.add_node(i, label = seq[i])
        # connect with the next node
        if i > 0:
            G.add_edge(i-1, i, label= 'x')
            
        # find basepair and connect them
        if struc[i] == open_pran:
            j = i
            stack_o.append(struc[j])
            open_len = len(stack_o)

        if struc[i] == close_pran:
            stack_c.append(struc[i])
            stack_o.pop()
            G.add_edge(i, j, label = 'b')
            j = j-1

    return G

Experiment


In [ ]:
#generating the graph
#seq,seqs are Not correct they do Not take the zipped output

zip_head_seqs= _sequeceWrapper(file_path)
print ('zip_head_seqs here', zip_head_seqs)
for i, seq in enumerate(zip_head_seqs):
    heads = seq[0]
    seq1 = seq[1]
    mode = 'RNAfold'
    head, seq, struc =_fold(seq)
    G = _make_graph(head, seq, struc)
    display.draw_graph(G, node_size=180, font_size=9, node_border=True, prog='neato')

Note