AB003409.1/96-167 GGGCCCAUAGCUCAGUGGUAGAGUGCCUCCUUUGCAAGGAGGAUGCCCUGGGUUCGAAUC comment CCAGUGGGUCCA AB009835.1/1-71 CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU
AJGHDJHASGDJAS khsk skdjfhskdj slkshd skhksjdf CACGUAGCAUGCUAGCAUGCUAGCAUGCUAGCUAGCUGAC 276512764523765423764527365427365427542735427 CAUCGUAGCUAGCUAGCUAGCUACG AUCGUAGUAGCUAGCUAGCUAGCUAGC
yield: (AB003409.1/96-167, GGGCCCAUAGCUCAGUGGUAGAGUGCCUCCUUUGCAAGGAGGAUGCCCUGGGUUCGAAUCCCAGUGGGUCCA) (AB009835.1/1-71,CAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUUCAUUAGAUGACUGAAAGCAAGUACUGGUCUCUUAAACCAUUUAAUAGUAAAUacagugcCUU) (AJGHDJHASGDJAS khsk skdjfhskdj slkshd skhksjdf, CACGUAGCAUGCUAGCAUGCUAGCAUGCUAGCUAGCUGACCAUCGUAGCUAGCUAGCUAGCUACGAUCGUAGUAGCUAGCUAGCUAGCUAGC)
separately:
In [ ]:
%matplotlib inline
import os, sys
import subprocess as sp
from itertools import cycle
import networkx as nx
import re
from eden.util import display
In [ ]:
# read a fasta file separate the head and the sequence
def _readFastaFile(file_path=None):
head_start = '>'
head = []
seq = []
seq_temps = []
string_seq = ''
#for file in os.listdir(path): #open file
read_file = open(file_path,'r')
for line in read_file:
lines = list(line)
# the read line is the head of the sequence write it in head list
if lines[0] == head_start:
line = line.strip('\n')
line = line.strip(head_start)
head.append(line)
seq.append(string_seq)
seq_temps = []
# the read line is a sequence write it in a sequence list
# remove the unwanted charachters and whitespace, tab
if lines[0] != head_start:
line = line.strip()
line = re.sub(r'\ .*?\ ', '', line)
seq_temps.append(line)
string_seq= ''.join(seq_temps)
print ('string_seq', string_seq)
string_seq = re.sub(r' ', '',string_seq)
seq.append(string_seq)
#to remove empty head or seq
seq = filter(None, seq)
head_seq_zip = zip(head, seq)
print ('Sequences with comments', head_seq_zip)
return head_seq_zip
In [ ]:
file_path = "/home/alsheikm/GitDir/EeDN_work/fasta/test2"
def _sequeceWrapper(file_path=None):
#path = "/home/alsheikm/Work/EDeN_examples/fastaFiles/"
zip_head_seqs = _readFastaFile(file_path)
print file_path
return zip_head_seqs
def _fold(seq):
head, seq, struc = _get_sequence_structure(seq)
#G = self._make_graph(seq, struc)
return head, seq, struc
In [ ]:
#call RNAfold to get the sequence structure
def _get_sequence_structure(seqs):
if mode == 'RNAfold':
return _rnafold_wrapper(seqs)
else:
raise Exception('Not known: %s'% self.mode)
def _rnafold_wrapper(sequence):
head = sequence[0]
seq = sequence[1].split()[0]
flags='--noPS'
cmd = 'echo "%s" | RNAfold %s' % (seq, flags)
out = sp.check_output(cmd, shell=True)
#print out
text = out.strip().split('\n')
print ('text:', text)
seq = text[0]
struc = text[1].split()[0]
return head, seq, struc
In [ ]:
#Recognize basepairs and add them to the generated graph
def _make_graph(head, seq, struc):
print ("Graph title", head)
open_pran = "("
close_pran = ")"
stack_o = []
stack_c = []
G = nx.Graph()
seq_struc_zip = zip(seq, struc)
#print seq_struc_zip
for i, k in enumerate(struc):
G.add_node(i, label = seq[i])
# connect with the next node
if i > 0:
G.add_edge(i-1, i, label= 'x')
# find basepair and connect them
if struc[i] == open_pran:
j = i
stack_o.append(struc[j])
open_len = len(stack_o)
if struc[i] == close_pran:
stack_c.append(struc[i])
stack_o.pop()
G.add_edge(i, j, label = 'b')
j = j-1
return G
In [ ]:
#generating the graph
#seq,seqs are Not correct they do Not take the zipped output
zip_head_seqs= _sequeceWrapper(file_path)
print ('zip_head_seqs here', zip_head_seqs)
for i, seq in enumerate(zip_head_seqs):
heads = seq[0]
seq1 = seq[1]
mode = 'RNAfold'
head, seq, struc =_fold(seq)
G = _make_graph(head, seq, struc)
display.draw_graph(G, node_size=180, font_size=9, node_border=True, prog='neato')