In [1]:
def reverseComplement(seq):
complementDict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
seq = ''.join([complementDict[i] for i in seq])
return seq[::-1]
import networkx as nx
class deburijnGraph():
def __init__(self, readList = [], k = 4):
self.readList = []
self.k = 5
self.g = nx.MultiDiGraph()
for read in readList:
for i in xrange(len(read) - k + 1):
for kmer in (read[i:i+k], reverseComplement(read[i:i+k])):
head = kmer[:-1]
tail = kmer[1:]
self.g.add_node(head)
self.g.add_node(tail)
self.g.add_edge(head, tail, label=kmer)
In [14]:
fp = open('GASM.txt')
readList = map(lambda x:x.strip(), fp.readlines())
fp.close()
print readList
In [3]:
dG = deburijnGraph(readList)
In [4]:
A=nx.to_agraph(dG.g)
A.graph_attr.update(label='Debruijn graph', fontsize='20.0')
A.node_attr.update(color='#336699', fontcolor='black')
A.edge_attr.update(color='#99CC33', fontcolor='black')
A.layout('circo')
A.draw('deburijnGraph.png')
In [5]:
from IPython import display
display.Image('deburijnGraph.png')
Out[5]:
In [10]:
def is_semieulerian(G):
start, end = None, None
if G.is_directed():
for n in G.nodes_iter():
if G.in_degree(n) +1 == G.out_degree(n):
if start != None: return False
else: start = n
elif G.in_degree(n) == G.out_degree(n)+1:
if end != None: return False
else: end = n
elif G.in_degree(n) != G.out_degree(n):
return False
g = G.copy()
g.add_edge(end, start)
if not nx.is_strongly_connected(g):return False
return True
def semi2eulerian(G):
start = None
end = None
if G.is_directed():
for n in G.nodes_iter():
if G.in_degree(n) +1 == G.out_degree(n):
start = n
elif G.in_degree(n) == G.out_degree(n)+1:
end = n
elif G.in_degree(n) != G.out_degree(n):
return False
G.add_edge(end, start)
return start, end
def eulerPath(g):
startNode = None
path = []
if nx.is_eulerian(g):
cycle = list(nx.eulerian_circuit(g, startNode))
path = cycle
elif is_semieulerian(g):
g = g.copy()
startNode, endNode = semi2eulerian(g)
#print startNode, endNode
#if nx.is_eulerian(g): print g.edges()
cycle = list(nx.eulerian_circuit(g, startNode))
i = cycle.index((endNode, startNode))
path = cycle[i+1:]+cycle[:i+1]
return path
In [23]:
contigs = []
for i, subG in enumerate(nx.weakly_connected_component_subgraphs(dG.g, copy=True)):
path = eulerPath(subG)
superStr = ''
walk = ''
for p in path:
walk += p[0]+'->'
superStr += p[0][0]
superStr += path[-1][0][1:]
print walk[:-2]
print superStr
contigs.append(superStr)
In [29]:
for contig in contigs:
print '-'*25
print '>'+contig
for read in readList:
rev_read = reverseComplement(read)
if read in contig:
i = contig.index(read)
print '+'+' '*i+read
elif rev_read in contig:
i = contig.index(rev_read)
print '-'+' '*i+rev_read
else:
print ' '*len(contig)+read
In [ ]:
In [ ]: