Deburijn graph class


In [1]:
def reverseComplement(seq):
    complementDict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    seq = ''.join([complementDict[i] for i in seq])
    return seq[::-1]

import networkx as nx
class deburijnGraph():
    def __init__(self, readList = [], k = 4):
        
        self.readList = []
        self.k = 5
        self.g = nx.MultiDiGraph()
        for read in readList:
            for i in xrange(len(read) - k + 1):
                for kmer in (read[i:i+k], reverseComplement(read[i:i+k])):
                    head = kmer[:-1]
                    tail = kmer[1:]
                    self.g.add_node(head)
                    self.g.add_node(tail)
                    self.g.add_edge(head, tail, label=kmer)

Read reads


In [14]:
fp = open('GASM.txt')
readList = map(lambda x:x.strip(), fp.readlines())
fp.close()
print readList


['AATCT', 'TGTAA', 'GATTA', 'ACAGA']

Build deburijn graph


In [3]:
dG = deburijnGraph(readList)

Graphviz deburijn graph


In [4]:
A=nx.to_agraph(dG.g)
A.graph_attr.update(label='Debruijn graph', fontsize='20.0')
A.node_attr.update(color='#336699', fontcolor='black')
A.edge_attr.update(color='#99CC33', fontcolor='black')
A.layout('circo')
A.draw('deburijnGraph.png')

Display graph


In [5]:
from IPython import display
display.Image('deburijnGraph.png')


Out[5]:

Generate Euler path


In [10]:
def is_semieulerian(G):
    start, end = None, None
    if G.is_directed():
        for n in G.nodes_iter():
            if G.in_degree(n) +1 == G.out_degree(n):
                if start != None: return False
                else: start = n
            elif G.in_degree(n) == G.out_degree(n)+1:
                if end != None: return False
                else: end = n
            elif G.in_degree(n) != G.out_degree(n):
                return False
        g = G.copy()
        g.add_edge(end, start)
        if not nx.is_strongly_connected(g):return False
    return True
def semi2eulerian(G):
    start = None
    end = None
    if G.is_directed():
        for n in G.nodes_iter():
            if G.in_degree(n) +1 == G.out_degree(n):
                start = n
            elif G.in_degree(n) == G.out_degree(n)+1:
                end = n
            elif G.in_degree(n) != G.out_degree(n):
                return False
    G.add_edge(end, start)
    return start, end
def eulerPath(g):
    startNode = None
    path = []
    if nx.is_eulerian(g):
        cycle = list(nx.eulerian_circuit(g, startNode))
        path = cycle
    elif is_semieulerian(g):
        g = g.copy()

        startNode, endNode = semi2eulerian(g)
        #print startNode, endNode
        #if nx.is_eulerian(g): print g.edges()
        cycle =  list(nx.eulerian_circuit(g, startNode))
        i = cycle.index((endNode, startNode))
        path = cycle[i+1:]+cycle[:i+1]
    return path

In [23]:
contigs = []
for i, subG in enumerate(nx.weakly_connected_component_subgraphs(dG.g, copy=True)):
    path = eulerPath(subG)
    superStr = ''
    walk = ''
    for p in path:
        walk += p[0]+'->'
        superStr += p[0][0]
    superStr += path[-1][0][1:]
    print walk[:-2]
    print superStr
    contigs.append(superStr)


AAT->ATC->TCT->CTG->TGT->GTA->TAA->AAT->ATC
AATCTGTAATC
GAT->ATT->TTA->TAC->ACA->CAG->AGA->GAT->ATT
GATTACAGATT

Map reads to contig


In [29]:
for contig in contigs:
    print '-'*25
    print '>'+contig
    for read in readList:
        rev_read = reverseComplement(read)
        if read in contig:
            i = contig.index(read)
            print '+'+' '*i+read
        elif rev_read in contig:
            i = contig.index(rev_read)
            print '-'+' '*i+rev_read
        else:
            print ' '*len(contig)+read


-------------------------
>AATCTGTAATC
+AATCT
+    TGTAA
-      TAATC
-  TCTGT
-------------------------
>GATTACAGATT
-      AGATT
-  TTACA
+GATTA
+    ACAGA

In [ ]:


In [ ]: