Strongly Connected Components

The file scc.txt contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge, the vertex label in first column is the tail and the vertex label in second column is the head (recall the graph is directed, and the edges are directed from the first column vertex to the second column vertex). So for example, the 11th row looks liks : "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646

Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100" (without the quotes). If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0" (without the quotes). (Note also that your answer should not have any spaces in it.)

WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.


In [ ]:
FILE = "test_cases/scc_test.txt"

fp = open(FILE, 'r')

edges = []
graph = {} # out_edges
vertices = set()

rev_graph = {} # in_edges

for row in fp.readlines():
    u,v = row.strip().split(" ")
    edges.append([u,v])
        
    graph.setdefault(u, [])
    graph[u].append(v)
    
    rev_graph.setdefault(v, [])
    rev_graph[v].append(u)
    
    vertices.add(u)
    vertices.add(v)

In [ ]:
print (graph)
print (rev_graph["1"])
print (vertices)

Functions that implement DFS iteratively and recursively


In [ ]:
def init_state(graph):
    state = {}
    for k in graph.keys():
        state[k] = 0
    return state

def DFS_iterative(graph, state, i):
    UNEXPLORED = 0
    EXPLORING = 1
    EXPLORED = 2
    
    stack = []
    stack.append(i)

    while (stack):
        v = stack.pop() # remove last
        print ("explore", v)
        if state[v] == EXPLORING: 
            state[v] = EXPLORED
            print ("node {} explored".format(v))
            
        if state[v] == UNEXPLORED:
            state[v] = EXPLORING            
            stack.append(v)
            
            for j in graph[v]:
                if state[j]==0:
                    stack.append(j)
        
def DFS_START_ITERATIVE(graph, s):
    state = init_state(graph)    
    DFS_iterative(graph, state, s)
    print (state)


def DFS(graph, state, s):
    
    state[s] = 1
    for i in graph[s]:
        print (i)
        if state[i] == 0:
            DFS(graph, state, i)
    
    print ("node {} explored".format(s))
     
def DFS_START(graph, s):
    state = init_state(graph)    
    DFS(graph, state, s)
     


# test graph

graph = {
    "1":["4"],
    "2":["8"],
    "3":["6"],
    "4":["7"],
    "5":["2"],
    "6":["9"],
    "7":["1"],
    "8":["6", "5"],
    "9":["7", "3"],
}

DFS_START(graph, "9")

print ("start iterative")
DFS_START_ITERATIVE(graph, "9")

SCC Algorithm

The main class dfs_obj contains the implementation of the Strongly Connected Component algorithm

1) Takes graph and reverse graph as input 2) Run DFS on the reverse graph to generates the finishing times and use them as new vertice labels 3) run DFS on the original graph and calculate the leaders

2 implementations:

  • Iterative: run_DFS_LOOP_ITERATIVE
  • Recursive: run_DFS_LOOP

In [43]:
def reverse_graph(graph):
    rev = {}
    
    for v, k in graph.items():
        for i in k:
            rev.setdefault(i, [])
            rev[i].append(v)
            
    return rev

class dfs_obj(object):
    UNEXPLORED = 0
    EXPLORING = 1
    EXPLORED = 2


    def __init__(self, graph, rev_graph, vertices):
        
        self.graph = graph
        self.rev_graph = rev_graph
        self.vertices = vertices
        
        self.state = {}
        self.leaders = {}
        self.finish_time = {} #old label -> new label
        self.finish_time_rev = {} #new label -> old label


        self.t = 0
        self.s = None
        
    def init_state(self):
        state = {}
        for k in self.vertices:
            state[k] = dfs_obj.UNEXPLORED
        return state

    """
        ITERATIVE IMPLEMENTATION        
    """ 
    def DFS_ITERATIVE_1(self, i):

        #print ("node {} explored".format(i))
        #self.leaders[i] = self.s        

        stack = []
        stack.append(i)

        while (stack):
            v = stack.pop() # remove last

            if self.state[v] == dfs_obj.EXPLORING: 
                self.state[v] = dfs_obj.EXPLORED
                self.t += 1
                self.finish_time[v] = str(self.t)

            if self.state[v] == dfs_obj.UNEXPLORED:
                self.state[v] = dfs_obj.EXPLORING            
                stack.append(v)

                if v in self.rev_graph:
                    for j in self.rev_graph[v]:
                        if self.state[j] == dfs_obj.UNEXPLORED:
                            stack.append(j)                        

    def DFS_ITERATIVE_LOOP1(self):
        n = len(self.vertices)
    
        for i in range(n, 0, -1):
            i = str(i)
            
            if self.state[i] != dfs_obj.EXPLORED:
                #print("loop1 explore", i)
                self.s = i
                self.DFS_ITERATIVE_1(i)
       
    
    def DFS_ITERATIVE_2(self,i):
        # i is the new label
        
        stack = []
        stack.append(i)

        while (stack):
            v = stack.pop() # remove last

            if self.state[v] == dfs_obj.EXPLORING: 
                self.state[v] = dfs_obj.EXPLORED
                
                # leaders refer to old label
                self.leaders[self.finish_time_rev[v]] = self.finish_time_rev[self.s]


            if self.state[v] == dfs_obj.UNEXPLORED:
                self.state[v] = dfs_obj.EXPLORING            
                stack.append(v)

                if v in self.finish_time_rev and self.finish_time_rev[v] in self.graph:
                    for j in self.graph[self.finish_time_rev[v]]:
                        j = self.finish_time[j]

                        if self.state[j] == dfs_obj.UNEXPLORED:
                            stack.append(j)                        


    def DFS_ITERATIVE_LOOP2(self):
        n = len(self.vertices)
    
        for i in range(n, 0, -1):
            # iterate of the *new* labels
            i = str(i)
            
            if self.state[i] != dfs_obj.EXPLORED:
                self.s = i
                self.DFS_ITERATIVE_2(i)
    """
        RECURSIVE IMPLEMENTATION        
    """            
    def DFS1(self, i):

        # Set Explored 
        self.state[i] = 1
        
        #print ("node {} explored".format(i))        
        if i in self.rev_graph:
            for j in self.rev_graph[i]:
                if self.state[j] == 0:
                    self.DFS1(j)

        self.t += 1
        self.finish_time[i] = str(self.t)
        

    def DFS_LOOP1(self):
        self.t = 0
        n = len(self.vertices)
    
        for i in range(n, 0, -1):
            i = str(i)
            
            if self.state[i] == 0:
                self.DFS1(i)

                
    def DFS2(self,i):

        self.state[i] = 1
        #print ("node {} explored".format(i))
        
        self.leaders[self.finish_time_rev[i]] = self.finish_time_rev[self.s]

        for j in self.graph[self.finish_time_rev[i]]:
            #print (self.finish_time)
            j = self.finish_time[j]
            
            if self.state[j] == 0:
                self.DFS2(j)


    def DFS_LOOP2(self):
        n = len(self.vertices)
    
        for i in range(n, 0, -1):
            # iterate of the *new* labels
            i = str(i)
            
            if self.state[i] == 0:
                self.s = i
                self.DFS2(i)

    def run_DFS_LOOP(self):
        print (self.graph, self.rev_graph)
        print("step 1")
        self.state = self.init_state()
        self.DFS_LOOP1()

        print ("finish_time", self.finish_time)
        
        print ("step 2")
        self.finish_time_rev = {str(d):str(k) for k,d in self.finish_time.items()}

        self.state = self.init_state()
#         print (self.finish_time_rev)
        self.DFS_LOOP2()
    
    def run_DFS_LOOP_ITERATIVE(self):
        #print("step 1")
        self.state = self.init_state()
        
        self.DFS_ITERATIVE_LOOP1()
        
        #print ("step 2")
        self.finish_time_rev = {str(d):str(k) for k,d in self.finish_time.items()}

        self.state = self.init_state()

#         print (self.finish_time_rev)
        self.DFS_ITERATIVE_LOOP2()

In [ ]:
# Test Graph
test_graph = {
    "1":["4"],
    "2":["8"],
    "3":["6"],
    "4":["7"],
    "5":["2"],
    "6":["9"],
    "7":["1"],
    "9":["7", "3"],
    "8":["6", "5"]
}

vertices = set(["1", "2", "3", "4", "5", "6", "7", "8", "9"])
#print (test_graph, rev_test_graph)

rev_test_graph = reverse_graph(test_graph)

g = dfs_obj(test_graph, rev_test_graph, vertices)

g.run_DFS_LOOP_ITERATIVE()
#print ("states",g.state)
#print ("finish times", g.finish_time)
print ("leaders", g.leaders)

Rank SCC from largest to smallest

This function runs the SCC algorithm and ranks the output by component size, returning the five largest


In [44]:
def find_scc(graph, rev_graph, vertices):
    g = dfs_obj(graph=graph, rev_graph=rev_graph, vertices=vertices)
    
    #print (id(g), id(graph), id(rev_graph), id(vertices))
    # g.DFS_LOOP()
    g.run_DFS_LOOP_ITERATIVE()
    
    #print ("states",g.state)
#     print ("finish times", g.finish_time)
#     print ("leaders", g.leaders)

    count={}
    for key, l in g.leaders.items():
        count.setdefault(l, 0)
        count[l] += 1

    sorted_count_keys = sorted(count, key=lambda x: count[x], reverse=True)

    res = [0, 0 ,0 ,0 ,0]
    for i, k in enumerate(sorted_count_keys):
        if i > len(res)-1:
            break

        res[i] = count[k]

    return res

In [46]:
import os
 
def solution_match(out, sol):
    for k,i in zip(out, sol):
        if str(k) != str(i):
            return False
    return True

def load_file(url):
    print (FILE)
    fp = open(FILE, 'r')

    edges = []
    graph = {} # out_edges
    vertices = set()

    rev_graph = {} # in_edges

    for row in fp.readlines():
        u,v = row.strip().split(" ")
        edges.append([u,v])

        graph.setdefault(u, [])
        graph[u].append(v)

        rev_graph.setdefault(v, [])
        rev_graph[v].append(u)

        vertices.add(u)
        vertices.add(v)
        
    return graph, rev_graph, vertices

for f in os.listdir("test_cases"):
    FILE = "test_cases/"+ f
    graph, rev_graph, vertices = load_file(FILE)
    
    scc = find_scc(graph, rev_graph, vertices)

#     print ("scc", scc)
    
    SOL_FILE = "test_cases_solution/sol_"+ f
    solution = open(SOL_FILE, 'r').readlines()[0].split(",")
    assert solution_match(scc,solution)==True, "Solution don't match! {} {}".format(scc, solution)

print ("All Tests PASS")


test_cases/scc_test.txt
test_cases/scc_test2.txt
test_cases/scc_test3.txt
test_cases/scc_test4.txt
test_cases/scc_test5.txt
All Tests PASS

Validate the Code over some test cases


In [29]:
FILE = "test_cases/scc_test.txt"

fp = open(FILE, 'r')

edges = []
graph = {} # out_edges
vertices = set()

rev_graph = {} # in_edges

for row in fp.readlines():
    u,v = row.strip().split(" ")
    edges.append([u,v])

    graph.setdefault(u, [])
    graph[u].append(v)

    rev_graph.setdefault(v, [])
    rev_graph[v].append(u)

    vertices.add(u)
    vertices.add(v)

    
g = dfs_obj(graph=graph, rev_graph=rev_graph, vertices=vertices)

print (g)
# g.DFS_LOOP()
g.run_DFS_LOOP_ITERATIVE()

#print ("states",g.state)
#print ("finish times", g.finish_time)
print ("leaders", g.leaders)

count={}
for key, l in g.leaders.items():
    count.setdefault(l, 0)
    count[l] += 1

sorted_count_keys = sorted(count, key=lambda x: count[x], reverse=True)

res = [0, 0 ,0 ,0 ,0]
for i, k in enumerate(sorted_count_keys):
    if i > len(res)-1:
        break

    res[i] = count[k]

print (res)


<__main__.dfs_obj object at 0x10fa0b898>
step 1
step 2
leaders {'3': '6', '12': '12', '11': '12', '5': '5', '6': '6', '10': '12', '2': '5', '7': '12', '8': '12', '9': '12', '1': '1', '4': '5'}
[6, 3, 2, 1, 0]

In [ ]:


In [16]:



[1, 2, 3] [10, 3]

In [ ]: