The file scc.txt contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge, the vertex label in first column is the tail and the vertex label in second column is the head (recall the graph is directed, and the edges are directed from the first column vertex to the second column vertex). So for example, the 11th row looks liks : "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646
Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.
Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100" (without the quotes). If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0" (without the quotes). (Note also that your answer should not have any spaces in it.)
WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.
In [ ]:
FILE = "test_cases/scc_test.txt"
fp = open(FILE, 'r')
edges = []
graph = {} # out_edges
vertices = set()
rev_graph = {} # in_edges
for row in fp.readlines():
u,v = row.strip().split(" ")
edges.append([u,v])
graph.setdefault(u, [])
graph[u].append(v)
rev_graph.setdefault(v, [])
rev_graph[v].append(u)
vertices.add(u)
vertices.add(v)
In [ ]:
print (graph)
print (rev_graph["1"])
print (vertices)
In [ ]:
def init_state(graph):
state = {}
for k in graph.keys():
state[k] = 0
return state
def DFS_iterative(graph, state, i):
UNEXPLORED = 0
EXPLORING = 1
EXPLORED = 2
stack = []
stack.append(i)
while (stack):
v = stack.pop() # remove last
print ("explore", v)
if state[v] == EXPLORING:
state[v] = EXPLORED
print ("node {} explored".format(v))
if state[v] == UNEXPLORED:
state[v] = EXPLORING
stack.append(v)
for j in graph[v]:
if state[j]==0:
stack.append(j)
def DFS_START_ITERATIVE(graph, s):
state = init_state(graph)
DFS_iterative(graph, state, s)
print (state)
def DFS(graph, state, s):
state[s] = 1
for i in graph[s]:
print (i)
if state[i] == 0:
DFS(graph, state, i)
print ("node {} explored".format(s))
def DFS_START(graph, s):
state = init_state(graph)
DFS(graph, state, s)
# test graph
graph = {
"1":["4"],
"2":["8"],
"3":["6"],
"4":["7"],
"5":["2"],
"6":["9"],
"7":["1"],
"8":["6", "5"],
"9":["7", "3"],
}
DFS_START(graph, "9")
print ("start iterative")
DFS_START_ITERATIVE(graph, "9")
The main class dfs_obj contains the implementation of the Strongly Connected Component algorithm
1) Takes graph and reverse graph as input 2) Run DFS on the reverse graph to generates the finishing times and use them as new vertice labels 3) run DFS on the original graph and calculate the leaders
2 implementations:
In [43]:
def reverse_graph(graph):
rev = {}
for v, k in graph.items():
for i in k:
rev.setdefault(i, [])
rev[i].append(v)
return rev
class dfs_obj(object):
UNEXPLORED = 0
EXPLORING = 1
EXPLORED = 2
def __init__(self, graph, rev_graph, vertices):
self.graph = graph
self.rev_graph = rev_graph
self.vertices = vertices
self.state = {}
self.leaders = {}
self.finish_time = {} #old label -> new label
self.finish_time_rev = {} #new label -> old label
self.t = 0
self.s = None
def init_state(self):
state = {}
for k in self.vertices:
state[k] = dfs_obj.UNEXPLORED
return state
"""
ITERATIVE IMPLEMENTATION
"""
def DFS_ITERATIVE_1(self, i):
#print ("node {} explored".format(i))
#self.leaders[i] = self.s
stack = []
stack.append(i)
while (stack):
v = stack.pop() # remove last
if self.state[v] == dfs_obj.EXPLORING:
self.state[v] = dfs_obj.EXPLORED
self.t += 1
self.finish_time[v] = str(self.t)
if self.state[v] == dfs_obj.UNEXPLORED:
self.state[v] = dfs_obj.EXPLORING
stack.append(v)
if v in self.rev_graph:
for j in self.rev_graph[v]:
if self.state[j] == dfs_obj.UNEXPLORED:
stack.append(j)
def DFS_ITERATIVE_LOOP1(self):
n = len(self.vertices)
for i in range(n, 0, -1):
i = str(i)
if self.state[i] != dfs_obj.EXPLORED:
#print("loop1 explore", i)
self.s = i
self.DFS_ITERATIVE_1(i)
def DFS_ITERATIVE_2(self,i):
# i is the new label
stack = []
stack.append(i)
while (stack):
v = stack.pop() # remove last
if self.state[v] == dfs_obj.EXPLORING:
self.state[v] = dfs_obj.EXPLORED
# leaders refer to old label
self.leaders[self.finish_time_rev[v]] = self.finish_time_rev[self.s]
if self.state[v] == dfs_obj.UNEXPLORED:
self.state[v] = dfs_obj.EXPLORING
stack.append(v)
if v in self.finish_time_rev and self.finish_time_rev[v] in self.graph:
for j in self.graph[self.finish_time_rev[v]]:
j = self.finish_time[j]
if self.state[j] == dfs_obj.UNEXPLORED:
stack.append(j)
def DFS_ITERATIVE_LOOP2(self):
n = len(self.vertices)
for i in range(n, 0, -1):
# iterate of the *new* labels
i = str(i)
if self.state[i] != dfs_obj.EXPLORED:
self.s = i
self.DFS_ITERATIVE_2(i)
"""
RECURSIVE IMPLEMENTATION
"""
def DFS1(self, i):
# Set Explored
self.state[i] = 1
#print ("node {} explored".format(i))
if i in self.rev_graph:
for j in self.rev_graph[i]:
if self.state[j] == 0:
self.DFS1(j)
self.t += 1
self.finish_time[i] = str(self.t)
def DFS_LOOP1(self):
self.t = 0
n = len(self.vertices)
for i in range(n, 0, -1):
i = str(i)
if self.state[i] == 0:
self.DFS1(i)
def DFS2(self,i):
self.state[i] = 1
#print ("node {} explored".format(i))
self.leaders[self.finish_time_rev[i]] = self.finish_time_rev[self.s]
for j in self.graph[self.finish_time_rev[i]]:
#print (self.finish_time)
j = self.finish_time[j]
if self.state[j] == 0:
self.DFS2(j)
def DFS_LOOP2(self):
n = len(self.vertices)
for i in range(n, 0, -1):
# iterate of the *new* labels
i = str(i)
if self.state[i] == 0:
self.s = i
self.DFS2(i)
def run_DFS_LOOP(self):
print (self.graph, self.rev_graph)
print("step 1")
self.state = self.init_state()
self.DFS_LOOP1()
print ("finish_time", self.finish_time)
print ("step 2")
self.finish_time_rev = {str(d):str(k) for k,d in self.finish_time.items()}
self.state = self.init_state()
# print (self.finish_time_rev)
self.DFS_LOOP2()
def run_DFS_LOOP_ITERATIVE(self):
#print("step 1")
self.state = self.init_state()
self.DFS_ITERATIVE_LOOP1()
#print ("step 2")
self.finish_time_rev = {str(d):str(k) for k,d in self.finish_time.items()}
self.state = self.init_state()
# print (self.finish_time_rev)
self.DFS_ITERATIVE_LOOP2()
In [ ]:
# Test Graph
test_graph = {
"1":["4"],
"2":["8"],
"3":["6"],
"4":["7"],
"5":["2"],
"6":["9"],
"7":["1"],
"9":["7", "3"],
"8":["6", "5"]
}
vertices = set(["1", "2", "3", "4", "5", "6", "7", "8", "9"])
#print (test_graph, rev_test_graph)
rev_test_graph = reverse_graph(test_graph)
g = dfs_obj(test_graph, rev_test_graph, vertices)
g.run_DFS_LOOP_ITERATIVE()
#print ("states",g.state)
#print ("finish times", g.finish_time)
print ("leaders", g.leaders)
In [44]:
def find_scc(graph, rev_graph, vertices):
g = dfs_obj(graph=graph, rev_graph=rev_graph, vertices=vertices)
#print (id(g), id(graph), id(rev_graph), id(vertices))
# g.DFS_LOOP()
g.run_DFS_LOOP_ITERATIVE()
#print ("states",g.state)
# print ("finish times", g.finish_time)
# print ("leaders", g.leaders)
count={}
for key, l in g.leaders.items():
count.setdefault(l, 0)
count[l] += 1
sorted_count_keys = sorted(count, key=lambda x: count[x], reverse=True)
res = [0, 0 ,0 ,0 ,0]
for i, k in enumerate(sorted_count_keys):
if i > len(res)-1:
break
res[i] = count[k]
return res
In [46]:
import os
def solution_match(out, sol):
for k,i in zip(out, sol):
if str(k) != str(i):
return False
return True
def load_file(url):
print (FILE)
fp = open(FILE, 'r')
edges = []
graph = {} # out_edges
vertices = set()
rev_graph = {} # in_edges
for row in fp.readlines():
u,v = row.strip().split(" ")
edges.append([u,v])
graph.setdefault(u, [])
graph[u].append(v)
rev_graph.setdefault(v, [])
rev_graph[v].append(u)
vertices.add(u)
vertices.add(v)
return graph, rev_graph, vertices
for f in os.listdir("test_cases"):
FILE = "test_cases/"+ f
graph, rev_graph, vertices = load_file(FILE)
scc = find_scc(graph, rev_graph, vertices)
# print ("scc", scc)
SOL_FILE = "test_cases_solution/sol_"+ f
solution = open(SOL_FILE, 'r').readlines()[0].split(",")
assert solution_match(scc,solution)==True, "Solution don't match! {} {}".format(scc, solution)
print ("All Tests PASS")
In [29]:
FILE = "test_cases/scc_test.txt"
fp = open(FILE, 'r')
edges = []
graph = {} # out_edges
vertices = set()
rev_graph = {} # in_edges
for row in fp.readlines():
u,v = row.strip().split(" ")
edges.append([u,v])
graph.setdefault(u, [])
graph[u].append(v)
rev_graph.setdefault(v, [])
rev_graph[v].append(u)
vertices.add(u)
vertices.add(v)
g = dfs_obj(graph=graph, rev_graph=rev_graph, vertices=vertices)
print (g)
# g.DFS_LOOP()
g.run_DFS_LOOP_ITERATIVE()
#print ("states",g.state)
#print ("finish times", g.finish_time)
print ("leaders", g.leaders)
count={}
for key, l in g.leaders.items():
count.setdefault(l, 0)
count[l] += 1
sorted_count_keys = sorted(count, key=lambda x: count[x], reverse=True)
res = [0, 0 ,0 ,0 ,0]
for i, k in enumerate(sorted_count_keys):
if i > len(res)-1:
break
res[i] = count[k]
print (res)
In [ ]:
In [16]:
In [ ]: