In [1]:
import graphframes as gf
# Note: on Databricks, `sc` and `sqlContext` are preloaded, so they are not created here.
In [2]:
# Load the whole min-cut input table through the preloaded sqlContext.
# Later cells read a `vertex` column plus neighbour columns C1..C39 from each row.
rawdat = sqlContext.sql("SELECT * FROM coursera_algos_mincut2")

# Sanity check: show one row. The parenthesised form prints the same text on
# Python 2 and is also valid Python 3 (matches the print(...) call used later).
print(rawdat.first())
In [3]:
def verticesList(row):
    """Return a one-element list holding the (id, name) pair for a row.

    The id is ``row.vertex`` unchanged; the name is its ``str()`` form.
    A list is returned (not a bare tuple) so the function can be used with
    ``flatMap``.
    """
    vertex_id = row.vertex
    vertex_name = str(vertex_id)
    return [(vertex_id, vertex_name)]
def adjacencyList(row, num_columns=39, prefix="C"):
    """Return the edges described by one adjacency-list row.

    The row's own vertex is ``row.vertex``; its neighbours are read from the
    columns ``<prefix>1`` .. ``<prefix><num_columns>`` (``C1`` .. ``C39`` by
    default, the range the notebook originally hard-coded). Columns whose
    value is ``None`` are skipped.

    Each edge is returned as a 2-tuple with the smaller endpoint first, so
    the same pair seen from either endpoint's row yields an identical tuple.

    Args:
        row: object exposing ``row.vertex`` and ``row[column_name]`` lookup.
            Every column in the scanned range is looked up with a plain
            ``row[name]``, so each must be present on the row.
        num_columns: number of neighbour columns to scan, starting at 1.
        prefix: text that precedes the 1-based index in each column name.

    Returns:
        list of ``(low, high)`` tuples, in column order.
    """
    source = row.vertex
    edges = []
    for index in range(1, num_columns + 1):
        neighbour = row[prefix + str(index)]
        if neighbour is None:
            # Unused neighbour slot for this vertex.
            continue
        # Normalise the pair ordering; when both endpoints are equal the
        # tuple is the same either way.
        if source < neighbour:
            edges.append((source, neighbour))
        else:
            edges.append((neighbour, source))
    return edges
# GraphFrames documentation and Databricks setup notes:
# http://graphframes.github.io/
# http://cdn2.hubspot.net/hubfs/438089/notebooks/help/Setup_graphframes_package.html

# Map over the underlying RDD explicitly: DataFrame.flatMap is only available
# in Spark 1.x, whereas DataFrame.rdd.flatMap works there and on later releases.
vertices = rawdat.rdd.flatMap(verticesList)
edges = rawdat.rdd.flatMap(adjacencyList)

# The graph is undirected and adjacencyList always puts the smaller endpoint
# first, so distinct() collapses repeated (src, dst) pairs into a single edge.
G = gf.GraphFrame(
    vertices.toDF(["id", "name"]),
    edges.distinct().toDF(["src", "dst"]),
)

# Parenthesised print: same output on Python 2, and valid on Python 3.
print(G.edges.collect())
In [4]:
# List every edge whose `src` column equals 1.
print(G.edges.where("src = 1").collect())
In [5]:
# Edge count followed by vertex count, separated by one space. Building the
# string first keeps the output identical to the old `print a, b` statement
# on Python 2 while also being valid Python 3.
print("{0} {1}".format(G.edges.count(), G.vertices.count()))
In [6]: