notebook.community

Edit and run



In [1]:

    
import graphframes as gf
# note on databricks: sc and sqlContext are preloaded



In [2]:

    
# Load the full adjacency table into a DataFrame and show its first row.
rawdat = sqlContext.sql("SELECT * FROM coursera_algos_mincut2")
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(rawdat.first())



In [3]:

    
def verticesList(row):
  """Build the vertex record for a single input row.

  Args:
    row: object exposing a ``vertex`` attribute.

  Returns:
    A one-element list holding the tuple ``(vertex, str(vertex))``. A list is
    returned (rather than the bare tuple) so the function can be used as a
    flatMap callback.
  """
  vertex_id = row.vertex
  label = str(vertex_id)
  return [(vertex_id, label)]

def adjacencyList(row, max_columns=39):
  """Turn one adjacency-list row into a list of normalized edges.

  The row is read through its ``vertex`` attribute and through the columns
  ``C1`` .. ``C<max_columns>`` (looked up by name with ``row[...]``). Columns
  whose value is ``None`` are skipped.

  Args:
    row: object exposing ``vertex`` and name-based item access for the
      ``C<i>`` neighbor columns.
    max_columns: highest neighbor column index to read. Defaults to 39,
      i.e. columns ``C1`` through ``C39``.

  Returns:
    A list of ``(smaller, larger)`` tuples, one per non-null neighbor, in
    column order. Duplicates are NOT removed here.
  """
  v1 = row.vertex
  neighbors = (row["C" + str(i)] for i in range(1, max_columns + 1))
  # Order the endpoints so that both directions of an undirected edge map to
  # the same tuple; this lets a later distinct() collapse them.
  return [(v1, v2) if v1 < v2 else (v2, v1)
          for v2 in neighbors if v2 is not None]

# http://graphframes.github.io/
# http://cdn2.hubspot.net/hubfs/438089/notebooks/help/Setup_graphframes_package.html
# Go through .rdd explicitly: DataFrame.flatMap was only a shorthand for
# DataFrame.rdd.flatMap and is no longer available on DataFrames in Spark 2.x.
vertices = rawdat.rdd.flatMap(verticesList)
edges = rawdat.rdd.flatMap(adjacencyList)
# using distinct on edges because this is an undirected graph: adjacencyList
# emits every edge as (smaller, larger), so repeated edges collapse to one row
G = gf.GraphFrame(vertices.toDF(["id", "name"]), edges.distinct().toDF(["src", "dst"]))
# NOTE(review): collect() pulls every edge to the driver; fine for a small
# graph, consider G.edges.show() if the table grows.
print(G.edges.collect())



In [4]:

    
# Show every stored edge whose src endpoint is vertex 1.
print(G.edges.where("src = 1").collect())



In [5]:

    
# Sanity check: edge count followed by vertex count, separated by a space.
# An explicit format string prints the same text on Python 2 and Python 3
# (a bare print(a, b) would print a tuple on Python 2).
print("%d %d" % (G.edges.count(), G.vertices.count()))



In [6]: