In [1]:
import pandas as pd
# Google Cloud project ID handed to BigQuery for this query.
projectId = 'spark-151209'

# Select actor country-code pairs for events whose EventRootCode is '19',
# skipping rows where either actor has an empty country code.
# NOTE: LIMIT without ORDER BY does not guarantee which 100000 rows are
# returned, so the sample may differ between runs.
query = """SELECT Actor1CountryCode, Actor2CountryCode, EventRootCode FROM [gdelt-bq:full.events]
WHERE Actor1CountryCode != ""
AND Actor2CountryCode != ""
AND EventRootCode = '19'
LIMIT 100000"""

# The bracketed [project:dataset.table] reference above is BigQuery *legacy*
# SQL. Newer pandas-gbq releases default to standard SQL, which rejects that
# syntax, so the dialect is stated explicitly instead of relying on the default.
df = pd.read_gbq(query, project_id=projectId, dialect='legacy')
In [2]:
# Turn the pandas query result into a Spark DataFrame and print a preview.
# NOTE(review): neither `sc` nor `SQLContext` is defined or imported in the
# cells shown here; presumably the PySpark kernel/shell injects both -- confirm,
# otherwise this cell fails on a fresh kernel.
sqlctx = SQLContext(sc)
sdf = sqlctx.createDataFrame(df)
sdf.show()
In [3]:
import graphframes
In [4]:
# Vertices: every distinct value found in the first two columns of `sdf`
# (the two actor country codes selected by the query).
keys = sdf.rdd.flatMap(lambda row: [row[0], row[1]]).distinct()
keylist = keys.collect()
vertices = keys.map(lambda code: (code,)).toDF(["id"])

# Edges: one (src, dst) pair per source row, labelled "<src>:<dst>".
edge = sdf.rdd.map(
    lambda row: (row[0], row[1], ":".join((row[0], row[1])))
).toDF(["src", "dst", "relation"])
In [5]:
# Assemble a GraphFrame from the `vertices` and `edge` DataFrames and run
# PageRank on it with reset probability 0.15 and tolerance 0.01.
g = graphframes.GraphFrame(vertices, edge)
# `results` is read by the following cell via `results.vertices`.
results = g.pageRank(resetProbability=0.15, tol=0.01)
In [6]:
# Rank vertices by PageRank score, highest first.
# `DataFrame.show()` only prints and returns None, so the previous
# `text = ....show()` always bound `text` to None and threw the sorted frame
# away. Keep the ranked DataFrame under its own name and display it separately.
ranking = results.vertices.sort('pagerank', ascending=False).select("id", "pagerank")
ranking.show()
Out[6]:
In [ ]: