Package Install

$ sudo apt-get install python2.7-dev
$ sudo easy_install pip
$ pip install jupyter
$ pip install pandas
$ pip install google-api-python-client

Jupyter Spark environment.

pyspark --master local[2] --packages graphframes:graphframes:0.3.0-spark2.0-s_2.11

In [1]:
import pandas as pd

# Google Cloud project id passed to BigQuery for this query.
projectId = 'spark-151209'

# NOTE(review): the table reference inside [...] was empty in the original
# query, which BigQuery rejects. The selected columns match the public GDELT
# events schema, so 'gdelt-bq:full.events' is ASSUMED here -- confirm the
# intended table before running.
source_table = 'gdelt-bq:full.events'

# Pull (actor 1 country, actor 2 country, event root code) triples, keeping
# only rows where both country codes are non-empty and EventRootCode is '19'.
# LIMIT caps the download at 100000 rows.
query = """SELECT Actor1CountryCode, Actor2CountryCode, EventRootCode FROM [{table}]
WHERE Actor1CountryCode != ""
AND Actor2CountryCode != ""
AND EventRootCode = '19'
LIMIT 100000""".format(table=source_table)

# The square-bracket table syntax is BigQuery *legacy* SQL. Newer
# pandas / pandas-gbq releases default to standard SQL, so the dialect is
# requested explicitly to keep the query valid across versions.
df = pd.read_gbq(query, projectId, dialect='legacy')

In [2]:
# `sc` and `SQLContext` are expected to be provided by the pyspark shell
# session this notebook is launched from.
sqlctx = SQLContext(sc)

# Convert the pandas DataFrame returned by BigQuery into a Spark DataFrame.
sdf = sqlctx.createDataFrame(df)

# Print the first 20 rows; this is the call that produces the table shown
# below the cell ("only showing top 20 rows").
sdf.show()

|              USA|              USA|           19|
|              USA|              NZL|           19|
|              USA|              USA|           19|
|              CAN|              POL|           19|
|              USA|              USA|           19|
|              USA|              USA|           19|
|              ZAF|              ZAF|           19|
|              GBR|              USA|           19|
|              USA|              USA|           19|
|              USA|              USA|           19|
|              USA|              USA|           19|
|              BGD|              BGD|           19|
|              USA|              USA|           19|
|              NZL|              NZL|           19|
|              UKR|              POL|           19|
|              SAU|              ARE|           19|
|              SAU|              ARE|           19|
|              AFG|              IRN|           19|
|              AFG|              AFG|           19|
|              ITA|              USA|           19|
only showing top 20 rows

In [3]:
import graphframes

In [4]:
# Every distinct country code that appears as either actor (columns 0 and 1
# of each row) becomes one graph vertex.
keys = sdf.rdd.flatMap(lambda x: (x[0], x[1])).distinct()
# Driver-side copy of the distinct codes, kept under its original name.
keylist = keys.collect()

# GraphFrames requires a vertex DataFrame with an "id" column; wrap each code
# in a 1-tuple so toDF() yields a single-column frame.
vertices = keys.map(lambda x: (x,)).toDF(["id"])

# One edge per event row: src = actor 1 country, dst = actor 2 country, and
# a "SRC:DST" label stored in the "relation" column.
edge = sdf.rdd.map(
    lambda x: (x[0], x[1], x[0] + ":" + x[1])
).toDF(["src", "dst", "relation"])

In [5]:
# Assemble the graph from the vertex and edge DataFrames built above.
g = graphframes.GraphFrame(v=vertices, e=edge)

# Run PageRank until the tolerance is reached; the returned graph carries the
# scores that the next cell reads from `results.vertices`.
results = g.pageRank(
    tol=0.01,
    resetProbability=0.15,
)

In [6]:
# Rank vertices by PageRank score, highest first. The sorted DataFrame is kept
# in `ranked`; the original assigned the result of .show() to `text`, but
# .show() only prints and returns None, so that variable was always None.
ranked = (
    results.vertices
    .sort('pagerank', ascending=False)
    .select("id", "pagerank")
)
ranked.show()

| id|          pagerank|
|SYR| 8.286111156778835|
|GBR| 6.005561114899255|
|AFG| 5.162184088030115|
|RUS| 3.934112277405712|
|IRQ| 3.879846182250965|
|PAK| 3.021622028295468|
|AUS| 2.740062004184414|
|IRN| 2.332781842266423|
|NGA| 2.202476105742932|
|JPN| 2.175917593090732|
|SAU|  2.11733321954107|
only showing top 20 rows


