In [1]:
from pyspark import SparkConf, SparkContext
# Run Spark with the "local" master under the application name "PopularHero".
conf = SparkConf().setMaster("local").setAppName("PopularHero")
# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel; calling the SparkContext constructor a
# second time would raise because only one context may be active at a time.
sc = SparkContext.getOrCreate(conf = conf)
In [2]:
def countCoOccurences(line):
    """Turn one whitespace-separated line into an ``(id, count)`` pair.

    The first token is parsed as the integer id; the count is the number of
    tokens that follow it on the same line.

    Raises:
        IndexError: if the line has no tokens at all.
        ValueError: if the first token is not an integer.
    """
    tokens = line.split()
    heroId = int(tokens[0])
    # Everything after the leading id counts as one co-occurrence.
    return (heroId, len(tokens) - 1)
def parseNames(line):
    """Parse one line of the form ``<id> "<name>"`` into ``(id, name)``.

    The line is split on double quotes: the text before the first quote is
    parsed as the integer id, and the text between the first pair of quotes
    is returned as the name.

    The name is returned as ``str``. Encoding it to UTF-8 here would yield a
    ``bytes`` object on Python 3, which then renders as ``b'...'`` when it is
    formatted into a message.

    Raises:
        IndexError: if the line contains no double quote.
        ValueError: if the text before the first quote is not an integer.
    """
    fields = line.split('"')
    return (int(fields[0]), fields[1])
In [13]:
# Load the graph file as an RDD with one string per line of text.
graphPath = "file:///opt/ipython/spark_code/example3/marvel-graph.txt"
lines = sc.textFile(graphPath)
# Preview the first five raw lines.
lines.take(5)
Out[13]:
In [14]:
# Load the names file as an RDD with one string per line of text.
namesPath = "file:///opt/ipython/spark_code/example3/marvel-names.txt"
names = sc.textFile(namesPath)
# parseNames converts each raw line into an (id, name) pair.
namesRdd = names.map(parseNames)
# Preview the first five parsed pairs.
namesRdd.take(5)
Out[14]:
In [17]:
# Map every line of the graph file to an (id, count) pair, where count is the
# number of tokens that follow the leading id on that line.
pairings = lines.map(countCoOccurences)
# Preview the first ten pairs.
pairings.take(10)
Out[17]:
In [18]:
# Add up the per-line counts that share the same id, giving one (id, total)
# pair per id.
# NOTE(review): presumably an id can appear on several lines of the graph
# file, which is why a reduce is needed — confirm against the data.
totalFriendsByCharacter = pairings.reduceByKey(lambda running, count : running + count)
# Preview the first ten totals.
totalFriendsByCharacter.take(10)
Out[18]:
In [19]:
# Swap each (id, total) pair into (total, id) so that tuple comparison orders
# the records by total first.
flipped = totalFriendsByCharacter.map(lambda idAndTotal : (idAndTotal[1], idAndTotal[0]))
# Preview the first ten swapped pairs.
flipped.take(10)
Out[19]:
In [10]:
# flipped holds (total, id) tuples, so max() picks the largest total; equal
# totals are ordered by the id that follows in the tuple.
mostPopular = flipped.max()
# Element 1 of the winning tuple is the id, which keys namesRdd.
topHeroId = mostPopular[1]
# lookup() returns the values stored under that key; take the first one.
mostPopularName = namesRdd.lookup(topHeroId)[0]
In [22]:
# mostPopular is a (total, id) tuple taken from `flipped`, so the id that the
# "(id,name)" label promises is element 1; element 0 is the co-occurrence total.
print( "(id,name) ( {},{} ) is the most popular superhero".format(mostPopular[1],mostPopularName))
In [ ]: