In [1]:
# Build an RDD holding the integers 1 through 10 (inclusive).
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext provided by the PySpark shell/kernel -- confirm.
rdd_1_10 = sc.parallelize(list(range(1, 11)))
In [2]:
# Show the first element of the RDD and how many elements it holds.
# Single-argument print(...) combined with str.format emits the same text on
# Python 2 and Python 3; the bare `print a, b` statement only parses on
# Python 2.
print('First: {}'.format(rdd_1_10.first()))
print('Have: {} Items'.format(rdd_1_10.count()))
In [3]:
# Double every element and collect the results so the cell displays them.
rdd_1_10.map(lambda n: 2 * n).collect()
Out[3]:
In [4]:
# Sum all elements by repeatedly adding pairs together.
rdd_1_10.reduce(lambda total, item: total + item)
Out[4]:
In [5]:
# reduceByKey: combine the values of (key, value) pairs that share a key.
# The combining function here is addition, so each key ends up paired with
# the sum of its values.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 1),
    ("a", 1),
])
rdd.reduceByKey(lambda left, right: left + right).collect()
Out[5]:
In [6]:
# Keep only the elements strictly greater than 5 and show them.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
rdd_over_5 = rdd_1_10.filter(lambda n: n > 5)
print(rdd_over_5.collect())

# Multiply the remaining elements together.
value = rdd_over_5.reduce(lambda acc, n: acc * n)
print(value)

# 30240 == 6 * 7 * 8 * 9 * 10, the product expected for an RDD of 1..10.
# The message below means "Congratulations, you got it right".
if value == 30240:
    print("恭喜你答對了")
In [ ]: