In [ ]:
# map: build a new RDD by applying a function to every element.
data = sc.parallelize(range(1, 11))


def pow2(i):
    """Return the square of ``i``."""
    return i * i


# NOTE: despite its name, ``double`` holds the squares of the input values.
double = data.map(pow2)  # or: data.map(lambda i: i * i)
# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print(double.collect())
In [ ]:
# filter: keep only the elements for which the predicate is true.
data = sc.parallelize(range(1, 11))


def _is_even(i):
    """Return True when ``i`` is divisible by 2."""
    return i % 2 == 0


even = data.filter(_is_even)
# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print(even.collect())
In [ ]:
# map vs. flatMap: both call the function on every element; the two printed
# results show how each transformation treats the list the function returns.
data = sc.parallelize(range(1, 11))


def gt_one_divisors(i):
    """Return the divisors ``j`` of ``i`` with ``2 <= j < i``.

    The result is an empty list when no such divisor exists (for example
    when ``i`` is smaller than 3).
    """
    return [j for j in range(2, i) if i % j == 0]


# ``%`` formatting yields the same "label value" text that the Python 2
# statement ``print 'label', value`` produced, and also runs on Python 3.
print('map: %s' % (data.map(gt_one_divisors).collect(),))
print('flatMap: %s' % (data.flatMap(gt_one_divisors).collect(),))
In [ ]:
# intersection: elements present in both RDDs.
two_multiples = sc.parallelize(range(0, 20, 2))    # 0, 2, 4, ..., 18
three_multiples = sc.parallelize(range(0, 20, 3))  # 0, 3, 6, ..., 18
common_multiples = two_multiples.intersection(three_multiples)
# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print(common_multiples.collect())
In [ ]:
# groupByKey: gather all values that share a key.
data = sc.parallelize([
    ('a', 1), ('b', 2), ('c', 3),
    ('a', 2), ('b', 5), ('a', 3),
])
# Each collected item is a (key, values) pair; ``values`` is wrapped in
# ``list`` so that its contents are shown when printed.
for key, values in data.groupByKey().collect():
    # The loop body must be indented; ``%`` formatting reproduces the
    # "key values" output of the Python 2 print statement on Python 3 too.
    print('%s %s' % (key, list(values)))
In [ ]:
# reduceByKey: combine the values of each key with a two-argument function.
data = sc.parallelize([
    ('a', 1), ('b', 2), ('c', 3),
    ('a', 2), ('b', 5), ('a', 3),
])


def summation(a, b):
    """Return ``a + b``; used to merge two values that share a key."""
    return a + b


# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print(data.reduceByKey(summation).collect())
In [ ]:
# sortByKey: order the (key, value) pairs by key.
data = sc.parallelize([
    ('a', 1), ('b', 2), ('c', 3),
    ('a', 2), ('b', 5), ('a', 3),
])


def summation(a, b):
    """Return ``a + b``.

    NOTE(review): this cell never calls ``summation``; it looks like a
    copy-paste leftover and is kept only so the name stays defined.
    """
    return a + b


# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print(data.sortByKey(ascending=True).collect())
Links:
In [ ]: