In [3]:
# Build the input RDD from the integers 1 through 10 via `sc`
# (presumably the notebook's SparkContext, defined outside this cell).
data = sc.parallelize(range(1, 11))
def duplicar(x):
    """Return twice the value of ``x``.

    "duplicar" is Portuguese for "to double", so the function multiplies
    by 2 rather than squaring its argument.
    """
    return x * 2
# `data` is an RDD: apply duplicar to every element, then print the
# collected results.
res = data.map(duplicar)
print(res.collect())
In [4]:
# Keep only the odd integers from 1 through 10 and print them.
data = sc.parallelize(range(1, 11))
res = data.filter(lambda n: n % 2 == 1)
print(res.collect())
In [5]:
# Two sample text lines used as input for the map / flatMap comparison.
data = sc.parallelize(["Linha 1", "Linha 2"])
def partir(l):
    """Split the line ``l`` on single space characters and return the pieces as a list."""
    pieces = l.split(" ")
    return pieces
# Same splitting function through both operators: map yields one list per
# input line, while flatMap flattens those lists into a single sequence.
print('map:', data.map(partir).collect())
print('flatMap:', data.flatMap(partir).collect())
In [7]:
# Multiples of 2 and multiples of 3 below 20; print the values present in both.
two_multiples = sc.parallelize(range(0, 20, 2))
three_multiples = sc.parallelize(range(0, 20, 3))
print(two_multiples.intersection(three_multiples).collect())
In [9]:
# Sample (key, value) pairs with repeated keys.
data = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('a', 2), ('b', 5), ('a', 3)])
# Group the values by key, then print each key with its values as a list.
# The loop body must be indented; without it the cell is a syntax error.
for key, values in data.groupByKey().collect():
    print(key, list(values))
In [11]:
# Sample (key, value) pairs with repeated keys.
data = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('a', 2), ('b', 5), ('a', 3)])
# Combine the values that share a key by adding them together.
res = data.reduceByKey(lambda left, right: left + right)
print(res.collect())
In [36]:
# Sort the (key, value) pairs by key in descending order and print them.
data = sc.parallelize([
    ('a', 1), ('b', 2), ('c', 3),
    ('a', 2), ('b', 5), ('a', 3),
])
print(data.sortByKey(ascending=False).collect())
Links:
In [ ]: