In [1]:
import spylon
import spylon.spark as sp
# Build a Spark configuration object; properties set on it are applied when the
# context is created later (see the c.sql_context(...) cell below).
c = sp.SparkConfiguration()
# NOTE(review): hardcoded absolute Spark home — assumes a local Spark 1.6.2
# install at this exact path; adjust per machine. TODO: make configurable.
c._spark_home = "/path/to/spark-1.6.2-bin-hadoop2.6"
# Run locally with 2 worker threads.
c.master = ["local[2]"]
In [2]:
# Tab completion for spark properties
# Tab completion for spark properties
# Set the shuffle partition count via spylon's attribute-style config access,
# then read it back on the next line (the bare expression is the cell's output).
c.conf.spark.sql.shuffle.partitions = 50
c.conf.spark.sql.shuffle.partitions
Out[2]:
In [3]:
# Create the SparkContext and SQLContext from the configuration built above.
# Side effect: starts a JVM-backed Spark application named "MyApplicationName".
(sc, sqlContext) = c.sql_context("MyApplicationName")
In [9]:
import time
In [10]:
def delayed(seconds):
    """Build an identity function that pauses before returning.

    Parameters
    ----------
    seconds : float
        How long the returned function sleeps on each call.

    Returns
    -------
    callable
        A one-argument function that sleeps ``seconds`` seconds and then
        returns its argument unchanged. Used below to make each Spark task
        take a visible amount of time.
    """
    def pause_then_return(value):
        time.sleep(seconds)
        return value
    return pause_then_return
In [11]:
# Baseline run WITHOUT the progress bar (compare with the identical cell after
# start_spark_progress_bar_thread below). 15 elements across 5 partitions; the
# 1-second delay per element makes the job slow enough to observe.
rdd = sc.parallelize(list(range(10)) + list(range(5)), 5).map(delayed(1))
# Word-count style aggregation: values 0-4 appear twice, 5-9 once.
reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
reduced.map(delayed(1)).collect()
Out[11]:
In [12]:
# Start spylon's background thread that polls the Spark status API every
# 0.1 s and renders a live progress bar for subsequent jobs on `sc`.
sp.start_spark_progress_bar_thread(sc, sleep_time=0.1)
In [13]:
# Same pipeline as the earlier cell, re-run deliberately: this time the
# progress-bar thread started above displays task progress while it executes.
rdd = sc.parallelize(list(range(10)) + list(range(5)), 5).map(delayed(1))
reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
reduced.map(delayed(1)).collect()
Out[13]: