In [4]:
import os
# ensure PySpark workers use Python 3, not Python 2, when both are installed
os.environ['PYSPARK_PYTHON'] = '/opt/conda/bin/python3'

import pyspark
conf = pyspark.SparkConf()

# point the driver at the standalone cluster master
conf.setMaster("spark://spark-master:7077")
conf.setAppName('clustertest')

# create the SparkContext that submits jobs to the cluster
sc = pyspark.SparkContext(conf=conf)

# distribute the integers 0..99 across the cluster
rdd = sc.parallelize(range(100))
# approximate sum with a 3 ms timeout; the sum of range(100) is exactly 4950
rdd.sumApprox(3)


Out[4]:
4950.0
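
`sumApprox(timeout, confidence=0.95)` returns a possibly partial result within `timeout` milliseconds; an RDD this small finishes well inside the window, so the answer equals the exact sum. A quick sanity check with the exact actions, assuming the same `sc` is still running:

In [ ]:
# exact aggregations for comparison with the approximate result above
rdd.sum()    # 4950
rdd.count()  # 100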

In [ ]:
sc.stop()
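
Stopping the context releases the executors on the cluster. If the standalone master at spark://spark-master:7077 is not reachable, the same steps run in Spark's local mode; a minimal sketch (local[*] runs executors in-process on all cores; the sc_local name is illustrative):

In [ ]:
# local-mode fallback: no cluster required, uses all cores of this machine
sc_local = pyspark.SparkContext(master="local[*]", appName="localtest")
sc_local.parallelize(range(100)).sum()  # exact sum: 4950
sc_local.stop()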
