Run Spark in YARN client mode

YARN (Apache Hadoop 3.1.2)

We run yarn by the following instruction https://hadoop.apache.org/docs/r3.1.2/hadoop-project-dist/hadoop-common/SingleCluster.html

Additionally we have to add the following property to etc/hadoop/yarn-site.xml

<property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>5</value>
</property>

or

<property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
</property>

<property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
</property>

dependencies


In [ ]:
%%classpath add mvn
org.apache.spark spark-repl_2.12 2.4.4
org.apache.spark spark-yarn_2.12 2.4.4

yarn with beakerx support


In [ ]:
%%spark --yarn
SparkSession.builder()
.master("yarn")
.appName("SparkYarnBeakerxSupport")

In [ ]:
import scala.math.random
val NUM_SAMPLES = 10000000

val count2 = spark.sparkContext.parallelize(1 to NUM_SAMPLES).map{i =>
  val x = random
  val y = random
  if (x*x + y*y < 1) 1 else 0
}.reduce(_ + _)

println("Pi is roughly " + 4.0 * count2 / NUM_SAMPLES)

yarn without beakerx support

additional configuration:

  • spark.yarn.jars
  • spark.repl.class.outputDir

In [ ]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("SparkYarnNative")
.master("yarn")
.config("spark.submit.deployMode", "client")
.config("spark.yarn.jars", KernelInfo.mvnRepoPath()+"/*")
.config("spark.repl.class.outputDir",  KernelInfo.outputDir())
.getOrCreate()

In [ ]:
import scala.math.random
val NUM_SAMPLES = 10000000

val count2 = spark.sparkContext.parallelize(1 to NUM_SAMPLES).map{i =>
  val x = random
  val y = random
  if (x*x + y*y < 1) 1 else 0
}.reduce(_ + _)

println("Pi is roughly " + 4.0 * count2 / NUM_SAMPLES)

In [ ]:
spark.stop()

In [ ]: