Connect to Spark

In this NoteBook the reader finds the code to connect to Spark and create a SparkContext.

Dependencies


In [5]:
#Add all dependencies to PYTHON_PATH
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")

#Define environment variables
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-oracle"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[2] pyspark-shell"

#Load PySpark to connect to a Spark cluster
from pyspark import SparkConf, SparkContext

Create Spark Context


In [7]:
appName = "connect_to_spark"
masterURL="spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"
  
sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-7-7aed057f640f> in <module>()
      2 masterURL="spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"
      3 
----> 4 sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))

/usr/lib/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    113         """
    114         self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    116         try:
    117             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

/usr/lib/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
    257         with SparkContext._lock:
    258             if not SparkContext._gateway:
--> 259                 SparkContext._gateway = gateway or launch_gateway(conf)
    260                 SparkContext._jvm = SparkContext._gateway.jvm
    261 

/usr/lib/spark/python/pyspark/java_gateway.py in launch_gateway(conf)
     93                 callback_socket.close()
     94         if gateway_port is None:
---> 95             raise Exception("Java gateway process exited before sending the driver its port number")
     96 
     97         # In Windows, ensure the Java child processes do not linger after Python has exited.

Exception: Java gateway process exited before sending the driver its port number

In [6]:
appName = "connect_to_spark"
masterURL="spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

#A context needs to be created if it does not already exist
try:
    sc.stop()
except NameError:
    print("A  new Spark Context will be created.")
    
sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))


A  new Spark Context will be created.
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-6-b13b0d3eb03e> in <module>()
      8     print("A  new Spark Context will be created.")
      9 
---> 10 sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))

/usr/lib/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    113         """
    114         self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    116         try:
    117             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

/usr/lib/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
    257         with SparkContext._lock:
    258             if not SparkContext._gateway:
--> 259                 SparkContext._gateway = gateway or launch_gateway(conf)
    260                 SparkContext._jvm = SparkContext._gateway.jvm
    261 

/usr/lib/spark/python/pyspark/java_gateway.py in launch_gateway(conf)
     93                 callback_socket.close()
     94         if gateway_port is None:
---> 95             raise Exception("Java gateway process exited before sending the driver its port number")
     96 
     97         # In Windows, ensure the Java child processes do not linger after Python has exited.

Exception: Java gateway process exited before sending the driver its port number

Read Text file


In [ ]:
sc.re

Close Spark Context


In [ ]:
sc.stop()