In [7]:
# You may need to Reconnect (more than Restart) the Kernel to pick up changes to these values
import os  # required here: `os` is not imported anywhere earlier in this notebook

# spark-submit fragments assembled into PYSPARK_SUBMIT_ARGS so the in-process
# PySpark driver picks them up when the SparkSession is created later.
master = '--master spark://spark-master-2-1-0:7077'
conf = '--conf spark.cores.max=2 --conf spark.executor.memory=2g --conf spark.cassandra.connection.host=cassandra'
packages = '--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.1,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3,com.databricks:spark-avro_2.11:3.0.1,com.databricks:spark-xml_2.11:0.4.1'
jars = '--jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar'
py_files = '--py-files /root/lib/jpmml.py'

# Join all fragments with single spaces; the trailing 'pyspark-shell' token is
# mandatory for PYSPARK_SUBMIT_ARGS to be honored by the pyspark launcher.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(
    [master, conf, packages, jars, py_files, 'pyspark-shell']
)
print(os.environ['PYSPARK_SUBMIT_ARGS'])
In [8]:
# Insert your PySpark code here...
In [11]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active SparkSession if one exists, so a
# single call suffices. The original cell duplicated the import and the
# builder call verbatim (an out-of-order-editing artifact) — removed here.
# NOTE: this must run AFTER the PYSPARK_SUBMIT_ARGS cell above, otherwise the
# session is created without the master/packages/jars configuration.
spark = SparkSession.builder.getOrCreate()
In [12]:
# ...
In [ ]: