In [ ]:
import os

# You may need to Reconnect (more than Restart) the Kernel to pick up changes to these settings.
master = '--master spark://spark-master-2-1-0:7077'
conf = '--conf spark.cores.max=2 --conf spark.executor.memory=2g'
packages = '--packages com.databricks:spark-xml_2.11:0.4.1'
jars = ''
py_files = ''
# Assemble the submit arguments; empty components (jars and py_files here) are skipped.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(
    arg for arg in [master, conf, packages, jars, py_files, 'pyspark-shell'] if arg
)
print(os.environ['PYSPARK_SUBMIT_ARGS'])
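For reference, a minimal alternative sketch (an assumption about an equivalent setup, not the approach this notebook takes): the same master, core, memory, and package settings can also be passed through the SparkSession builder, provided no SparkSession has been created in this kernel yet.

In [ ]:
# Alternative sketch: the same settings via the builder instead of PYSPARK_SUBMIT_ARGS.
# Only takes effect if this is the first SparkSession created in the kernel.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('spark://spark-master-2-1-0:7077') \
    .config('spark.cores.max', '2') \
    .config('spark.executor.memory', '2g') \
    .config('spark.jars.packages', 'com.databricks:spark-xml_2.11:0.4.1') \
    .getOrCreate()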
In [ ]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# Read books.xml, treating each <book> element as one row.
df = spark.read.format('com.databricks.spark.xml').options(rowTag='book').load('books.xml')

# Write the author and _id columns back out as XML, wrapping the <book> rows in a <books> root.
df.select("author", "_id").write \
    .format('com.databricks.spark.xml') \
    .options(rowTag='book', rootTag='books') \
    .save('newbooks.xml')
# Note: An "Input path does not exist" error here can be ignored; it means Spark successfully
# resolved the spark-xml package specified above and only the sample books.xml file is missing.
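As a quick check, the cell below is a small verification sketch (assuming the cells above ran and newbooks.xml was actually written): it reads back the setting that --packages populates on the SparkContext and round-trips the XML that was just saved.

In [ ]:
# Verification sketch (assumes books.xml existed and newbooks.xml was written above).
# The --packages flag is surfaced as spark.jars.packages in the SparkContext configuration.
print(spark.sparkContext.getConf().get('spark.jars.packages', 'not set'))

# Read the freshly written XML back in and inspect it.
roundtrip = spark.read.format('com.databricks.spark.xml') \
    .options(rowTag='book') \
    .load('newbooks.xml')
roundtrip.printSchema()
roundtrip.show(5, truncate=False)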