In [ ]:
# You may need to Reconnect (more than Restart) the Kernel to pick up changes to these settings
import os

master = '--master spark://spark-master-2-1-0:7077'
conf = '--conf spark.cores.max=2 --conf spark.executor.memory=2g'
packages = '--packages com.databricks:spark-xml_2.11:0.4.1'
jars = ''
py_files = ''

os.environ['PYSPARK_SUBMIT_ARGS'] = master \
  + ' ' + conf \
  + ' ' + packages \
  + ' ' + jars \
  + ' ' + py_files \
  + ' ' + 'pyspark-shell'

print(os.environ['PYSPARK_SUBMIT_ARGS'])
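
In [ ]:
# Optional alternative (a sketch using the variables defined above): join only
# the non-empty pieces so empty settings such as jars and py_files don't leave
# stray double spaces in PYSPARK_SUBMIT_ARGS.
parts = [master, conf, packages, jars, py_files, 'pyspark-shell']
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(p for p in parts if p)

print(os.environ['PYSPARK_SUBMIT_ARGS'])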

In [ ]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.read.format('com.databricks.spark.xml').options(rowTag='book').load('books.xml')
df.select("author", "_id").write \
    .format('com.databricks.spark.xml') \
    .options(rowTag='book', rootTag='books') \
    .save('newbooks.xml')
    
# Note: an "Input path does not exist" error here just means the sample books.xml file is missing;
# it also confirms that Spark successfully resolved the spark-xml package specified above.
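
In [ ]:
# Optional check (a sketch, assuming books.xml existed and the write above succeeded):
# read the freshly written newbooks.xml back with the same spark-xml reader and inspect it.
check_df = spark.read.format('com.databricks.spark.xml') \
    .options(rowTag='book') \
    .load('newbooks.xml')
check_df.printSchema()
check_df.show(5, truncate=False)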
