zoon::workflow downloads ~170 MB of data when called. Best practice on a Spark cluster is to use data stored locally (accessible here via /data). Variables passed to zoon::workflow must be globally scoped, which is likely to cause issues when parallelizing runs. In this example the required variables are therefore set in the global environment (using <<- rather than <-), which may cause issues with the Spark workers.
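In practice this tends to work with spark.lapply because each Spark task runs in its own R worker process, so <<- writes into that worker's private global environment rather than one shared across tasks; and since the worker function reassigns every variable at the start of each call, even an executor that reuses an R process between tasks sees fresh values. A minimal sketch of this isolation (a hypothetical demo, not part of the workflow):
In [ ]:
# Hypothetical demo: each task assigns 'marker' with <<- inside its own R
# worker process, so parallel tasks do not overwrite one another's value.
demo <- function(x) {
  marker <<- x
  c(pid = Sys.getpid(), marker = marker)
}
spark.lapply(1:4, demo)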
In [ ]:
# Be careful loading the entire zoon library with the SparkR kernel: it masks some SparkR functions.
# library('zoon')
# Plot the land cover raster stack used as the local covariate
raster::plot(raster::stack('/data/zoon/data/LCM_raster.tif'))
In [ ]:
dir.create('/data/zoon/maps', showWarnings = FALSE)
dir.create('/data/zoon/rasters', showWarnings = FALSE)
In [ ]:
species_data <- list.files(path = '/data/zoon/data/species_data', full.names = TRUE)
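A quick sanity check on the inputs before dispatching work can save a failed cluster run; the commented peek below assumes the occurrence files are plain CSVs readable by read.csv:
In [ ]:
length(species_data)            # number of species occurrence files found
head(basename(species_data))    # first few file names
# head(read.csv(species_data[1]))  # uncomment to inspect one file's columns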
In [ ]:
zoonModel <- function(current_path){
  start.time <- Sys.time()
  library(zoon)  # the entire library is loaded in the worker process
  # Variables used by zoon::workflow have to be globally scoped
  LCM_raster <<- stack('/data/zoon/data/LCM_raster.tif')
  base_filename <<- gsub('\\.csv$', '', basename(current_path))
  species_path <<- current_path
  dataset_extent <<- c(-9.576638, 3.715762, 49.68241, 61.66173)
  w1 <- workflow(occurrence = LocalOccurrenceData(species_path),
                 covariate = Chain(LocalRaster(LCM_raster),
                                   Bioclim(extent = dataset_extent,
                                           layers = c(2, 4, 5, 12, 16, 17),
                                           resolution = 2.5)),
                 process = Chain(Background(1000), Crossvalidate),
                 model = LogisticRegression,
                 output = PrintMap(dir = '/data/zoon/maps', filename = base_filename))
  # Write the workflow's prediction raster to disk as a GeoTIFF
  writeRaster(Output(w1), format = 'GTiff',
              filename = file.path('/data/zoon/rasters', base_filename))
  end.time <- Sys.time()
  return(end.time - start.time)  # elapsed run time for this species
}
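Before scaling out, the function can be smoke-tested on a single file in the driver's R session; it writes real outputs to /data/zoon/maps and /data/zoon/rasters, so it is left commented out here:
In [ ]:
# Optional smoke test in the driver before dispatching to Spark:
# zoonModel(species_data[1])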
In [ ]:
# Run zoonModel on the first two species files only, to keep the demo short;
# pass species_data to process every file
out <- spark.lapply(species_data[1:2], zoonModel)
sparkR.session.stop()  # close the session once finished
In [ ]:
out  # per-file elapsed run times returned by the workers
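To confirm the workers wrote their outputs, the prediction rasters can be listed and plotted back in the driver (writeRaster appends a .tif extension when format = 'GTiff' and no extension is given):
In [ ]:
# List the GeoTIFFs written by the workers and plot the first one
pred_files <- list.files('/data/zoon/rasters', pattern = '\\.tif$', full.names = TRUE)
if (length(pred_files) > 0) raster::plot(raster::raster(pred_files[1]))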