In [ ]:
import org.apache.spark.SparkConf

// Build the Spark configuration for the variant-discovery pipeline.
val conf = new SparkConf()
conf.setAppName("Variant discovery")

val nodeTotalCores          = 8  // physical cores available on each worker node

val numExecutors            = 1
val executorCores           = nodeTotalCores / numExecutors // How many cores to use per executor
// Never reassigned, so `val` (was `var`). Equals executorCores because
// hyper-threading is unavailable on this cluster (see flag below).
val executorThreads         = executorCores
val hyperThreadingAvailable = false


val executorMem             = "24G"  // Must be enough to fit the index files in memory

conf.set("spark.executor.memory", executorMem)
conf.set("spark.executor.cores", executorCores.toString)
conf.set("spark.mesos.executor.memoryOverhead", "3G")
// NOTE(review): on older Spark releases spark.yarn.executor.memoryOverhead is
// parsed as a plain MiB integer (e.g. "3072"), not a size string — confirm
// "3G" is accepted by the Spark version on the target cluster.
conf.set("spark.yarn.executor.memoryOverhead", "3G")

val driverCores             = 4
val driverMem               = "24G"

conf.set("spark.driver.memory", driverMem)
conf.set("spark.driver.cores", driverCores.toString)

// `kernel` is provided by the Toree notebook kernel; this creates the
// SparkContext (bound to `sc` in later cells) from the conf built above.
kernel.createSparkContext(conf)

In [ ]:
// Load the SparkCaller library (-f forces a re-fetch even if cached; note the jar is downloaded over plain, unauthenticated HTTP)
%AddJar http://f.128.no/sparkcaller-1.0.jar -f

In [ ]:
import com.github.sparkcaller.Utils

// Input/output locations for the pipeline (paths on the cluster filesystem).
val pathToReference = "/data/hg19/ucsc.hg19.fasta"                      // reference genome (hg19)
val SAMinputFolder  = "/data/gcat_set_025_out/"                         // folder containing the input SAM files
val outputFolder    = "/data/sparkcaller_out"                           // where pipeline output is written
val pathToConfig    = "/data/sparkcaller_confs/sparkcaller.properties"  // extra per-tool arguments
val knownSites      = "/data/dbsnp/dbsnp_138.hg19.vcf"                  // known variant sites (dbSNP 138)

// Parse the .properties file into whatever argument structure SparkCaller
// expects (opaque here — defined by the SparkCaller library).
// Trailing semicolon removed: not idiomatic Scala.
val toolsExtraArguments = Utils.loadConfigFile(pathToConfig)

In [ ]:
// Import the class itself: a wildcard over `SparkCaller._` only brings the
// companion object's members into scope and does NOT bind the type name
// `SparkCaller`, which `new SparkCaller(...)` below requires. The wildcard is
// kept in case later cells rely on companion members.
import com.github.sparkcaller.SparkCaller
import com.github.sparkcaller.SparkCaller._

// `sc` is the SparkContext created earlier via kernel.createSparkContext(conf);
// executorCores is passed as a string (the SparkCaller API apparently takes it as text).
val sparkCaller = new SparkCaller(sc, pathToReference, knownSites, toolsExtraArguments, executorCores.toString, outputFolder)

In [ ]:
// Preprocess every SAM file found under SAMinputFolder. The exact steps are
// defined inside SparkCaller (presumably sorting/dedup/recalibration, given the
// knownSites input above — confirm against the SparkCaller documentation).
val preprocessedSAMFiles = sparkCaller.preprocessSAMFiles(SAMinputFolder)

In [ ]:
// Run variant discovery on the preprocessed files. Results presumably land
// under outputFolder (set when sparkCaller was constructed) — verify what
// `outputVariants` holds (paths vs. records) against the SparkCaller API.
val outputVariants = sparkCaller.discoverVariants(preprocessedSAMFiles)