In [ ]:
import org.apache.spark.SparkConf

val conf = new SparkConf().setAppName("Variant discovery")
val nodeTotalCores = 8 // Physical cores available on each worker node
val numExecutors = 1   // Executors to run per node
val executorCores = nodeTotalCores / numExecutors // How many cores to use per executor
val hyperThreadingAvailable = false
// With hyper-threading enabled, each core can run two hardware threads
val executorThreads = if (hyperThreadingAvailable) executorCores * 2 else executorCores
val executorMem = "24G" // Must be enough to fit the index files in memory
conf.set("spark.executor.memory", executorMem)
conf.set("spark.executor.cores", executorCores.toString)
conf.set("spark.mesos.executor.memoryOverhead", "3G")
conf.set("spark.yarn.executor.memoryOverhead", "3G")
val driverCores = 4
val driverMem = "24G"
conf.set("spark.driver.memory", driverMem)
conf.set("spark.driver.cores", driverCores.toString)
// Ask the Toree kernel to create the SparkContext, and bind it to `sc`
// so that later cells can refer to it
val sc = kernel.createSparkContext(conf)
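
Once the context is up, it can be worth confirming that the settings actually took effect. The optional check below uses only the standard Spark API and the `sc` handle created above.

In [ ]:
// Optional sanity check: read the resource settings back from the live context
println("executor memory:     " + sc.getConf.get("spark.executor.memory"))
println("executor cores:      " + sc.getConf.get("spark.executor.cores"))
println("driver memory:       " + sc.getConf.get("spark.driver.memory"))
println("default parallelism: " + sc.defaultParallelism)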
In [ ]:
// Fetch the SparkCaller JAR and add it to the kernel's classpath
// (-f forces a fresh download even if the JAR is already cached)
%AddJar http://f.128.no/sparkcaller-1.0.jar -f
In [ ]:
import com.github.sparkcaller.Utils

val pathToReference = "/data/hg19/ucsc.hg19.fasta" // Reference genome (hg19)
val samInputFolder = "/data/gcat_set_025_out/"     // Folder containing the input SAM files
val outputFolder = "/data/sparkcaller_out"         // Where the pipeline writes its results
val pathToConfig = "/data/sparkcaller_confs/sparkcaller.properties"
val knownSites = "/data/dbsnp/dbsnp_138.hg19.vcf"  // Known variant sites (dbSNP build 138)
val toolsExtraArguments = Utils.loadConfigFile(pathToConfig)
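
The format expected by `Utils.loadConfigFile` is defined by the SparkCaller library itself, but judging by the `.properties` extension it is presumably a standard Java properties file. Purely as an illustration (this is not the SparkCaller API), such a file can be inspected with plain Java I/O:

In [ ]:
// Illustration only: dump the key=value pairs in the properties file.
// The actual parsing for the pipeline is done by Utils.loadConfigFile above.
import java.io.FileInputStream
import java.util.Properties
import scala.collection.JavaConverters._

val props = new Properties()
val in = new FileInputStream(pathToConfig)
try props.load(in) finally in.close()
for (key <- props.stringPropertyNames.asScala.toSeq.sorted)
  println(s"$key = ${props.getProperty(key)}")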
In [ ]:
import com.github.sparkcaller.SparkCaller

val sparkCaller = new SparkCaller(sc, pathToReference, knownSites, toolsExtraArguments,
                                  executorThreads.toString, outputFolder)
In [ ]:
// Stage 1: preprocess every SAM file found in the input folder
val preprocessedSAMFiles = sparkCaller.preprocessSAMFiles(samInputFolder)
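
As a quick check that the preprocessing stage ran, the contents of the output folder can be listed. This assumes `outputFolder` is a path visible from the driver, which its local-looking `/data/` prefix suggests.

In [ ]:
// List whatever the preprocessing stage wrote to the output folder.
// listFiles returns null for a missing directory, hence the Option wrapper.
import java.io.File
val written = Option(new File(outputFolder).listFiles).getOrElse(Array.empty[File])
written.map(_.getName).sorted.foreach(println)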
In [ ]:
// Stage 2: run variant discovery on the preprocessed files
val outputVariants = sparkCaller.discoverVariants(preprocessedSAMFiles)
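
Assuming the discovered variants end up in a VCF file under `outputFolder`, a rough record count can be obtained by skipping the #-prefixed header lines. The file name below is hypothetical; substitute whatever `discoverVariants` actually produces.

In [ ]:
// Rough variant count: in a VCF, data records are the lines not starting with '#'.
// NOTE: "output.vcf" is a hypothetical name, not something guaranteed by SparkCaller.
import scala.io.Source
val vcf = Source.fromFile(s"$outputFolder/output.vcf")
try {
  val numVariants = vcf.getLines.count(line => !line.startsWith("#"))
  println(s"Variant records: $numVariants")
} finally vcf.close()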