Plongeur

A topological data analysis library.

Core algorithm written in Scala, using Apache Spark.

Executed in a Jupyter notebook, using the Apache Toree kernel and declarative widgets.

Graphs rendered with Sigma/Linkurious, wrapped in a Polymer component.

Reactive machinery powered by Rx (RxScala).

Maven dependencies


In [1]:
%AddDeps org.apache.spark spark-mllib_2.10 1.6.2 --repository file:/Users/tmo/.m2/repository
%AddDeps org.scalanlp breeze_2.10 0.12 --transitive
%AddDeps org.scalanlp breeze-natives_2.10 0.12
%AddDeps org.scalanlp breeze-macros_2.10 0.12
%AddDeps com.github.karlhigley spark-neighbors_2.10 0.3.6-FORK --repository file:/Users/tmo/.m2/repository
%AddDeps io.reactivex rxscala_2.10 0.26.1 --transitive --repository file:/Users/tmo/.m2/repository 
%AddDeps com.softwaremill.quicklens quicklens_2.10 1.4.4 --repository file:/Users/tmo/.m2/repository
%AddDeps com.chuusai shapeless_2.10 2.3.0 --repository https://oss.sonatype.org/content/repositories/releases/ --repository file:/Users/tmo/.m2/repository
%AddDeps org.tmoerman plongeur-spark_2.10 0.3.42 --repository file:/Users/tmo/.m2/repository


Marking org.apache.spark:spark-mllib_2.10:1.6.2 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/apache/spark/spark-mllib_2.10/1.6.2/spark-mllib_2.10-1.6.2.jar
Marking org.scalanlp:breeze_2.10:0.12 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/spire-math/spire-macros_2.10/0.7.4/spire-macros_2.10-0.7.4.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/junit/junit/4.8.2/junit-4.8.2.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/com/github/rwl/jtransforms/2.4.0/jtransforms-2.4.0.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/net/sf/opencsv/opencsv/2.3/opencsv-2.3.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/com/github/fommil/netlib/core/1.1.2/core-1.1.2.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/scalanlp/breeze-macros_2.10/0.12/breeze-macros_2.10-0.12.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/net/sourceforge/f2j/arpack_combined_all/0.1/arpack_combined_all-0.1.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/com/chuusai/shapeless_2.10.4/2.0.0/shapeless_2.10.4-2.0.0.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/scalanlp/breeze_2.10/0.12/breeze_2.10-0.12.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/apache/commons/commons-math3/3.2/commons-math3-3.2.jar
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/spire-math/spire_2.10/0.7.4/spire_2.10-0.7.4.jar
Marking org.scalanlp:breeze-natives_2.10:0.12 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/scalanlp/breeze-natives_2.10/0.12/breeze-natives_2.10-0.12.jar
Marking org.scalanlp:breeze-macros_2.10:0.12 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/org/scalanlp/breeze-macros_2.10/0.12/breeze-macros_2.10-0.12.jar
Marking com.github.karlhigley:spark-neighbors_2.10:0.3.6-FORK for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /Users/tmo/.m2/repository/com/github/karlhigley/spark-neighbors_2.10/0.3.6-FORK/spark-neighbors_2.10-0.3.6-FORK.jar
Marking io.reactivex:rxscala_2.10:0.26.1 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /Users/tmo/.m2/repository/io/reactivex/rxscala_2.10/0.26.1/rxscala_2.10-0.26.1.jar
-> New file at /Users/tmo/.m2/repository/io/reactivex/rxjava/1.1.1/rxjava-1.1.1.jar
Marking com.softwaremill.quicklens:quicklens_2.10:1.4.4 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/repo1.maven.org/maven2/com/softwaremill/quicklens/quicklens_2.10/1.4.4/quicklens_2.10-1.4.4.jar
Marking com.chuusai:shapeless_2.10:2.3.0 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> https://oss.sonatype.org/content/repositories/releases/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/https/oss.sonatype.org/content/repositories/releases/com/chuusai/shapeless_2.10/2.3.0/shapeless_2.10-2.3.0.jar
Marking org.tmoerman:plongeur-spark_2.10:0.3.42 for download
Preparing to fetch from:
-> file:/var/folders/zz/zyxvpxvq6csfxvn_n0000000000000/T/toree_add_deps8739186157532571424/
-> file:/Users/tmo/.m2/repository
-> https://repo1.maven.org/maven2
-> New file at /Users/tmo/.m2/repository/org/tmoerman/plongeur-spark_2.10/0.3.42/plongeur-spark_2.10-0.3.42.jar

Import classes


In [3]:
import org.apache.commons.lang.StringUtils.trim
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors.dense
import org.apache.spark.rdd.RDD
import org.joda.time.DateTime
import org.tmoerman.plongeur.tda.Distances.LpNormDistance
import org.tmoerman.plongeur.tda.LSH.LSHParams
import org.tmoerman.plongeur.tda.Model.{DataPoint, TDAContext, dp}
import org.tmoerman.plongeur.tda.knn.FastKNN_BAK.FastKNNParams
import org.tmoerman.plongeur.tda.knn.SampledKNN.SampledKNNParams
import org.tmoerman.plongeur.tda.knn.{FastKNN, SampledKNN, _}
import org.tmoerman.plongeur.util.RDDFunctions._
import org.tmoerman.plongeur.util.TimeUtils.time
import breeze.stats.distributions._

Read the L1000 file


In [4]:
/**
  * Loads a comma-separated text file through Spark and turns every line into a DataPoint.
  *
  * Each line is split on "," and every field is trimmed. The first field is stored in the
  * point's metadata under the key "pertID"; all remaining fields are parsed as doubles and
  * become the point's dense feature vector.
  *
  * @param file path of the text file, passed to `sc.textFile`
  * @param sc   Spark context used to read the file
  * @return whatever `parseWithIndex` yields: an array of strings (presumably the header
  *         columns -- confirm against RDDFunctions) together with the RDD of data points
  */
def read(file: String)(implicit sc: SparkContext): (Array[String], RDD[DataPoint]) = {

    // Builds one data point from the trimmed fields of a line and the index supplied by parseWithIndex.
    def toDataPoint(idx: Long, fields: Array[String]) = {
      val features = dense(fields.tail.map(_.toDouble))

      dp(idx, features, Map("pertID" -> fields.head))
    }

    val rows = sc.textFile(file).map(_.split(",").map(trim))

    rows.parseWithIndex(toDataPoint)
}

In [5]:
// Relative path of the directory that holds the L1000 resources.
val wd = "../../src/test/resources/l1000/"

// Path of the gene expression signatures CSV inside that directory.
val geneXPSignatures = s"${wd}LINCS_Gene_Expression_signatures_CD.csv"

In [6]:
// Keep only the RDD of data points; the first element of the pair returned by read is not used.
val (_, perts) = read(geneXPSignatures)(sc)

// Number of data points, as an Int.
perts.count.toInt


Out[6]:
20339

Set up the benchmarking code


In [8]:
/**
  * Runs one FastKNN benchmark and returns its settings and measurements as a tuple.
  *
  * @param ctx           context handed to FastKNN
  * @param pctTotal      copied unchanged into the result tuple
  * @param sample        copied unchanged into the result tuple
  * @param fastKNNParams FastKNN settings; k, blockSize, nrHashTables and lshParams are reported
  * @param baseLine      kNN result that relativeAccuracy compares the FastKNN result against
  * @return (distance, k, radius or "N/A", signatureLength, nrHashTables, blockSize,
  *         pctTotal, wall time in seconds, accuracy, sample)
  */
def doStuff(ctx: TDAContext, pctTotal: Double, sample: Either[Int, Double], fastKNNParams: FastKNNParams, baseLine: KNN_RDD) = {

    // NOTE(review): the accuracy computation sits inside the timed block, so wallTime
    // presumably covers it as well as the FastKNN run itself -- confirm this is intended.
    val ((_, accuracy), wallTime) = time {
      val rdd = FastKNN(ctx, fastKNNParams).cache

      (rdd, relativeAccuracy(rdd, baseLine))
    }

    val lshParams = fastKNNParams.lshParams

    val k = fastKNNParams.k
    val B = fastKNNParams.blockSize
    val L = fastKNNParams.nrHashTables

    val distance = lshParams.distance
    val radius = lshParams.radius
    val sigLength = lshParams.signatureLength

    // A missing radius is reported as the string "N/A".
    (distance, k, radius.getOrElse("N/A"), sigLength, L, B, pctTotal, wallTime.toSeconds, accuracy, sample)
}

Attempt at kNN(10) on the entire data set


In [ ]:
// kNN / LSH settings for this run.
val k = 10
// FastKNN block size.
val B = 50
// LSH signature length.
val sig = 10
// Number of hash tables.
val L = 100
val distance = LpNormDistance(0.5)
val r = Some(10.0)

val lshParams = LSHParams(signatureLength = sig, radius = r, distance = distance)
val params = FastKNNParams(k = k, blockSize = B, nrHashTables = L, lshParams = lshParams)

// With pctTotal at 1.0 the complete data set is used; a smaller value takes a sample of it.
val pctTotal = 1.0
val sampleSize = Right(0.005)
val ctx = TDAContext(sc, if (pctTotal >= 1.0) perts else perts.sample(withReplacement = false, fraction = pctTotal))

// Baseline kNN result from SampledKNN, cached.
val sampledKNNParams = SampledKNNParams(k = k, sampleSize = sampleSize, distance = distance)
val baseLine = SampledKNN(ctx, sampledKNNParams).cache

// nrHashTables is set to the same value as L above, so p currently equals params.
val p = params.copy(nrHashTables = 100)

doStuff(ctx, pctTotal, sampleSize, p, baseLine)

In [ ]: