A topological data analysis library.
Core algorithm written in Scala, using Apache Spark.
Executed in a Jupyter notebook, using the Apache Toree kernel and declarative widgets.
Graphs rendered with Sigma/Linkurious, wrapped in a Polymer component.
In [1]:
%AddDeps org.apache.spark spark-mllib_2.10 1.6.2 --repository file:/Users/tmo/.m2/repository
%AddDeps org.scalanlp breeze_2.10 0.12 --transitive
%AddDeps org.scalanlp breeze-natives_2.10 0.12
%AddDeps org.scalanlp breeze-macros_2.10 0.12
%AddDeps com.github.karlhigley spark-neighbors_2.10 0.3.6-FORK --repository file:/Users/tmo/.m2/repository
%AddDeps io.reactivex rxscala_2.10 0.26.1 --transitive --repository file:/Users/tmo/.m2/repository
%AddDeps com.softwaremill.quicklens quicklens_2.10 1.4.4 --repository file:/Users/tmo/.m2/repository
%AddDeps com.chuusai shapeless_2.10 2.3.0 --repository https://oss.sonatype.org/content/repositories/releases/ --repository file:/Users/tmo/.m2/repository
%AddDeps org.tmoerman plongeur-spark_2.10 0.3.42 --repository file:/Users/tmo/.m2/repository
In [3]:
import org.apache.commons.lang.StringUtils.trim
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors.dense
import org.apache.spark.rdd.RDD
import org.joda.time.DateTime
import org.tmoerman.plongeur.tda.Distances.LpNormDistance
import org.tmoerman.plongeur.tda.LSH.LSHParams
import org.tmoerman.plongeur.tda.Model.{DataPoint, TDAContext, dp}
import org.tmoerman.plongeur.tda.knn.FastKNN_BAK.FastKNNParams
import org.tmoerman.plongeur.tda.knn.SampledKNN.SampledKNNParams
import org.tmoerman.plongeur.tda.knn.{FastKNN, SampledKNN, _}
import org.tmoerman.plongeur.util.RDDFunctions._
import org.tmoerman.plongeur.util.TimeUtils.time
import breeze.stats.distributions._
In [4]:
/**
  * Reads a CSV file of perturbation signatures into Spark.
  *
  * Each row is expected to start with a perturbation ID, followed by numeric
  * feature columns. Returns the header columns together with the parsed data
  * points (via the project's `parseWithIndex` helper).
  */
def read(file: String)(implicit sc: SparkContext): (Array[String], RDD[DataPoint]) = {
  // First column is the perturbation ID; the rest are numeric features.
  def toDataPoint(index: Long, cols: Array[String]) = {
    val features = cols.tail.map(_.toDouble)
    dp(index, dense(features), Map("pertID" -> cols.head))
  }

  val tokenized = sc.textFile(file).map(_.split(",").map(trim))

  tokenized.parseWithIndex(toDataPoint)
}
In [5]:
// Directory holding the L1000 test resources.
val wd = "../../src/test/resources/l1000/"
// CSV of LINCS gene-expression signatures.
val geneXPSignatures = s"${wd}LINCS_Gene_Expression_signatures_CD.csv"
In [6]:
// Load the signature data points; the header row is not needed here.
val (_, perts) = read(geneXPSignatures)(sc)

// Materialize the RDD and show the number of perturbations.
perts.count.toInt
Out[6]:
In [8]:
/**
  * Runs FastKNN with the given parameters, times the computation, and scores
  * the result against a (sampled) baseline kNN.
  *
  * @param ctx            the TDA context wrapping the Spark context and data.
  * @param pctTotal       fraction of the full data set used (bookkeeping only; passed through to the result).
  * @param sample         baseline sample spec (Left = absolute size, Right = fraction) — passed through to the result.
  * @param fastKNNParams  parameters for the approximate kNN computation.
  * @param baseLine       baseline kNN RDD to measure accuracy against.
  * @return a flat tuple of the parameter values plus wall time (seconds) and
  *         relative accuracy, convenient for tabulating benchmark runs.
  */
def doStuff(ctx: TDAContext, pctTotal: Double, sample: Either[Int, Double], fastKNNParams: FastKNNParams, baseLine: KNN_RDD) = {
  // Time the approximate kNN computation together with its accuracy evaluation.
  // NOTE: the original bound the cached RDD and an unused DateTime.now; both
  // were dead after this block and have been removed.
  val (accuracy, wallTime) = time {
    val rdd = FastKNN(ctx, fastKNNParams).cache
    relativeAccuracy(rdd, baseLine)
  }

  val lshParams = fastKNNParams.lshParams

  (lshParams.distance,
   fastKNNParams.k,
   lshParams.radius.getOrElse("N/A"),
   lshParams.signatureLength,
   fastKNNParams.nrHashTables,
   fastKNNParams.blockSize,
   pctTotal,
   wallTime.toSeconds,
   accuracy,
   sample)
}
In [ ]:
// Benchmark configuration: LSH-based approximate kNN vs. a sampled exact baseline.
val k = 10                          // number of nearest neighbours
val B = 50                          // FastKNN block size
val sig = 10                        // LSH signature length
val L = 100                         // number of LSH hash tables
val distance = LpNormDistance(0.5)  // fractional Lp norm
val r = Some(10.0)                  // LSH radius

val lshParams = LSHParams(signatureLength = sig, radius = r, distance = distance)
val params = FastKNNParams(k = k, blockSize = B, nrHashTables = L, lshParams = lshParams)

// Use the full data set; the exact baseline is computed on a 0.5% sample.
val (pctTotal, sampleSize) = (1.0, Right(0.005))

// Named arguments instead of a bare boolean literal for RDD.sample.
val ctx = TDAContext(sc, if (pctTotal < 1.0) perts.sample(withReplacement = false, fraction = pctTotal) else perts)

val sampledKNNParams = SampledKNNParams(k = k, sampleSize = sampleSize, distance = distance)
// Idiomatic apply-sugar instead of the explicit `.apply` call.
val baseLine = SampledKNN(ctx, sampledKNNParams).cache

val p = params.copy(nrHashTables = 100)
doStuff(ctx, pctTotal, sampleSize, p, baseLine)
In [ ]: