A topological data analysis library.
Core algorithm written in Scala, using Apache Spark.
Executed in a Jupyter notebook, using the Apache Toree kernel and declarative widgets.
Graphs rendered with Sigma/Linkurious, wrapped in a Polymer component.
In [1]:
%AddDeps org.apache.spark spark-mllib_2.10 1.6.2 --repository file:/Users/tmo/.m2/repository
%AddDeps org.scalanlp breeze-natives_2.10 0.12 --repository file:/Users/tmo/.m2/repository
%AddDeps com.github.haifengl smile-core 1.1.0 --transitive --repository file:/Users/tmo/.m2/repository
%AddDeps io.reactivex rxscala_2.10 0.26.1 --transitive --repository file:/Users/tmo/.m2/repository
%AddDeps com.softwaremill.quicklens quicklens_2.10 1.4.4 --repository file:/Users/tmo/.m2/repository
%AddDeps com.chuusai shapeless_2.10 2.3.0 --repository https://oss.sonatype.org/content/repositories/releases/ --repository file:/Users/tmo/.m2/repository
%AddDeps org.tmoerman plongeur-spark_2.10 0.3.37 --repository file:/Users/tmo/.m2/repository
In [2]:
%addjar http://localhost:8888/nbextensions/declarativewidgets/declarativewidgets.jar
In [3]:
import rx.lang.scala.{Observer, Subscription, Observable}
import rx.lang.scala.subjects.PublishSubject
import rx.lang.scala.subjects._
import shapeless.HNil
import org.tmoerman.plongeur.tda._
import org.tmoerman.plongeur.tda.Model._
import org.tmoerman.plongeur.tda.cluster.Clustering._
import org.tmoerman.plongeur.tda.cluster.Scale._
import org.tmoerman.plongeur.tda.Colour._
import org.tmoerman.plongeur.tda.Brewer._
import org.tmoerman.plongeur.ui.Controls._
import declarativewidgets._
initWidgets
import declarativewidgets.WidgetChannels.channel
In [4]:
import java.util.concurrent.atomic.AtomicReference
/**
 * Holder for at most one Rx `Subscription`.
 *
 * Storing a new subscription unsubscribes the one that was held before.
 *
 * @param ref atomic slot containing the currently held subscription, if any
 */
case class SubRef(ref: AtomicReference[Option[Subscription]] = new AtomicReference[Option[Subscription]](None)) extends Serializable {
  /** Stores `sub` (a `null` is stored as `None`) and unsubscribes the previously held subscription, if any. */
  def update(sub: Subscription): Unit = {
    val previous = ref.getAndSet(Option(sub))
    previous match {
      case Some(stale) => stale.unsubscribe()
      case None        => ()
    }
  }
  /** Empties the holder, unsubscribing the held subscription if there is one. */
  def reset(): Unit = update(null)
}
In [5]:
%%html
<link rel='import' href='urth_components/paper-slider/paper-slider.html' is='urth-core-import' package='PolymerElements/paper-slider'>
<link rel='import' href='urth_components/paper-button/paper-button.html' is='urth-core-import' package='PolymerElements/paper-button'>
<link rel='import' href='urth_components/paper-dropdown-menu/paper-dropdown-menu.html' is='urth-core-import' package='PolymerElements/paper-dropdown-menu'>
<link rel='import' href='urth_components/paper-listbox/paper-listbox.html' is='urth-core-import' package='PolymerElements/paper-listbox'>
<link rel='import' href='urth_components/paper-item/paper-item.html' is='urth-core-import' package='PolymerElements/paper-item'>
<link rel='import' href='urth_components/plongeur-graph/plongeur-graph.html' is='urth-core-import' package='tmoerman/plongeur-graph'>
<link rel='import' href='urth_components/urth-viz-scatter/urth-viz-scatter.html' is='urth-core-import'>
Out[5]:
Keep each Rx Subscription in its own SubRef holder, so that it can be replaced or unsubscribed independently of the others.
In [6]:
// Holder for the subscription that is attached to the in$ parameter stream.
val in$_subRef = SubRef()
Instantiate a PublishSubject. This stream of TDAParams instances represents the input of a TDAMachine. A subscriber on the subject writes each emitted TDAParams instance, rendered as a string, to the channel "ch_TDA_1" under the "params" key.
In [7]:
// Subject onto which TDAParams instances are pushed.
val in$ = PublishSubject[TDAParams]()
// Write every emitted TDAParams, as a string, to the "params" key of channel "ch_TDA_1".
in$_subRef.update(
  in$.subscribe(params => channel("ch_TDA_1").set("params", params.toString)))
In [8]:
import org.apache.spark.rdd.RDD
import org.apache.commons.lang.StringUtils.trim
import org.apache.spark.mllib.linalg.Vectors
/**
 * Reads a comma-separated MNIST file into an RDD of data points.
 *
 * Every line is split on "," and trimmed. The first column is kept as the "cat" metadata
 * entry; the remaining columns are parsed as integers and stored as a sparse vector that
 * only holds the non-zero entries. Points are numbered with the RDD's `zipWithIndex`.
 *
 * @param file location of the CSV file, handed to `sc.textFile`
 * @return one indexed data point per line
 * @throws IllegalArgumentException (when the job runs) for a line that yields no columns
 * @throws NumberFormatException    (when the job runs) for a feature that is not an integer
 */
def readMnist(file: String): RDD[DataPoint] =
  sc.
    textFile(file).
    map(line => {
      line.split(",").map(_.trim).toList match {
        case cat :: rawFeatures =>
          val nonZero =
            rawFeatures.
              map(_.toInt).
              zipWithIndex.
              collect { case (v, idx) if v != 0 => (idx, v.toDouble) }
          (cat, Vectors.sparse(rawFeatures.size, nonZero))
        case Nil =>
          // String.split drops trailing empty strings, so a line consisting solely of
          // commas yields no columns at all; fail with a readable message.
          throw new IllegalArgumentException(s"MNIST line without a label column: '$line'")
      }
    }).
    zipWithIndex.
    map { case ((cat, features), idx) => IndexedDataPoint(idx.toInt, features, Some(Map("cat" -> cat))) }
In [9]:
// Directory with the MNIST test resources, and the training CSV inside it.
val mnist_path = "/Users/tmo/Work/batiskav/projects/plongeur/scala/plongeur-spark/src/test/resources/mnist/"
val mnist_train = s"${mnist_path}mnist_train.csv"
In [10]:
// Data points read from the MNIST training CSV (mnist_train).
val mnistRDD = readMnist(mnist_train)
In [11]:
// Cached sample of the MNIST points: fraction 0.05, without replacement, fixed seed 0.
val mnistSample5pctRDD = mnistRDD.sample(withReplacement = false, fraction = 0.05, seed = 0L).cache()
In [12]:
// Number of data points in the 5% sample.
mnistSample5pctRDD.count
Out[12]:
In [13]:
// TDA context built from the Spark context and the 5% MNIST sample; passed to TDAMachine.run.
val ctx = TDAContext(sc, mnistSample5pctRDD)
Turn a TDAResult into a nested Map with "nodes" and "edges" entries, the form in which it is set on the channel for the graph component.
In [14]:
// Random source for the x/y coordinates assigned to the nodes in format.
val r = scala.util.Random
/**
 * Converts a TDAResult into a Map with two entries:
 * "nodes" holds one Map per cluster (id, size, color, random x and y in [0, 100)),
 * "edges" holds one Map per edge (id, source, target).
 *
 * A cluster without colours gets the color "#999". Every edge must contain exactly
 * two elements, otherwise a MatchError is thrown.
 *
 * @param result the TDA result to convert
 * @return the nested Map described above
 */
def format(result: TDAResult) = {
  val nodes = result.clusters.map { cluster =>
    Map(
      "id"    -> cluster.id.toString,
      "size"  -> cluster.dataPoints.size,
      "color" -> cluster.colours.headOption.getOrElse("#999"),
      "x"     -> r.nextInt(100),
      "y"     -> r.nextInt(100))
  }
  val edges = result.edges.map { edge =>
    val Array(from, to) = edge.toArray
    Map(
      "id"     -> s"$from--$to",
      "source" -> from.toString,
      "target" -> to.toString)
  }
  Map("nodes" -> nodes, "edges" -> edges)
}
Run the machine, obtaining an Observable of (TDAParams, TDAResult) pairs.
In [15]:
// Holder for the subscription that is attached to the out$ result stream.
val out$_subRef = SubRef()
In [16]:
// Stream of (params, result) pairs returned by the TDA machine for the context and the in$ stream.
// NOTE(review): presumably one pair per TDAParams emitted on in$ -- confirm against TDAMachine.run.
val out$: Observable[(TDAParams, TDAResult)] = TDAMachine.run(ctx, in$)
In [26]:
// Set every formatted TDA result on the "result" key of channel "ch_TDA_1" and report
// stream errors on stdout. The subscription is kept in out$_subRef, which unsubscribes
// the one it held before.
out$_subRef.update(
  out$.subscribe(
    onNext = (t) => t match {case (_, result) => channel("ch_TDA_1").set("result", format(result))},
    // One interpolated string: println(a, b) would be adapted to println((a, b)) and print a tuple.
    onError = (e) => println(s"Error in TDA machine: $e")))
In [18]:
// Holder for the subscription returned by makeControls (only used in the commented-out controls cell).
val pipe$_subRef = SubRef()
In [19]:
// Emit controlsCSS through the kernel's html magic.
kernel.magics.html(controlsCSS)
Out[19]:
In [67]:
// Predicate on the "cat" attribute with value "0"; used by the colouring of BASE.
val cat = new AttributePredicate("cat", "0")
// Baseline TDA parameters: a lens of two PCA filters (components 0 and 1), default
// clustering parameters, firstGap(20) scale selection, no collapsing of duplicate
// clusters, and a colouring based on the reversed 9-colour "PuOr" Brewer palette.
val BASE = {
  val pcaLens = TDALens(
    Filter("PCA" :: 0 :: HNil, 30, 0.5),
    Filter("PCA" :: 1 :: HNil, 30, 0.5))
  val reversedPuOr9 = Brewer.palettes("PuOr").get(9).map(_.reverse)
  TDAParams(
    lens = pcaLens,
    clusteringParams = ClusteringParams(),
    scaleSelection = firstGap(20),
    collapseDuplicateClusters = false,
    colouring = Colouring(reversedPuOr9, LocalPercentage(9, cat)))
}
// Push the baseline parameters onto the input stream.
in$.onNext(BASE)
In [21]:
/*
val (sub, html) = BASE.makeControls(channel("ch_TDA_1"), in$)
pipe$_subRef.update(sub)
kernel.magics.html(html)
*/
Out[21]:
In [52]:
%%html
<!-- Bind the "result" value of channel ch_TDA_1 to the data attribute of the plongeur-graph component. -->
<template is='urth-core-bind' channel='ch_TDA_1'>
<plongeur-graph height="1200" data="{{result}}"></plongeur-graph>
</template>
Out[52]:
In [23]:
%%html
<!-- Display the "params" value of channel ch_TDA_1 (the current TDAParams as a string). -->
<template is='urth-core-bind' channel='ch_TDA_1'>
<div style='background: #FFB; padding: 10px;'>
<span style='font-family: "Courier"'>[[params]]</span>
</div>
</template>
Out[23]:
In [ ]:
In [ ]: