In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._
import org.apache.spark.sql.functions._

// Cluster baseball players with k-means on three fielding-rate columns and
// expose the result as `labeled`, a DataFrame with a readable "cluster" column.
// Relies on the notebook-provided SparkSession `spark` and on a header-bearing
// `baseball.csv` reachable from the kernel's working directory.
// NOTE(review): assumes PutOutRate, AssistRate and ErrorRate are inferred as
// numeric columns -- confirm against the CSV.

// Read raw data
val sqlContext = spark.sqlContext
val baseball = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("baseball.csv")

// Pack the three rate columns into a single vector column. The output column
// is named "features" explicitly because KMeans below is configured to read
// that exact column name.
val assembler = new VectorAssembler()
  .setInputCols(Array("PutOutRate", "AssistRate", "ErrorRate"))
  .setOutputCol("features")

val dataset = assembler.transform(baseball)

// Trains a k-means model (k = 3) and assigns every row to a cluster,
// written to the integer "prediction" column.
val kmeans = new KMeans().setK(3).setFeaturesCol("features").setPredictionCol("prediction")
val model = kmeans.fit(dataset)
val transformed = model.transform(dataset)

// Add cluster label field: turn the numeric cluster id into a string such as
// "Cluster 0" so it can be used as a category.
val toLabel = udf[String, Int] { (a) =>  "Cluster " + a}
val labeled = transformed.withColumn("cluster", toLabel(transformed("prediction")))

sqlContext = org.apache.spark.sql.SQLContext@2928f129
baseball = [Age: double, Weight: int ... 14 more fields]
assembler = vecAssembler_30c0adc68f55
dataset = [Age: double, Weight: int ... 15 more fields]
kmeans = kmeans_4ad2f85a5b28
model = k...

Clustering by how a baseball player participates in outs likely indicates the position that they play

The top row of graphs shows the clusters. Click on a cluster bar to see how the data for that cluster is distributed across the other fields, shown in the bottom row of graphs.

In [2]:
%%brunel data('labeled') 
         x(cluster) y(#count:linear) bar interaction(select) color(cluster) legends(none) |
         x(AssistRate:linear) y(PutOutRate:linear) color(#count) bin(AssistRate, PutOutRate) style('symbol:rect')
            + x(AssistRate:linear) y(PutOutRate:linear) color(cluster) legends(none) |
         x(ErrorRate) y(#count) bin(ErrorRate) opacity(#selection) stack bar |
         x(BattingAvg) y(SlugRate) color(#selection)  
:: width=800, height=600

                'topojson' : '//

In [ ]: