In [1]:
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
// Read the raw baseball statistics and cluster players by fielding rates.
// NOTE(review): `sqlContext` is unused in this cell; kept in case a later
// notebook cell references it — confirm before removing.
val sqlContext = spark.sqlContext
val baseball = spark.read
  .format("csv")
  .option("header", "true")       // first row holds column names
  .option("inferSchema", "true")  // let Spark infer numeric column types
  .load("baseball.csv")

// Assemble the three fielding-rate columns into the single vector column
// that Spark ML estimators expect.
val assembler = new VectorAssembler()
  .setInputCols(Array("PutOutRate", "AssistRate", "ErrorRate"))
  .setOutputCol("features")
val dataset = assembler.transform(baseball)

// Train a k-means model (k = 3) on the assembled features and score every
// row with its integer cluster index in the "prediction" column.
val kmeans = new KMeans().setK(3).setFeaturesCol("features").setPredictionCol("prediction")
val model = kmeans.fit(dataset)
val transformed = model.transform(dataset)

// Add a human-readable cluster label ("Cluster 0", "Cluster 1", ...) so the
// visualization cell below can color/group by it.
val toLabel = udf((a: Int) => s"Cluster $a")
val labeled = transformed.withColumn("cluster", toLabel(col("prediction")))
Out[1]:
In [2]:
%%brunel data('labeled')
x(cluster) y(#count:linear) bar interaction(select) color(cluster) legends(none) |
x(AssistRate:linear) y(PutOutRate:linear) color(#count) bin(AssistRate, PutOutRate) style('symbol:rect')
+ x(AssistRate:linear) y(PutOutRate:linear) color(cluster) legends(none) |
x(ErrorRate) y(#count) bin(ErrorRate) opacity(#selection) stack bar |
x(BattingAvg) y(SlugRate) color(#selection)
:: width=800, height=600
Out[2]:
In [ ]: