In [1]:
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
// Read the raw baseball statistics and cluster players by fielding rates.
// NOTE(review): `sqlContext` is unused in this cell; kept in case a later
// notebook cell references it — confirm before removing.
val sqlContext = spark.sqlContext
val baseball = spark.read
  .format("csv")
  .option("header", "true")       // first row holds column names
  .option("inferSchema", "true")  // let Spark infer numeric column types
  .load("baseball.csv")

// Assemble the three fielding-rate columns into the single vector column
// that Spark ML estimators expect.
val assembler = new VectorAssembler()
  .setInputCols(Array("PutOutRate", "AssistRate", "ErrorRate"))
  .setOutputCol("features")
val dataset = assembler.transform(baseball)

// Train a k-means model (k = 3) on the assembled features and score every
// row with its integer cluster index in the "prediction" column.
val kmeans = new KMeans().setK(3).setFeaturesCol("features").setPredictionCol("prediction")
val model = kmeans.fit(dataset)
val transformed = model.transform(dataset)

// Add a human-readable cluster label ("Cluster 0", "Cluster 1", ...) so the
// visualization cell below can color/group by it.
val toLabel = udf((a: Int) => s"Cluster $a")
val labeled = transformed.withColumn("cluster", toLabel(col("prediction")))
Out[1]:
In [2]:
%%brunel data('labeled')
x(cluster) y(#count:linear) bar interaction(select) color(cluster) legends(none) |
x(AssistRate:linear) y(PutOutRate:linear) color(#count) bin(AssistRate, PutOutRate) style('symbol:rect')
+ x(AssistRate:linear) y(PutOutRate:linear) color(cluster) legends(none) |
x(ErrorRate) y(#count) bin(ErrorRate) opacity(#selection) stack bar |
x(BattingAvg) y(SlugRate) color(#selection)
:: width=800, height=600
Out[2]:
In [ ]: