In [ ]:
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.VectorAssembler
In [ ]:
// Use the "Insert to code" option in the data panel to load the taxi trip data as a DataFrame named taxifile.
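// A minimal sketch of what that generated cell might look like, assuming the trips
// are stored as a headered CSV; the file name below is hypothetical, so replace this
// whole snippet with the code the data panel actually inserts for your data asset.
val taxifile = (spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("nyc_taxi_trips.csv")) // hypothetical file name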
In [ ]:
taxifile.createOrReplaceTempView("taxifile")
val taxifence = spark.sql("""select Dropoff_latitude,Dropoff_longitude from taxifile where
Dropoff_latitude > 40.70 and
Dropoff_latitude < 40.86 and
Dropoff_longitude > -74.02 and
Dropoff_longitude < -73.93""")
taxifence.count
In [ ]:
// Assemble the two coordinate columns into a single "features" vector column
val assembler = (new VectorAssembler()
  .setInputCols(Array("Dropoff_latitude", "Dropoff_longitude"))
  .setOutputCol("features"))
val taxivector = assembler.transform(taxifence)
// Keep only the assembled features column for clustering
val taxifeat = taxivector.drop("Dropoff_latitude", "Dropoff_longitude")
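In [ ]:
// Optional sanity check: each row of "features" should now be a two-element
// vector of [latitude, longitude], e.g. [40.75,-73.99].
taxifeat.show(5, false)
taxifeat.printSchema()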
In [ ]:
// Cluster the dropoff points into k = 2 groups; the fixed seed keeps runs reproducible
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(taxifeat)
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
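In [ ]:
// A possible follow-up, assuming Spark 2.3+: attach a "prediction" column giving each
// point's cluster and score the split with the silhouette metric. The value names
// (predictions, evaluator, silhouette) are illustrative, not part of the original notebook.
import org.apache.spark.ml.evaluation.ClusteringEvaluator

val predictions = model.transform(taxifeat)
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")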
In [ ]: