In [ ]:
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.VectorAssembler

In [ ]:
// Use the "Insert to code" option in the data panel to load the taxi trip data as a Spark DataFrame named taxifile
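// Alternatively, a minimal sketch for loading the trips by hand, assuming the
// data is available as a CSV file (the path "nyc_taxi_trips.csv" is a
// placeholder); either way the result should be a DataFrame named taxifile:
val taxifile = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("nyc_taxi_trips.csv")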

In [ ]:
// Register the taxi trips DataFrame as a temp view so the geofence can be
// expressed in SQL
taxifile.createOrReplaceTempView("taxifile")

// Keep only drop-offs that fall inside a rough Manhattan bounding box
val taxifence = spark.sql("""select Dropoff_latitude, Dropoff_longitude from taxifile where
   Dropoff_latitude > 40.70 and 
   Dropoff_latitude < 40.86 and 
   Dropoff_longitude > -74.02 and 
   Dropoff_longitude < -73.93""")

// Number of drop-offs inside the fence
taxifence.count()

In [ ]:
// Combine the latitude and longitude columns into a single "features"
// vector column, the input format Spark ML estimators expect
val assembler = (new VectorAssembler()
    .setInputCols(Array("Dropoff_latitude", "Dropoff_longitude"))
    .setOutputCol("features"))
val taxivector = assembler.transform(taxifence)

// Keep only the assembled features column for clustering
val taxifeat = taxivector.drop("Dropoff_latitude", "Dropoff_longitude")
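
// Quick sanity check: each row of taxifeat should now hold a single
// [latitude, longitude] feature vector
taxifeat.show(5, false)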

In [ ]:
// Fit a k-means model with two clusters; the seed is fixed for
// reproducibility, and KMeans reads the "features" column by default
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(taxifeat)

println("Cluster Centers: ")
model.clusterCenters.foreach(println)

In [ ]:
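// Optional follow-up, a sketch assuming Spark 2.3+ (which ships
// ClusteringEvaluator): assign each drop-off to its nearest cluster and
// score the clustering with the silhouette metric, which can help judge
// whether k = 2 is a reasonable choice.
import org.apache.spark.ml.evaluation.ClusteringEvaluator

val predictions = model.transform(taxifeat)   // adds a "prediction" column
val silhouette = new ClusteringEvaluator().evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")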