K-Means Algorithm



In [1]:

    
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.feature import StringIndexer

read the iris dataset



In [2]:

    
def mapLibSVM(row): 
    return (row[5],Vectors.dense(row[:3]))



In [3]:

    
df = spark.read \
        .format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("datasets/iris.data")



In [4]:

    
df.show()









    



+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4|        0.1|Iris-setosa|
|         4.3|        3.0|         1.1|        0.1|Iris-setosa|
|         5.8|        4.0|         1.2|        0.2|Iris-setosa|
|         5.7|        4.4|         1.5|        0.4|Iris-setosa|
|         5.4|        3.9|         1.3|        0.4|Iris-setosa|
|         5.1|        3.5|         1.4|        0.3|Iris-setosa|
|         5.7|        3.8|         1.7|        0.3|Iris-setosa|
|         5.1|        3.8|         1.5|        0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows



In [5]:

    
indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
indexer = indexer.fit(df).transform(df)
indexer.show()
df = indexer.rdd.map(mapLibSVM).toDF(["label", "features"])
df.show()









    



+------------+-----------+------------+-----------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|labelIndex|
+------------+-----------+------------+-----------+-----------+----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|       0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|       0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|       0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|       0.0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|       0.0|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|       0.0|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|       0.0|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|       0.0|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|       0.0|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|       0.0|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|       0.0|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|       0.0|
|         4.8|        3.0|         1.4|        0.1|Iris-setosa|       0.0|
|         4.3|        3.0|         1.1|        0.1|Iris-setosa|       0.0|
|         5.8|        4.0|         1.2|        0.2|Iris-setosa|       0.0|
|         5.7|        4.4|         1.5|        0.4|Iris-setosa|       0.0|
|         5.4|        3.9|         1.3|        0.4|Iris-setosa|       0.0|
|         5.1|        3.5|         1.4|        0.3|Iris-setosa|       0.0|
|         5.7|        3.8|         1.7|        0.3|Iris-setosa|       0.0|
|         5.1|        3.8|         1.5|        0.3|Iris-setosa|       0.0|
+------------+-----------+------------+-----------+-----------+----------+
only showing top 20 rows

+-----+-------------+
|label|     features|
+-----+-------------+
|  0.0|[5.1,3.5,1.4]|
|  0.0|[4.9,3.0,1.4]|
|  0.0|[4.7,3.2,1.3]|
|  0.0|[4.6,3.1,1.5]|
|  0.0|[5.0,3.6,1.4]|
|  0.0|[5.4,3.9,1.7]|
|  0.0|[4.6,3.4,1.4]|
|  0.0|[5.0,3.4,1.5]|
|  0.0|[4.4,2.9,1.4]|
|  0.0|[4.9,3.1,1.5]|
|  0.0|[5.4,3.7,1.5]|
|  0.0|[4.8,3.4,1.6]|
|  0.0|[4.8,3.0,1.4]|
|  0.0|[4.3,3.0,1.1]|
|  0.0|[5.8,4.0,1.2]|
|  0.0|[5.7,4.4,1.5]|
|  0.0|[5.4,3.9,1.3]|
|  0.0|[5.1,3.5,1.4]|
|  0.0|[5.7,3.8,1.7]|
|  0.0|[5.1,3.8,1.5]|
+-----+-------------+
only showing top 20 rows



In [6]:

    
# Trains a k-means model (Estimator).
kmeans = KMeans().setK(3).setSeed(3)



In [ ]:



In [7]:

    
model = kmeans.fit(df)



In [8]:

    
# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(df)
print("Within Set Sum of Squared Errors = " + str(wssse))









    



Within Set Sum of Squared Errors = 69.51236666666772



In [9]:

    
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)









    



Cluster Centers: 
[ 5.006  3.418  1.464]
[ 5.86833333  2.74        4.38166667]
[ 6.8525  3.07    5.6925]



In [10]:

    
result = model.transform(df)



In [11]:

    
predictions = result.select(["prediction","label"])
predictions.show()









    



+----------+-----+
|prediction|label|
+----------+-----+
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
+----------+-----+
only showing top 20 rows



In [ ]: