K-Means Algorithm


In [1]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.feature import StringIndexer

read the iris dataset


In [2]:
def mapLibSVM(row): 
    return (row[5],Vectors.dense(row[:3]))

In [3]:
df = spark.read \
        .format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("datasets/iris.data")

In [4]:
df.show()


+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4|        0.1|Iris-setosa|
|         4.3|        3.0|         1.1|        0.1|Iris-setosa|
|         5.8|        4.0|         1.2|        0.2|Iris-setosa|
|         5.7|        4.4|         1.5|        0.4|Iris-setosa|
|         5.4|        3.9|         1.3|        0.4|Iris-setosa|
|         5.1|        3.5|         1.4|        0.3|Iris-setosa|
|         5.7|        3.8|         1.7|        0.3|Iris-setosa|
|         5.1|        3.8|         1.5|        0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows


In [5]:
indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
indexer = indexer.fit(df).transform(df)
indexer.show()
df = indexer.rdd.map(mapLibSVM).toDF(["label", "features"])
df.show()


+------------+-----------+------------+-----------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|labelIndex|
+------------+-----------+------------+-----------+-----------+----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|       0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|       0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|       0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|       0.0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|       0.0|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|       0.0|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|       0.0|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|       0.0|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|       0.0|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|       0.0|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|       0.0|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|       0.0|
|         4.8|        3.0|         1.4|        0.1|Iris-setosa|       0.0|
|         4.3|        3.0|         1.1|        0.1|Iris-setosa|       0.0|
|         5.8|        4.0|         1.2|        0.2|Iris-setosa|       0.0|
|         5.7|        4.4|         1.5|        0.4|Iris-setosa|       0.0|
|         5.4|        3.9|         1.3|        0.4|Iris-setosa|       0.0|
|         5.1|        3.5|         1.4|        0.3|Iris-setosa|       0.0|
|         5.7|        3.8|         1.7|        0.3|Iris-setosa|       0.0|
|         5.1|        3.8|         1.5|        0.3|Iris-setosa|       0.0|
+------------+-----------+------------+-----------+-----------+----------+
only showing top 20 rows

+-----+-------------+
|label|     features|
+-----+-------------+
|  0.0|[5.1,3.5,1.4]|
|  0.0|[4.9,3.0,1.4]|
|  0.0|[4.7,3.2,1.3]|
|  0.0|[4.6,3.1,1.5]|
|  0.0|[5.0,3.6,1.4]|
|  0.0|[5.4,3.9,1.7]|
|  0.0|[4.6,3.4,1.4]|
|  0.0|[5.0,3.4,1.5]|
|  0.0|[4.4,2.9,1.4]|
|  0.0|[4.9,3.1,1.5]|
|  0.0|[5.4,3.7,1.5]|
|  0.0|[4.8,3.4,1.6]|
|  0.0|[4.8,3.0,1.4]|
|  0.0|[4.3,3.0,1.1]|
|  0.0|[5.8,4.0,1.2]|
|  0.0|[5.7,4.4,1.5]|
|  0.0|[5.4,3.9,1.3]|
|  0.0|[5.1,3.5,1.4]|
|  0.0|[5.7,3.8,1.7]|
|  0.0|[5.1,3.8,1.5]|
+-----+-------------+
only showing top 20 rows


In [6]:
# Trains a k-means model (Estimator).
kmeans = KMeans().setK(3).setSeed(3)

In [ ]:


In [7]:
model = kmeans.fit(df)

In [8]:
# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(df)
print("Within Set Sum of Squared Errors = " + str(wssse))


Within Set Sum of Squared Errors = 69.51236666666772

In [9]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


Cluster Centers: 
[ 5.006  3.418  1.464]
[ 5.86833333  2.74        4.38166667]
[ 6.8525  3.07    5.6925]

In [10]:
result = model.transform(df)

In [11]:
predictions = result.select(["prediction","label"])
predictions.show()


+----------+-----+
|prediction|label|
+----------+-----+
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
+----------+-----+
only showing top 20 rows


In [ ]: