Documentation: http://spark.apache.org/docs/1.6.3/mllib-clustering.html#k-means
In [3]:
from __future__ import print_function
from numpy import array
from math import sqrt
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
In [4]:
# Load and parse the data. This assumes the SparkContext `sc` provided by the
# PySpark shell / notebook kernel (SparkContext is imported above in case one
# has to be created manually).
data = sc.textFile("data/kmeans_data.txt")
# Each line becomes a NumPy array of floats, one point per line.
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
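To sanity-check the parsing assumption (space-separated floats, one point per line), it can help to peek at a few raw input lines first. This is a quick sketch using the `data` RDD above, not part of the original example:
In [ ]:
# Print the first few raw lines; each should look like "0.0 0.0 0.0".
for line in data.take(3):
    print(line)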
In [5]:
# Build the model: cluster the data into 2 classes, running at most 10
# iterations of k-means starting from randomly chosen initial centers.
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")
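After training, the fitted model can be inspected: `KMeansModel` exposes the learned centers via its `centers` attribute and the number of clusters via `k`. A minimal sketch, not part of the original example:
In [ ]:
# Inspect the trained model: k cluster centers, each a NumPy array.
print("k =", clusters.k)
for i, center in enumerate(clusters.centers):
    print("center", i, ":", center)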
In [6]:
# Evaluate clustering by computing the Within Set Sum of Squared Errors.
def error(point):
    # Distance from the point to its nearest cluster center.
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
In [7]:
sc.stop()