In [3]:
from __future__ import print_function
from numpy import array
from math import sqrt
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

Loading and parsing the dataset


In [4]:
# `sc` (SparkContext) is assumed to be provided by the PySpark shell or
# notebook kernel; otherwise create one with SparkContext(appName=...)
data = sc.textFile("data/kmeans_data.txt")
# Parse each space-separated line into a NumPy array of floats
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
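
Each line of the input file is one point with space-separated coordinates. As a quick sanity check, the first parsed rows can be inspected; the values shown in the comment assume the sample kmeans_data.txt bundled with Spark, which may differ from your copy:

# Inspect the first two parsed rows (values assume Spark's bundled sample file)
print(parsedData.take(2))
# e.g. [array([0.0, 0.0, 0.0]), array([0.1, 0.1, 0.1])]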

Building the model


In [5]:
# Cluster the parsed data into 2 clusters with at most 10 iterations,
# starting from random initial centers
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")
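
KMeansModel is imported above but not otherwise exercised in this notebook; as a minimal sketch (the path below is hypothetical), a trained model can be saved and reloaded:

# Persist the trained model and load it back (path is hypothetical)
clusters.save(sc, "models/KMeansModel")
sameModel = KMeansModel.load(sc, "models/KMeansModel")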

Evaluating the clustering by computing the Within Set Sum of Squared Errors (WSSSE)


In [6]:
def error(point):
    # Euclidean distance from a point to its nearest cluster center
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

# Sum the per-point distances to the closest center over the whole dataset
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))


Within Set Sum of Squared Error = 0.692820323028
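
With the model trained, predict assigns any new point to its nearest cluster; the query point below is illustrative:

# Assign an illustrative new point to its nearest cluster
print(clusters.predict(array([0.2, 0.2, 0.2])))
# The fitted cluster centers themselves
print(clusters.centers)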

In [7]:
# Shut down the SparkContext now that we are done
sc.stop()
