In [1]:
import numpy as np
#import pandas as pd
import matplotlib.pyplot as plt
#%matplotlib inline
# --- Synthetic data: c_num tight Gaussian blobs scattered over the unit square ---
c_num = 100   # number of blob centres (reused as k by the k-means cell)
n = 1000      # points drawn around each centre

# Dedicated, seeded generator so the data (and the figure) are reproducible on
# Restart & Run All without touching numpy's global random state.
SEED = 42
rng = np.random.RandomState(SEED)

# Each centre is an (x, y) tuple drawn uniformly from [0, 1) x [0, 1).
centers = [tuple([rng.rand(), rng.rand()]) for p in range(c_num)]
print(centers)  # parenthesised form works on both Python 2 and Python 3

# Despite its name, `sd` is passed to multivariate_normal as the *covariance*
# matrix: variance 1e-4 on each axis (standard deviation 0.01), no correlation.
sd = np.eye(2) / 10000

# One (n, 2) sample per centre, stacked into a single (c_num * n, 2) array.
X = np.vstack([rng.multivariate_normal(c, sd, size=n) for c in centers])

fig = plt.figure()
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.title("Synthetic data: %d Gaussian blobs, %d points each" % (c_num, n))
plt.xlabel("x")
plt.ylabel("y")
display(fig)  # display() is supplied by the notebook runtime, not imported here
In [2]:
from pyspark.mllib.clustering import KMeans, KMeansModel
# --- Cluster the synthetic points with Spark MLlib k-means ---
# `sc` is not defined in this notebook; presumably it is the SparkContext that
# the notebook runtime injects -- confirm for the target environment.
X_parr = sc.parallelize(X)

# k is set to the number of blobs that were generated (c_num); centroids start
# from random points ("random" mode) and training stops after at most 15 iterations.
model = KMeans.train(X_parr, c_num, maxIterations=15, initializationMode="random")

# clusterCenters is the documented accessor for the fitted centroids.
# The parenthesised print works on both Python 2 and Python 3.
print(model.clusterCenters)

# Predict a cluster index for every point and pull the labels back to the driver.
# The colouring below relies on collect() returning labels in the same order as
# the rows of X.
predictions = model.predict(X_parr).collect()

fig = plt.figure()
plt.scatter(x=X[:, 0], y=X[:, 1], c=predictions)
plt.title("k-means assignment (k = %d)" % c_num)
plt.xlabel("x")
plt.ylabel("y")
display(fig)  # display() is supplied by the notebook runtime, not imported here
In [3]: