In [1]:
%pylab inline
In [2]:
# generate some data
def create_cluster(npoints, n_clusters, seed=10):
    """
    Create clustered 2D data.

    Every cluster gets a centre drawn from a uniform distribution
    (X between 20 and 70, Y between 20e3 and 200e3). Its points are then
    drawn from normal distributions around that centre, with a standard
    deviation of 2.0 in X and 10e3 in Y.

    Arguments:
    npoints    -- (int) number of data points in every cluster
    n_clusters -- (int) number of clusters
    seed       -- (int) value handed to np.random.seed (default 10, the
                  value that used to be hard-coded)

    Returns a 2D NumPy array of shape (npoints * n_clusters, 2): column 0
    holds the X values and column 1 the Y values. With n_clusters < 1 an
    empty array of shape (0, 2) is returned.

    Note: this reseeds NumPy's *global* random generator on every call, so
    later np.random draws in the notebook are affected by it as well.
    """
    np.random.seed(seed)

    # np.concatenate refuses an empty list, so handle "no clusters" up front.
    if n_clusters < 1:
        return np.empty((0, 2))

    datax, datay = list(), list()
    for _ in range(n_clusters):
        # The draw order (Y centre, X centre, X points, Y points) is kept
        # exactly as before so the generated numbers do not change.
        center_y = np.random.uniform(low=20e3, high=200e3)  # cluster centre in Y
        center_x = np.random.uniform(low=20.0, high=70.0)   # cluster centre in X
        datax.append(np.random.normal(loc=center_x, scale=2.0, size=npoints))
        datay.append(np.random.normal(loc=center_y, scale=10e3, size=npoints))

    X, Y = np.concatenate(datax), np.concatenate(datay)
    return np.array([X, Y]).T
# Build the demo dataset (5 clusters, 20 points each) and show its shape.
data = create_cluster(
    n_clusters=5,
    npoints=20,
)
data.shape
Out[2]:
In [3]:
# Raw data as black dots: column 0 on the horizontal axis, column 1 on the vertical.
plt.plot(data[:, 0], data[:, 1], 'ko', markersize=5)
plt.xlabel('X')
plt.ylabel('Y');
In [4]:
from sklearn.cluster import KMeans
In [5]:
mymodel = KMeans(n_clusters=5)

# NOTE: the data is deliberately NOT scaled in this cell. X lives in the tens
# while Y lives in the tens to hundreds of thousands, so the distances KMeans
# minimises are dominated by Y. A later cell refits on scale(data) for
# comparison.
mymodel.fit(data)

# We can look at the clusters each data point was assigned to.
# print() with parentheses works on both Python 2 and Python 3.
print(mymodel.labels_)

# And we'll visualize it, colouring every point by its assigned cluster:
plt.scatter(data[:, 0], data[:, 1], c=mymodel.labels_.astype(float));
In [6]:
from sklearn.preprocessing import scale
In [7]:
# Small sanity check of scale(): print mean and standard deviation of a
# random sample before and after passing it through scale().
x = np.random.normal(loc=10, scale=2, size=10)
print('[Data]-> Mean = %2.4f, StDev = %2.4f' % (np.mean(x), np.std(x)))

y = scale(x)
print('[Norm]-> Mean = %2.4f, StDev = %2.4f' % (np.mean(y), np.std(y)))
In [8]:
# Refit the same estimator on the scaled data, then colour the points
# (drawn at their original, unscaled coordinates) by the new labels.
mymodel.fit(scale(data))
plt.scatter(
    data[:, 0],
    data[:, 1],
    c=mymodel.labels_.astype(float),
);
In [9]:
# The model was last fitted on scale(data), so it has to be scored in that
# same scaled space; scoring the raw data compares points against centroids
# that live on a different scale. KMeans.score returns the opposite of the
# k-means objective on X, so values closer to 0 mean tighter clusters.
mymodel.score(scale(data))
Out[9]:
In [10]:
# A fitted KMeans exposes the attribute "inertia_": the sum of squared
# distances of the samples to their closest cluster centre.
data_scaled = scale(data)  # does not depend on k, so compute it only once

inertia_per_k = list()
k_values = range(1, 10)
for k in k_values:
    model = KMeans(n_clusters=k)
    model.fit(data_scaled)
    inertia_per_k.append(model.inertia_)
    # The printed value is the inertia, so label it as such (it was "r2").
    print('k = %d, inertia = %2.4f' % (k, model.inertia_))

# Calculate the inertia relative to the first value (no groups, k=1):
# `inertia` ends up holding, for every k, how much the inertia dropped
# compared with k=1. The plotting cell reads `k_values` and `inertia`.
inertia = inertia_per_k[0] - np.array(inertia_per_k)
In [11]:
# Look for bending points
plt.plot(k_values, inertia, lw=4)
plt.xlabel('K-value'), plt.ylabel('Inertia');