In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [15]:
# Load the samples from the CSV file into a DataFrame.
df = pd.read_csv('clustering_gmm.csv')
# Print the (rows, columns) shape; the recorded output below shows (500, 2).
print(df.shape)
# Last expression of the cell, so the first five rows are rendered as its output.
df.head()


(500, 2)
Out[15]:
Weight Height
0 67.062924 176.086355
1 68.804094 178.388669
2 60.930863 170.284496
3 59.733843 168.691992
4 65.431230 173.763679

In [6]:
# Scatter the raw Weight/Height samples on a square 7x7 canvas so that any
# grouping in the data can be judged by eye before clustering.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df["Weight"], df["Height"])
ax.set(xlabel='Weight', ylabel='Height', title='Data Distribution')
plt.show()


The scatter plot above suggests that the data contains four clusters.

K-Means


In [16]:
# Cluster on the two measurement columns only. Selecting them explicitly keeps
# the cell idempotent: the original fitted on all of `df`, so re-running the
# cell after `cluster` had been added would have clustered on the labels too.
features = df[["Weight", "Height"]]

# random_state pins the centroid initialisation so the labels (and therefore
# the colours below) are the same on every run; n_init is spelled out because
# its default value differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

#predictions from kmeans
# fit_predict fits the model and returns the label of each sample in one call.
pred = kmeans.fit_predict(features)
df['cluster'] = pred

#plotting results
# One scatter call per cluster so each cluster gets its own colour and legend entry.
color = ['orange', 'green', 'cyan', 'purple']
fig, ax = plt.subplots()
for k in range(0, 4):
    data = df.loc[df["cluster"] == k]
    ax.scatter(data["Weight"], data["Height"], c=color[k], label=f'cluster {k}')
ax.set(xlabel='Weight', ylabel='Height', title='Clustering with K-Means')
ax.legend()
plt.show()


GMMs


In [14]:
# reload the data so this cell starts from the raw measurements and does not
# depend on a `cluster` column added to `df` by an earlier cell
df = pd.read_csv('clustering_gmm.csv')

# Fit on the two measurement columns only, named explicitly so the model input
# stays the same even if further columns are added to `df`.
features = df[["Weight", "Height"]]

# random_state pins the EM initialisation so the component labels (and
# therefore the colours below) are the same on every run.
gmm = GaussianMixture(n_components=4, random_state=42)

#predictions from gmm
# fit_predict fits the mixture and returns the most likely component per sample.
pred = gmm.fit_predict(features)
# Adding the labels yields the columns Weight, Height, cluster; the original's
# extra `df.columns = [...]` reassignment was a no-op and has been dropped.
df['cluster'] = pred

# One scatter call per component so each gets its own colour and legend entry.
color = ['orange', 'green', 'cyan', 'purple']
fig, ax = plt.subplots()
for k in range(0, 4):
    data = df[df["cluster"] == k]
    ax.scatter(data["Weight"], data["Height"], c=color[k], label=f'cluster {k}')
ax.set(xlabel='Weight', ylabel='Height',
       title='Clustering with Gaussian Mixture Models')
ax.legend()
plt.show()


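In this data sample, a distance-based method such as k-means may not be able to find the right clusters: it assigns each point to its nearest centroid and therefore favours compact, roughly spherical groups, whereas a Gaussian mixture model can also fit elongated, correlated clusters.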