notebook.community



In [10]:

    
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture



In [15]:

    
# Load the weight/height samples, report the frame's dimensions,
# and preview the first five records.
df = pd.read_csv('clustering_gmm.csv')
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)



In [6]:

    
# Scatter the raw samples (Weight on x, Height on y) on a 7x7-inch figure
# to get a first look at how the points are distributed.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df["Weight"], df["Height"])
ax.set_xlabel('Weight')
ax.set_ylabel('Height')
ax.set_title('Data Distribution')
plt.show()

As the scatter plot shows, the data form 4 clusters.

K-Means



In [16]:

    
# Fit K-Means on the two feature columns only. Selecting them explicitly
# keeps this cell idempotent: once 'cluster' has been added to df, a re-run
# would otherwise feed the previous labels back into the fit.
features = df[["Weight", "Height"]]

# random_state fixes the centroid initialisation so the labels (and thus the
# colours below) are reproducible; n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(features)

# predictions from kmeans
pred = kmeans.predict(features)
df['cluster'] = pred

# plotting results: one colour per cluster label
color = ['orange', 'green', 'cyan', 'purple']
for k in range(0, 4):
    data = df.loc[df["cluster"] == k]
    plt.scatter(data["Weight"], data["Height"], c=color[k])
plt.xlabel('Weight')
plt.ylabel('Height')
plt.title('Clustering with K-Means')
plt.show()

GMMs



In [14]:

    
# Reload the data so the 'cluster' column written by an earlier cell is
# discarded and the mixture model starts from the raw samples.
df = pd.read_csv('clustering_gmm.csv')

# Fit on the feature columns explicitly rather than on whatever df holds.
features = df[["Weight", "Height"]]

# random_state fixes the initialisation so the component labels (and thus
# the colours below) are reproducible across runs.
gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(features)

# predictions from gmm
pred = gmm.predict(features)
# The frame's columns are now Weight, Height, cluster; no renaming is needed.
df['cluster'] = pred

# plotting results: one colour per mixture component
color = ['orange', 'green', 'cyan', 'purple']
for k in range(0, 4):
    data = df.loc[df["cluster"] == k]
    plt.scatter(data["Weight"], data["Height"], c=color[k])
plt.xlabel('Weight')
plt.ylabel('Height')
plt.title('Clustering with Gaussian Mixture Models')
plt.show()

In this data sample, a distance-based method such as K-Means may not be able to find the right clusters.

	Weight	Height
0	67.062924	176.086355
1	68.804094	178.388669
2	60.930863	170.284496
3	59.733843	168.691992
4	65.431230	173.763679