In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
In [15]:
# Read the clustering sample into a DataFrame, report its (rows, columns)
# shape, and preview the first five rows as the cell output.
df = pd.read_csv('clustering_gmm.csv')
print(df.shape)
df.head(n=5)
Out[15]:
In [6]:
# Scatter plot of the raw samples (Weight on x, Height on y) on a 7x7 inch
# figure, drawn through an explicit Axes object instead of pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df["Weight"], df["Height"])
ax.set_xlabel('Weight')
ax.set_ylabel('Height')
ax.set_title('Data Distribution')
plt.show()
In [16]:
# Cluster the samples into four groups with K-Means and plot each group in
# its own colour.
#
# Fit on the two measurement columns only: this cell adds a 'cluster' column
# to df, so fitting on the whole frame would feed the previous labels back in
# as a feature whenever the cell is re-run without reloading the data.
features = df[["Weight", "Height"]]
# random_state pins the centroid initialisation so the plot is reproducible;
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(features)
#predictions from kmeans
pred = kmeans.predict(features)
df['cluster'] = pred
#plotting results
color=['orange','green','cyan', 'purple']
# Iterate over the model's own cluster count so it cannot drift from n_clusters.
for k in range(kmeans.n_clusters):
    data = df.loc[df["cluster"]==k]
    plt.scatter(data["Weight"],data["Height"],c=color[k])
# Same axis labels as the raw-data scatter plot so the figures are comparable.
plt.xlabel('Weight')
plt.ylabel('Height')
plt.title('Clustering with K-Means')
plt.show()
In [14]:
# Cluster the same samples with a four-component Gaussian mixture model and
# plot each component in its own colour.
#
# reload the data so the frame holds only the columns stored in the CSV
df = pd.read_csv('clustering_gmm.csv')
# Select the feature columns by name; the column labels are kept exactly as
# read from the file rather than being reassigned by position.
features = df[["Weight", "Height"]]
# random_state pins the initialisation so the labels and plot are reproducible.
gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(features)
#predictions from gmm
pred = gmm.predict(features)
df['cluster'] = pred
color=['orange','green','cyan', 'purple']
# Iterate over the model's own component count so it cannot drift from n_components.
for k in range(gmm.n_components):
    data = df[df["cluster"]==k]
    plt.scatter(data["Weight"],data["Height"],c=color[k])
# Same axis labels as the raw-data scatter plot so the figures are comparable.
plt.xlabel('Weight')
plt.ylabel('Height')
plt.title('Clustering with Gaussian Mixture Models')
plt.show()
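In this data sample, a distance-based method such as k-means may not be able to find the right clusters: it assigns each point to its nearest centroid and therefore favours compact, roughly circular groups, whereas a Gaussian mixture model also fits a covariance for each component and can follow elongated clusters.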