In [19]:
from sklearn import manifold
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
In [2]:
iris = load_iris()
X = iris.data    # feature matrix, shape (150, 4)
y = iris.target  # integer class labels (0, 1, 2)
# Peek at a few rows/labels to sanity-check the load.
# print() syntax works in both Python 2 and 3 for a single argument;
# the original Python-2-only `print X` statement breaks on Python 3.
print(X[4:10])
print(y[4:10])
In [3]:
# data scaling
## This step is important in manifold learning, since the algorithms are based on nearest-neighbour search;
## without scaling, the algorithms can be dominated by columns with very large or very small magnitudes.
sc = StandardScaler()
scaled_X = sc.fit_transform(X)  # zero mean, unit variance per column
print(scaled_X[4:10])
In [7]:
# From my implementation here: https://github.com/hanhanwu/Hanhan_Data_Science_Practice/blob/master/Outliers_and_Clustering/finding_optimal_k.R
## the optimal k tends to be 3
optimal_k = 3
# n_init=100 restarts for a stable solution; random_state for reproducibility.
# NOTE: the original passed n_jobs=-1, which was deprecated in scikit-learn 0.23
# and removed in 1.0, so it is no longer passed (results are unaffected).
kmeans = KMeans(n_clusters=optimal_k, n_init=100, random_state=410).fit(scaled_X)
labels = kmeans.labels_  # cluster assignment per sample
# Human-readable names 'Class1'..'Class{k}' used by the plotting cells below.
formated_labels = ['Class{}'.format(k) for k in range(1, optimal_k+1)]
formated_labels
Out[7]:
In [29]:
# 2D visualization
n_dimensions = 2
reduced_X = manifold.TSNE(n_components=n_dimensions, learning_rate=10, random_state=410).fit_transform(scaled_X)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
# One scatter per cluster, colored evenly across the rainbow colormap.
for lab, colr in zip(range(optimal_k), plt.cm.rainbow(np.linspace(0, 1, optimal_k))):
    ax.scatter(reduced_X[labels == lab, 0],
               reduced_X[labels == lab, 1],
               label=formated_labels[lab],  # fixed: original referenced undefined `y_labels` (NameError)
               c=[colr],                    # wrap single RGBA row in a list to avoid matplotlib warning
               alpha=0.5)                   # alpha for transparency
ax.set_xlabel('t-SNE dimension 1')          # fixed: these axes are projected dimensions, not classes
ax.set_ylabel('t-SNE dimension 2')
ax.legend()                                 # without this, the per-cluster labels never render
Out[29]:
In [30]:
# 3D visualization
n_dimensions = 3
reduced_X = manifold.TSNE(n_components=n_dimensions, learning_rate=1, random_state=410).fit_transform(scaled_X)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
# One scatter per cluster, colored evenly across the rainbow colormap.
for lab, colr in zip(range(optimal_k), plt.cm.rainbow(np.linspace(0, 1, optimal_k))):
    ax.scatter(reduced_X[labels == lab, 0],
               reduced_X[labels == lab, 1],
               reduced_X[labels == lab, 2],
               label=formated_labels[lab],  # fixed: original referenced undefined `y_labels` (NameError)
               c=[colr],                    # wrap single RGBA row in a list to avoid matplotlib warning
               alpha=0.5)                   # alpha for transparency
ax.set_xlabel('t-SNE dimension 1')          # fixed: these axes are projected dimensions, not classes
ax.set_ylabel('t-SNE dimension 2')
ax.set_zlabel('t-SNE dimension 3')
ax.legend()                                 # without this, the per-cluster labels never render
Out[30]: