In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
In [3]:
iris = datasets.load_iris()
In [59]:
dir(iris)
Out[59]:
In [7]:
X = iris.data
y = iris.target
In [50]:
## Convert the arrays to a DataFrame
df_iris = pd.DataFrame(X, columns=iris.feature_names)
df_iris['target'] = pd.Series(y).astype('category')
df_iris.head()
Out[50]:
In [56]:
### Pairplot
g = sns.pairplot(df_iris, hue="target", vars=iris.feature_names)
plt.show()
In [61]:
## KMeans - fit model
model = KMeans(n_clusters=3)
model.fit(X)
Out[61]:
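KMeans starts from randomly chosen centroids, so labels_ and inertia_ can differ slightly from run to run. As an optional sketch (the seeded model below, model_seeded, is illustrative and not part of the original run), fixing random_state makes the fit reproducible:
# Optional: fix the seed so repeated runs give identical clusters;
# n_init controls how many random initializations are tried.
model_seeded = KMeans(n_clusters=3, n_init=10, random_state=42)
model_seeded.fit(X)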
In [65]:
## Get the cluster labels
model.labels_
Out[65]:
In [67]:
## Get the cluster centers
centroid = model.cluster_centers_
centroid
Out[67]:
In [63]:
# Predict cluster assignments for the first three samples
model.predict(X[:3])
Out[63]:
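predict simply assigns each sample to its nearest centroid. A minimal sketch (using numpy directly, not part of the original notebook) reproduces the same assignments from cluster_centers_:
import numpy as np
# Euclidean distance from each of the first three samples to each centroid,
# then pick the closest one; this should match model.predict(X[:3]).
dists = np.linalg.norm(X[:3, None, :] - centroid[None, :, :], axis=2)
dists.argmin(axis=1)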
In [71]:
# Plot the centroids (diamonds) over a petal length vs petal width scatter
# coloured by the true target
plt.scatter(X[:, 2], X[:, 3], c=y)
plt.scatter(centroid[:, 2], centroid[:, 3], marker='D', s=50)
plt.show()
In [73]:
# Inertia - Sum of squared distances of samples to their closest cluster center
model.inertia_
Out[73]:
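The inertia reported above can be reproduced by hand from labels_ and cluster_centers_; the sketch below (not part of the original run) sums the squared distance of every sample to its assigned centroid:
import numpy as np
# Squared distance of each sample to the centroid of its assigned cluster,
# summed over all samples; agrees with model.inertia_ up to floating-point error.
assigned = centroid[model.labels_]
np.sum((X - assigned) ** 2)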
In [74]:
### Select the best k (elbow method)
ks = range(1, 6)
inertias = []
for k in ks:
    model_tmp = KMeans(n_clusters=k).fit(X)
    inertias.append(model_tmp.inertia_)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
In [75]:
eval_label = pd.DataFrame({'label': model.labels_, 'target': y})
In [79]:
pd.crosstab(eval_label['label'], eval_label['target'])
Out[79]:
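The crosstab shows how the arbitrary cluster numbers line up with the true species. One way to condense that comparison into a single, label-permutation-invariant score (a sketch, not part of the original analysis) is the adjusted Rand index from sklearn.metrics:
from sklearn.metrics import adjusted_rand_score
# 1.0 means perfect agreement with the true species; values near 0 mean chance level.
adjusted_rand_score(eval_label['target'], eval_label['label'])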
In [85]:
# In clustering, features with very different variances are a problem.
# If a feature has a variance that is orders of magnitude larger than the others,
# it can dominate the objective function and keep the estimator from
# learning from the other features as expected.
from numpy import var
pd.DataFrame({'feature': iris.feature_names,
              'variance': var(X, ddof=1, axis=0)})
Out[85]:
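The variances above differ noticeably across features, which motivates scaling before clustering. As a quick sanity check (a sketch, not part of the original run, reusing var from the previous cell), the same table recomputed after StandardScaler shows every feature at roughly unit variance:
# After standardization no single feature can dominate the squared-distance objective.
# Values come out slightly above 1 because StandardScaler divides by the population
# standard deviation while var(..., ddof=1) is the sample variance.
X_scaled = StandardScaler().fit_transform(X)
pd.DataFrame({'feature': iris.feature_names,
              'variance': var(X_scaled, ddof=1, axis=0)})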
In [101]:
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
kmeans = KMeans(n_clusters=3)
pipeline = make_pipeline(scaler, kmeans)
pipeline.fit(X)
labels = pipeline.predict(X)
In [90]:
eval_df = pd.DataFrame({'labels': labels, 'target': iris.target})
In [92]:
pd.crosstab(eval_df['labels'], eval_df['target'])
Out[92]:
In [102]:
# Inertia of the k-means step fitted inside the pipeline
# (measured in the standardized feature space)
kmeans.inertia_
Out[102]:
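Note that make_pipeline fits its step objects in place, so the kmeans variable above already holds the fitted model; the same attribute can be reached more explicitly through the pipeline itself (step names are the lowercased class names):
# Equivalent access via the pipeline's named_steps
pipeline.named_steps['kmeans'].inertia_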
In [108]:
#Normalize samples individually to unit norm.
#Each sample (i.e. each row of the data matrix) with at least one non zero
#component is rescaled independently of other samples so that its norm (l1 or l2) equals one.
normalizer = Normalizer()
kmeans = KMeans(n_clusters=3, init='k-means++')
pipeline = make_pipeline(normalizer, kmeans)
pipeline.fit(X)
labels = pipeline.predict(X)
eval_df = pd.DataFrame({'labels': labels, 'target': iris.target})
pd.crosstab(eval_df['labels'], eval_df['target'])
Out[108]:
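Unlike StandardScaler, which rescales each feature (column), Normalizer rescales each sample (row) to unit norm. A quick check (sketch only, not part of the original run) confirms that every transformed row has L2 norm 1:
import numpy as np
# Every row of the normalized data has unit L2 norm.
X_norm = Normalizer().fit_transform(X)
np.allclose(np.linalg.norm(X_norm, axis=1), 1.0)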
In [109]:
# Inertia of the k-means step fitted on the normalized data; the data were
# rescaled, so this value is not directly comparable to the earlier inertias
kmeans.inertia_
Out[109]: