cf. Clustering (Andrew Ng, Machine Learning, Coursera)
cf. Clustering: the K-Means Algorithm
cf. Ritchie Ng's example, K-Nearest Neighbors (KNN) Classification Model
In [2]:
%matplotlib inline
In [5]:
# sanity check for python setup
import sys
print(sys.executable)
print(sys.path)
In [3]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
In [4]:
from sklearn.cluster import KMeans
from sklearn import datasets
In [6]:
np.random.seed(5)
In [20]:
centers = [[1, 1], [-1, -1], [1, -1]]  # defined but not used below
iris = datasets.load_iris()
X = iris.data    # 150 samples x 4 features
y = iris.target  # 3 species, encoded 0-2
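A quick way to see what the four columns of X contain (a sketch, not part of the original notebook); the column indices 3, 0, 2 used in the plots below refer to these feature names:
print(iris.feature_names)  # sepal length/width, petal length/width (cm)
print(iris.target_names)   # setosa, versicolor, virginica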
In [21]:
estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1, init='random')}
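Before plotting, the three configurations can be compared numerically; this is a minimal sketch (not in the original) using inertia_, scikit-learn's within-cluster sum of squared distances, where lower means tighter clusters:
for name, est in estimators.items():
    est.fit(X)
    # inertia_ is only available after fit()
    print(name, 'inertia:', est.inertia_)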
In [29]:
fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    plt.cla()
    est.fit(X)
    labels = est.labels_
    # color each point by its assigned cluster label
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum = fignum + 1

# Plot the ground truth
fig = plt.figure(fignum, figsize=(4, 3))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
plt.cla()
for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Reorder the labels so colors roughly match the cluster results above
yint = np.choose(y, [1, 2, 0])
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=yint)
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
plt.show()
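To put a number on how well the 3-cluster solution agrees with the species labels, here is a small sketch (not in the original) using the adjusted Rand index, which is invariant to how the clusters happen to be numbered:
from sklearn.metrics import adjusted_rand_score
km3 = KMeans(n_clusters=3).fit(X)
# 1.0 would be perfect agreement with y; 0 is chance level
print(adjusted_rand_score(y, km3.labels_))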
In [32]:
import pandas as pd
In [37]:
print(type(X))
print(type(y))
print(X.shape)
print(y.shape)
print(pd.DataFrame(y).iloc[:,0].unique())
In [38]:
from sklearn.neighbors import KNeighborsClassifier
In [39]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X,y)
y_pred=knn.predict(X)
In [40]:
# compute classification accuracy for the KNN model (K=5)
from sklearn import metrics
In [41]:
print(metrics.accuracy_score(y,y_pred))
In [42]:
### 1c. KNN (K=1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X,y)
y_pred=knn.predict(X)
print(metrics.accuracy_score(y,y_pred))
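The 1.0 above is expected: with K=1 each training point is its own nearest neighbor, so evaluating on the training data just reads the memorized labels back out. A hedged sketch of a more honest estimate via 5-fold cross-validation (assumes scikit-learn's model_selection module, available since 0.18):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=5)
print(cv_scores.mean())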
In [43]:
# STEP 1: split X and y into training and testing sets
from sklearn.model_selection import train_test_split  # cross_validation was renamed to model_selection
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=4)
In [44]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
In [45]:
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
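Accuracy alone hides which species get confused with each other; a quick sketch (not in the original) of the per-class breakdown on the same test split:
# rows are true species, columns are predicted species
print(metrics.confusion_matrix(y_test, y_pred))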
In [47]:
# try K=1 through K=25 and record the testing accuracy for each
k_range = range(1, 26)
# scores is an empty Python list (created with [] or list()), not a dictionary
scores = []
# loop over K = 1..25, fit a model for each K, and append its testing accuracy
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
print(scores)
In [48]:
# matplotlib was imported above and %matplotlib inline is already active
# plot the relationship between K and testing accuracy
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
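Reading the best K off the plot can also be done programmatically; a sketch where ties go to the smallest K, since list.index returns the first match:
best_k = k_range[scores.index(max(scores))]
print('best K by testing accuracy:', best_k)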
In [50]:
print(pd.DataFrame(X).describe())
print(pd.DataFrame(X).head())
In [51]:
pd.DataFrame(X).head(10)
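The bare integer column names 0-3 above are easy to misread; a small sketch (not in the original) attaching the dataset's own feature names before inspecting it:
df = pd.DataFrame(X, columns=iris.feature_names)
print(df.head(10))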