In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Clustering Blobs


In [3]:
from sklearn.datasets import make_blobs
X, y = make_blobs(random_state=42, centers=3)
X[:, 1] += 0.25 * X[:, 0]**2   # bend the blobs so the clusters are no longer roughly spherical
# plt.scatter(X[:, 0], X[:, 1], 20, y, edgecolor='none')  # color by true label
plt.plot(X[:, 0], X[:, 1], 'ok')


Out[3]:
[<matplotlib.lines.Line2D at 0x7fe029507b00>]

In [7]:
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering

# Other clustering algorithms to try in place of SpectralClustering:
# cluster = AffinityPropagation()
# cluster = KMeans(n_clusters=3)
cluster = SpectralClustering(n_clusters=3)

# SpectralClustering has no predict() method, so use fit_predict
labels = cluster.fit_predict(X)

print('Cluster labels:\n', labels)
print('True labels:\n', y)
plt.scatter(X[:, 0], X[:, 1], 20, labels, edgecolor='none')
# KMeans additionally exposes cluster_centers_, which can be plotted:
# for n in range(3):
#     plt.plot(cluster.cluster_centers_[n, 0], cluster.cluster_centers_[n, 1], 'ok', markersize=20)


Cluster labels:
 [0 2 1 2 0 2 1 2 2 1 1 0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 2 0 0 0 0 2 2 0 1 1 1
 1 2 2 0 1 2 1 1 2 0 0 0 2 2 2 1 0 0 0 1 1 2 1 0 2 0 2 0 0 2 0 2 2 2 0 0 1
 2 0 2 0 0 2 1 2 1 0 1 1 1 2 1 2 2 2 1 2 1 1 1 2 0 1]
True labels:
 [2 1 0 1 2 1 0 1 1 0 0 2 2 0 0 2 2 0 2 2 0 2 2 0 0 0 1 2 2 2 2 1 1 2 0 0 0
 0 1 1 2 0 1 0 0 1 2 2 2 1 1 1 0 2 2 2 0 0 1 0 2 1 2 1 2 2 1 2 1 1 1 2 2 0
 1 2 1 2 1 1 0 1 0 2 0 0 0 1 0 1 1 1 0 1 0 0 0 1 2 0]
Out[7]:
<matplotlib.collections.PathCollection at 0x7fe029c9f400>
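A caveat before moving on: cluster indices are an arbitrary permutation of the true labels, so comparing the two printed arrays element by element is misleading. A permutation-invariant score such as the adjusted Rand index is a better check; a minimal sketch:

In [ ]:
from sklearn.metrics import adjusted_rand_score

# 1.0 means the clustering matches the true grouping exactly
# (up to relabeling); values near 0 indicate chance-level agreement
print(adjusted_rand_score(y, labels))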

Digits examples

Dimensionality reduction using (linear) PCA and (nonlinear) Isomap


In [45]:
from sklearn.datasets import load_digits
digits = load_digits()
print(len(digits.images))
fig = plt.figure(figsize=(6, 6))

# show the first 64 digits, each annotated with its true label
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.matshow(digits.images[i], cmap=plt.cm.binary)
    ax.text(0, 7, str(digits.target[i]))


1797

In [48]:
digits.data.shape


Out[48]:
(1797, 64)
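Each 8×8 image in digits.images is flattened into one 64-dimensional row of digits.data; a quick sanity check of that correspondence:

In [ ]:
# the raveled 8x8 image should equal the corresponding row of data
print(np.allclose(digits.images[0].ravel(), digits.data[0]))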

In [51]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
proj = pca.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], 30, digits.target, edgecolor='none')
plt.colorbar()


Out[51]:
<matplotlib.colorbar.Colorbar at 0x1103ef710>
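A two-component projection keeps only a fraction of the variance in the 64 input dimensions; explained_variance_ratio_ reports how much:

In [ ]:
# fraction of the total variance captured by each of the two components
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())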

In [52]:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5, n_components=2)
proj = iso.fit_transform(digits.data)
plt.scatter(proj[:, 1], proj[:, 0], 30, digits.target, edgecolor='none')


Out[52]:
<matplotlib.collections.PathCollection at 0x1104c5710>
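Compared with the linear PCA projection above, Isomap typically separates the digit classes more cleanly: it preserves approximate geodesic distances along the n_neighbors graph rather than straight-line distances, so it can unfold nonlinear structure that PCA cannot.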

Unsupervised learning

Notice that the training labels are never used: the clustering separates the digits into groups, but the cluster indices carry no digit meaning.


In [66]:
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(digits.data)  # fit() returns the estimator, not the labels

fig, axs = plt.subplots(2, 5, figsize=(8, 3))
axs = axs.flatten()
for n in range(10):
    # each cluster center is a 64-vector; reshape it back to an 8x8 image
    axs[n].imshow(kmeans.cluster_centers_[n].reshape(8, 8), cmap=plt.cm.gray_r)

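One way to check how well these unsupervised clusters line up with the true digits is to map each cluster to its majority digit and score the result (the labels are used only for evaluation, never for fitting); a sketch:

In [ ]:
mapped = np.zeros_like(labels)
for n in range(10):
    mask = labels == n
    # most common true digit within cluster n
    mapped[mask] = np.bincount(digits.target[mask]).argmax()

from sklearn.metrics import accuracy_score
print(accuracy_score(digits.target, mapped))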

Supervised learning

Here, we will use the digit labels to train a classifier and see how well it can reproduce them on held-out data.


In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

clf = GaussianNB()
clf.fit(X_train, y_train)

predicted = clf.predict(X_test)
expected = y_test

In [23]:
from sklearn import metrics
print(metrics.classification_report(expected, predicted))


             precision    recall  f1-score   support

          0       1.00      0.98      0.99        43
          1       0.72      0.77      0.75        44
          2       0.95      0.55      0.69        33
          3       0.86      0.55      0.67        44
          4       0.95      0.77      0.85        47
          5       0.84      0.88      0.86        52
          6       0.93      1.00      0.96        39
          7       0.60      0.95      0.74        37
          8       0.57      0.86      0.68        58
          9       0.97      0.60      0.74        53

avg / total       0.83      0.79      0.79       450


In [24]:
print(metrics.confusion_matrix(expected, predicted))


[[42  0  0  0  0  0  0  1  0  0]
 [ 0 34  0  0  0  0  1  2  6  1]
 [ 0  5 18  0  0  0  0  0 10  0]
 [ 0  0  0 24  0  4  0  2 14  0]
 [ 0  1  0  0 36  0  2  8  0  0]
 [ 0  0  0  1  1 46  0  3  1  0]
 [ 0  0  0  0  0  0 39  0  0  0]
 [ 0  0  0  0  1  1  0 35  0  0]
 [ 0  3  1  0  0  0  0  4 50  0]
 [ 0  4  0  3  0  4  0  3  7 32]]
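The raw counts are easier to read as an image, with correct predictions on the diagonal; note how several digits (especially 2, 3, and 9) are misread as 8:

In [ ]:
plt.matshow(metrics.confusion_matrix(expected, predicted), cmap=plt.cm.Blues)
plt.xlabel('predicted')
plt.ylabel('expected')
plt.colorbar()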
