See UMAP results on the basic scikit-learn 'digits' dataset of raw 8x8 images (following the UMAP documentation example)


In [ ]:
! pip install umap-learn

In [ ]:
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [ ]:
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [ ]:
digits = load_digits()
#print(digits.DESCR) # Documentation
#digits.images.shape # (1797, 8, 8)
digits.data.shape   # (1797, 64)
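
For orientation, each row of digits.data is one of the 8x8 greyscale images, flattened to 64 values. A quick look at a few of them (purely illustrative):

In [ ]:
# Peek at the first few digit images; each is one row of digits.data, reshaped to 8x8
fig, axes = plt.subplots(1, 8, figsize=(8, 1.5))
for ax, img, lbl in zip(axes, digits.images, digits.target):
    ax.imshow(img, cmap='gray_r')
    ax.set_title(str(lbl)); ax.axis('off')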

In [ ]:
import umap

In [ ]:
reducer = umap.UMAP(random_state=42)
reducer.fit(digits.data)  # <3

In [ ]:
embedding = reducer.transform(digits.data)
# Verify that the result of calling transform is
# identical to accessing the embedding_ attribute
assert np.all(embedding == reducer.embedding_)
embedding.shape
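
As an aside, train_test_split is imported above but otherwise unused; transform() is what lets us embed points held out of the fit. Here the reducer has already seen all of digits.data, so this is just a sketch of the API shape:

In [ ]:
# Sketch : transform() maps (possibly unseen) points into the existing 2-D embedding
X_tr, X_te = train_test_split(digits.data, random_state=42)
reducer.transform(X_te).shape  # (n_test, 2)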

In [ ]:
plt.scatter(embedding[:, 0], embedding[:, 1], c=digits.target, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))  # discrete colourbar : one bin per digit class
plt.title('UMAP projection of the Digits dataset', fontsize=24);

See UMAP results on raw MNIST images (first reduced by a fixed random linear projection)


In [ ]:
# ! pip install Pillow-SIMD  # Hmm : Missing jpeg library...

In [ ]:
import numpy as np

import torch
import torchvision.datasets
import torchvision.transforms

In [ ]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [ ]:
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

mnist_data = torchvision.datasets.MNIST('./data/mnist', download=True,
                                        train=True, transform=transform)
mnist_data_test = torchvision.datasets.MNIST('./data/mnist', download=True,
                                             train=False, transform=transform)

data_set = mnist_data
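
Side note : the Normalize constants above are the standard MNIST training-set mean and standard deviation. A quick sanity check, assuming a torchvision version where the MNIST dataset exposes the raw pixels as a .data tensor:

In [ ]:
# Sanity-check the Normalize constants against the raw training pixels
raw_pixels = mnist_data.data.float() / 255.  # .data is uint8 in [0, 255]
raw_pixels.mean().item(), raw_pixels.std().item()  # expect ~0.1307 and ~0.3081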

In [ ]:
batch_size=40  # Also works for CNN training below

data_loader = torch.utils.data.DataLoader(
  data_set, batch_size=batch_size, shuffle=False,  # keep a fixed order, so indices line up across passes
  num_workers=0, 
)

In [ ]:
proj_dim = 50

proj = torch.nn.Linear( 28*28, proj_dim )  # nn.Linear's default init scales the weights 'better' than plain torch.randn()
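
The comment above refers to nn.Linear's default initialisation, which scales the weights by the fan-in. A minimal sketch of the hand-rolled alternative it avoids (W and x_demo are throwaway names):

In [ ]:
# Hand-rolled random projection : torch.randn() needs explicit 1/sqrt(fan_in) scaling
#   to keep the projected outputs at roughly unit scale
W = torch.randn(proj_dim, 28*28) / np.sqrt(28*28)
x_demo = torch.randn(4, 28*28)
(x_demo @ W.t()).std()  # roughly 1, similar to nn.Linear's output scale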

In [ ]:
# Keep a flattened copy of the basic dataset as numpy arrays
# (relies on the loader's fixed ordering, so rows line up across passes)
xs, ys = np.zeros( (len(data_set), 28*28) ), np.zeros( (len(data_set),) )
for i_batch, (x_batch, y_batch) in enumerate(data_loader):
    x_batch_flat = x_batch.view(-1, 28*28)
    xs[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_batch_flat.detach()
    ys[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch.detach()

In [ ]:
# This maps the result of the projection directly into the results numpy arrays
def map_data_via_proj(ds, d_loader):
    x_arr, y_arr = np.zeros( (len(ds), proj_dim) ), np.zeros( (len(ds),) )
    with torch.no_grad():  # the projection is fixed : no gradients needed
        for i_batch, (x_batch, y_batch) in enumerate(d_loader):
            x_batch_flat = x_batch.view(-1, 28*28)
            x_proj = proj( x_batch_flat )
            x_arr[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_proj
            y_arr[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch
    return x_arr, y_arr
    
xs_proj, ys_proj = map_data_via_proj(data_set, data_loader)

# Loader for the test set (also re-used for evaluating the CNN further below)
test_loader = torch.utils.data.DataLoader(
  mnist_data_test, batch_size=batch_size, num_workers=0,
)
xt_proj, yt_proj = map_data_via_proj(mnist_data_test, test_loader)

In [ ]:
# Spot-check labels near the end of the array, to confirm the whole training set was copied
base=59990; ys[base+0:base+10]

In [ ]:
import time
import umap

In [ ]:
t0 = time.time()
reducer_proj = umap.UMAP(random_state=42)
reducer_proj.fit(xs_proj) 
print("Fitting %d-D took %d secs" % (proj_dim, time.time()-t0,) )  
# 400d : 74 secs
# 100d : 76 secs 
#  50d : 69 secs
#  35d : 68 secs
#  20d : 68 secs

In [ ]:
embedding_proj = reducer_proj.transform(xs_proj)
embedding_proj.shape

In [ ]:
plt.figure(figsize=(8,8))
plt.scatter(embedding_proj[:, 0], embedding_proj[:, 1], c=ys_proj, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP of the %d-D projection of raw MNIST images' % (proj_dim,), fontsize=24);

Train a CNN on the dataset (with labels)


In [ ]:
# OK, so now let's build a CNN to classify regular MNIST,
#   with the last hidden layer being 50-D, and re-run UMAP on that result...

# Shout out to https://github.com/floydhub/mnist/blob/master/main.py for the
#   basic layout of the model - updated significantly here for PyTorch 0.4.1

In [ ]:
class BasicNet(torch.nn.Module):
    def __init__(self):
        super(BasicNet, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5)
        self.conv1_mp = torch.nn.MaxPool2d(2)
        self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_mp = torch.nn.MaxPool2d(2)
        self.conv2_drop = torch.nn.Dropout2d()
        self.fc1 = torch.nn.Linear(320, 50)  # 320 = 20 channels * 4 * 4 after the conv/pool stack
        self.drop = torch.nn.Dropout()
        self.fc2 = torch.nn.Linear(50, 10)
        self.logsoftmax = torch.nn.LogSoftmax(dim=1)
        self.relu = torch.nn.ReLU()

    def forward(self, x, middle_layer=False):
        x = self.relu(self.conv1_mp(self.conv1(x)))
        x = self.relu(self.conv2_mp(self.conv2_drop(self.conv2(x))))
        x = x.view(-1, 320) # Flatten
        x = self.fc1(x)
        if middle_layer:
            return x
        x = self.drop(self.relu(x))
        x = self.fc2(x)
        return self.logsoftmax(x)
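
A quick shape check with a throwaway instance confirms the 320 in fc1 and what the middle_layer flag returns (assuming 1x28x28 inputs):

In [ ]:
# Throwaway instance just to check tensor shapes through the network
_net = BasicNet()
print(_net(torch.zeros(2, 1, 28, 28)).shape)                     # torch.Size([2, 10]) : log-probabilities
print(_net(torch.zeros(2, 1, 28, 28), middle_layer=True).shape)  # torch.Size([2, 50]) : hidden features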

In [ ]:
model = BasicNet().to(device)

In [ ]:
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [ ]:
train_loader = torch.utils.data.DataLoader(
  data_set, batch_size=batch_size, #shuffle=True,
  num_workers=0, 
)

In [ ]:
model.train()
loss_fn = torch.nn.NLLLoss()

for epoch in range(5):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % (10000//batch_size) == 0:
            print('Train Epoch: {} [{:5d}/{:5d} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [ ]:
# Use the test dataset (via the test_loader created above) to evaluate our model
model.eval()
test_loss, correct = 0., 0
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        # accumulate the un-averaged batch loss, so the division below gives a true per-sample mean
        test_loss += torch.nn.functional.nll_loss(output, target, reduction='sum').item()
        pred = output.max(1, keepdim=True)[1] # index of the max log-probability
        correct += pred.eq(target.view_as(pred)).cpu().sum().item()

test_loss /= len(test_loader.dataset)
print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    correct * 100. / len(test_loader.dataset)))

Do UMAP on the final hidden layer


In [ ]:
# This maps the CNN middle-layer activations directly into the results numpy arrays
def map_data_via_cnn(ds, d_loader):
    x_arr, y_arr = np.zeros( (len(ds), proj_dim) ), np.zeros( (len(ds),) )  # fc1 output size == proj_dim == 50
    model.eval()
    with torch.no_grad():  # inference only : no gradients needed
        for i_batch, (x_batch, y_batch) in enumerate(d_loader):
            data = x_batch.to(device)
            output = model(data, middle_layer=True)
            x_cnn = output.cpu().numpy()
            x_arr[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_cnn
            y_arr[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch
    return x_arr, y_arr

xs_cnn, ys_cnn = map_data_via_cnn(data_set, data_loader)
xt_cnn, yt_cnn = map_data_via_cnn(mnist_data_test, test_loader)

In [ ]:
t0 = time.time()
reducer_cnn = umap.UMAP(random_state=42)
reducer_cnn.fit(xs_cnn) 
print("Fitting %d-D to CNN middle layer took %d secs" % (proj_dim, time.time()-t0,) )  
# 50d : 70 secs

In [ ]:
embedding_cnn = reducer_cnn.transform(xs_cnn)
embedding_cnn.shape

In [ ]:
plt.figure(figsize=(8,8))
# Other potential colormaps : Spectral, tab10, tab20, tab20b?, terrain, jet
plt.scatter(embedding_cnn[:, 0], embedding_cnn[:, 1], c=ys_cnn, cmap='Spectral', s=5) 
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP of the %d-D CNN middle layer for MNIST' % (proj_dim,), fontsize=24);

Idea

  • Start with a (small) set of known labels from the training set
    • Pass them through the fixed straight UMAP, and the CNN UMAP
    • Work out test-set error based on nearest neighbours
    • Figure out which training set member(s) would be best to know the labels of...
      • Maybe a Gaussian-process / Bayesian optimisation thing?
  • Later (soon after), train the CNN until it gets 'good' training set accuracy

In [ ]:
#np.sum(ys==ys_cnn)  These are identical => data_loader arrives in consistent order

In [ ]:
# First, get a small list of indices of data from each class
labelled_idx = []
for c in range(10):  # num_classes
    #idx_c = np.where(ys==c)[0][0]
    idx_c = np.nonzero(ys==c)[0][0]
    labelled_idx.append(idx_c)
labelled_idx, ys[labelled_idx]

In [ ]:
# Now do a KNN-style classification across a test set
#   http://scikit-learn.org/stable/modules/neighbors.html#finding-the-nearest-neighbors
from sklearn.neighbors import NearestNeighbors

def test_knn(reducer, idx, x_arr, y_arr, x_test, y_test):
    # Convert the labelled points to the embedding, and prepare the knn
    x_known, y_known = x_arr[idx], y_arr[idx]
    emb_known = reducer.transform(x_known)
    #print(emb_known, y_known)  # Makes sense
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(emb_known)
    
    # Go through test set:
    emb_test = reducer.transform(x_test)
    near_dists, near_idxs = nbrs.kneighbors(emb_test)  # distances/indices to the known labelled points
    #near_dist = near_dists[:,0]
    near_idx = near_idxs[:,0]
    #print(near_dist, near_idx)
    
    # So let's get the classes corresponding to the nearest neighbours
    classes = y_known[near_idx]
    #print(y_test, classes.tolist())
    
    # Count the correct classifications
    accuracy = np.sum(y_test==classes)/classes.shape[0]
    
    return accuracy

In [ ]:
#test_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, xt_proj[0:10], yt_proj[0:10])
test_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, xt_proj, yt_proj)

In [ ]:
test_knn(reducer_cnn, labelled_idx, xs_cnn, ys_cnn, xt_cnn, yt_cnn)

In [ ]:
def best_next_knn(reducer, idx, x_arr, y_arr):
    # Convert the labelled points to the embedding, and prepare the knn
    x_known, y_known = x_arr[idx], y_arr[idx]
    emb_known = reducer.transform(x_known)
    #print(emb_known, y_known)  # Makes sense
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(emb_known)
    
    # Go through training set:
    emb_all = reducer.transform(x_arr)
    near_dists, near_idxs = nbrs.kneighbors(emb_all)  # distances/indices to the known labelled points
    #near_dist = near_dists[:,0]
    #near_idx = near_idxs[:,0]
    #print(near_dist, near_idx)
    
    # We are interested in : 
    #   Points where dist[0]>0 (i.e. not known), and class[0]!=class[1] (may be confused)
    #     ?ranked by  ( dist[1] - dist[0] )  (i.e. near a boundary)
    #     ?ranked by  ( dist[0] )  (i.e. far from their nearest determiner)
    
    near_class0 = y_known[ near_idxs[:,0] ]
    near_class1 = y_known[ near_idxs[:,1] ]
    
    importance_if_different = np.where(near_class0 == near_class1, 0., 
                                       #near_dists[:,1] - near_dists[:,0])
                                       near_dists[:,0])  # Closest point is furthest away
    
    # Now return the indices that are most important to know
    important_index = np.argmax(importance_if_different)
    return important_index

    #important_indices = np.argsort(importance_if_different)[-n:]
    #return important_indices

def best_next_knns(reducer, idx, x_arr, y_arr, n=10):
    idx_new = idx.copy()
    
    for i in range(n):
        idx_want = best_next_knn(reducer, idx_new, x_arr, y_arr)  # use the passed-in arguments, not globals
        # let's assume we find out this label now...
        idx_new.append(idx_want)
    
    return idx_new

important_idx = best_next_knns(reducer_proj, labelled_idx, xs_proj, ys_proj, n=10)
important_idx

In [ ]:
plt.figure(figsize=(8,8))
plt.scatter(embedding_proj[:, 0], embedding_proj[:, 1], c=ys_proj, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))

plt.scatter(embedding_proj[labelled_idx, 0], embedding_proj[labelled_idx, 1], marker='^', s=200)  # known labels
extra_idx = important_idx[len(labelled_idx):]
plt.scatter(embedding_proj[extra_idx, 0], embedding_proj[extra_idx, 1], marker='v', s=200)  # suggested next labels

plt.title('UMAP of the %d-D projection of raw MNIST images, with labelled points marked' % (proj_dim,), fontsize=24);

In [ ]:
labelled_idx = important_idx  # accept the suggestions : re-running the cells above grows the labelled set further

Further Ideas

  • Investigate what the UMAP 'reducer' and 'embeddings' are doing
    • What does the embedding represent?
    • Is the embedding usable in its own right?
  • Train on MNIST but leave several digit classes out
    • Look at where they would lie in UMAP space

References

  • UMAP documentation : https://umap-learn.readthedocs.io/
  • floydhub MNIST example : https://github.com/floydhub/mnist/blob/master/main.py
  • scikit-learn nearest neighbours : http://scikit-learn.org/stable/modules/neighbors.html#finding-the-nearest-neighbors