In [ ]:
! pip install umap-learn
In [ ]:
import numpy as np
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
In [ ]:
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})
In [ ]:
digits = load_digits()
#print(digits.DESCR) # Documentation
#digits.images.shape # (1797, 8, 8)
digits.data.shape # (1797, 64)
In [ ]:
import umap
In [ ]:
reducer = umap.UMAP(random_state=42)
reducer.fit(digits.data) # <3
In [ ]:
embedding = reducer.transform(digits.data)
# Verify that the result of calling transform is
# idenitical to accessing the embedding_ attribute
assert(np.all(embedding == reducer.embedding_))
embedding.shape
In [ ]:
plt.scatter(embedding[:, 0], embedding[:, 1], c=digits.target, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the Digits dataset', fontsize=24);
In [ ]:
In [ ]:
# ! pip install Pillow-SIMD # Hmm : Missing jpeg library...
In [ ]:
import numpy as np
import torch
import torchvision.datasets
In [ ]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
In [ ]:
transform = torchvision.transforms.Compose([
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])
mnist_data = torchvision.datasets.MNIST('./data/mnist', download=True,
train=True, transform=transform, )
mnist_data_test = torchvision.datasets.MNIST('./data/mnist', download=True,
train=False, transform=transform, )
data_set = mnist_data
In [ ]:
batch_size=40 # Also works for CNN training below
data_loader = torch.utils.data.DataLoader(
data_set, batch_size=batch_size, #shuffle=True,
num_workers=0,
)
In [ ]:
proj_dim = 50
proj = torch.nn.Linear( 28*28, proj_dim ) # Does the initialisation 'better' than torch.randn()
In [ ]:
# Keep a copy of the basic dataset
xs, ys = np.zeros( (len(data_set), 28*28) ), np.zeros( (len(data_set),) )
for i_batch, (x_batch, y_batch) in enumerate(data_loader):
x_batch_flat = x_batch.view(-1, 28*28)
xs[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_batch_flat.detach()
ys[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch.detach()
In [ ]:
# This maps the result of the projection directly into the results numpy arrays
def map_data_via_proj(ds, d_loader):
x_arr, y_arr = np.zeros( (len(ds), proj_dim) ), np.zeros( (len(ds),) )
for i_batch, (x_batch, y_batch) in enumerate(d_loader):
x_batch_flat = x_batch.view(-1, 28*28)
x_proj = proj( x_batch_flat )
#print(x_proj.size()); break
x_arr[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_proj.detach()
y_arr[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch.detach()
return x_arr, y_arr
xs_proj, ys_proj = map_data_via_proj(data_set, data_loader)
xt_proj, yt_proj = map_data_via_proj(mnist_data_test, test_loader)
In [ ]:
#i_batch*batch_size
base=59990; ys[base+0:base+10]
In [ ]:
import time
import umap
In [ ]:
t0 = time.time()
reducer_proj = umap.UMAP(random_state=42)
reducer_proj.fit(xs_proj)
print("Fitting %d-D took %d secs" % (proj_dim, time.time()-t0,) )
# 400d : 74 secs
# 100d : 76 secs
# 50d : 69 secs
# 35d : 68 secs
# 20d : 68 secs
In [ ]:
embedding_proj = reducer_proj.transform(xs_proj)
embedding_proj.shape
In [ ]:
plt.figure(figsize=(8,8))
plt.scatter(embedding_proj[:, 0], embedding_proj[:, 1], c=ys_proj, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP %d-D projection of raw MNIST images' % (proj_dim,), fontsize=24);
In [ ]:
# Ok, so now let's build a CNN to classify regular MNIST
# with last layer being 50D, and re-run the UMAP on that result...
# Shout out to https://github.com/floydhub/mnist/blob/master/main.py for
# basic layout of model - updated significantly here for 0.4.1
In [ ]:
class BasicNet(torch.nn.Module):
def __init__(self):
super(BasicNet, self).__init__()
self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5)
self.conv1_mp = torch.nn.MaxPool2d(2)
self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)
self.conv2_mp = torch.nn.MaxPool2d(2)
self.conv2_drop = torch.nn.Dropout2d()
self.fc1 = torch.nn.Linear(320, 50)
self.drop = torch.nn.Dropout()
self.fc2 = torch.nn.Linear(50, 10)
self.logsoftmax = torch.nn.LogSoftmax(dim=1)
self.relu = torch.nn.ReLU()
def forward(self, x, middle_layer=False):
x = self.relu(self.conv1_mp(self.conv1(x)))
x = self.relu(self.conv2_mp(self.conv2_drop(self.conv2(x))))
x = x.view(-1, 320) # Flatten
x = self.fc1(x)
if middle_layer:
return x
x = self.drop(self.relu(x))
x = self.fc2(x)
return self.logsoftmax(x)
In [ ]:
model = BasicNet().to(device)
In [ ]:
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
In [ ]:
train_loader = torch.utils.data.DataLoader(
data_set, batch_size=batch_size, #shuffle=True,
num_workers=0,
)
In [ ]:
model.train()
loss_fn = torch.nn.NLLLoss()
for epoch in range(5):
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % (10000//batch_size) == 0:
print('Train Epoch: {} [{:5d}/{:5d} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
In [ ]:
# Just use the test dataset to evaluate our model
test_loader = torch.utils.data.DataLoader(
mnist_data_test, batch_size=batch_size, #shuffle=True,
num_workers=0,
)
In [ ]:
model.eval()
test_loss, correct = 0., 0
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += loss_fn(output, target).item() # sum up batch loss
pred = output.data.max(1, keepdim=True)[1] # get the indices of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum().numpy()
test_loss /= len(test_loader.dataset)
print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
correct * 100. / len(test_loader.dataset)))
In [ ]:
# This maps the result of the projection directly into the results numpy arrays
def map_data_via_cnn(ds, d_loader):
x_arr, y_arr = np.zeros( (len(ds), proj_dim) ), np.zeros( (len(ds),) )
model.eval()
for i_batch, (x_batch, y_batch) in enumerate(d_loader):
data = x_batch.to(device)
output = model(data, middle_layer=True)
x_cnn = output.cpu().detach().numpy()
x_arr[i_batch*batch_size:(i_batch+1)*batch_size, :] = x_cnn
y_arr[i_batch*batch_size:(i_batch+1)*batch_size] = y_batch.detach()
return x_arr, y_arr
xs_cnn, ys_cnn = map_data_via_cnn(data_set, data_loader)
xt_cnn, yt_cnn = map_data_via_cnn(mnist_data_test, test_loader)
In [ ]:
t0 = time.time()
reducer_cnn = umap.UMAP(random_state=42)
reducer_cnn.fit(xs_cnn)
print("Fitting %d-D to CNN middle layer took %d secs" % (proj_dim, time.time()-t0,) )
# 50d : 70 secs
In [ ]:
embedding_cnn = reducer_cnn.transform(xs_cnn)
embedding_cnn.shape
In [ ]:
plt.figure(figsize=(8,8))
# Other potential colormaps : Spectral, tab10, tab20, tab20b?, terrain, jet
plt.scatter(embedding_cnn[:, 0], embedding_cnn[:, 1], c=ys_cnn, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP %d-D CNN middle layer for MNIST' % (proj_dim,), fontsize=24);
In [ ]:
#np.sum(ys==ys_cnn) These are identical => data_loader arrives in consistent order
In [ ]:
# First, get a small list of indices of data from each class
labelled_idx = []
for c in range(10): # num_classes
#idx_c = np.where(ys==c)[0][0]
idx_c = np.nonzero(ys==c)[0][0]
labelled_idx.append(idx_c)
labelled_idx, ys[labelled_idx]
In [ ]:
# Now do a KNN-style classification across a test set
# http://scikit-learn.org/stable/modules/neighbors.html#finding-the-nearest-neighbors
from sklearn.neighbors import NearestNeighbors
def test_knn(reducer, idx, x_arr, y_arr, x_test, y_test):
# Convert the labelled points to the embedding, and prepare the knn
x_known, y_known = x_arr[idx], y_arr[idx]
emb_known = reducer.transform(x_known)
#print(emb_known, y_known) # Makes sense
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(emb_known)
# Go through test set:
emb_test = reducer.transform(x_test)
near_dists, near_idxs = nbrs.kneighbors(emb_test) # measures to known label points
#near_dist = near_dists[:,0]
near_idx = near_idxs[:,0]
#print(near_dist, near_idx)
# So let's get the classes corresponding to the nearest neighbours
classes = y_known[near_idx]
#print(y_test, classes.tolist())
# Count the correct classifications
accuracy = np.sum(y_test==classes)/classes.shape[0]
return accuracy
In [ ]:
#test_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, xt_proj[0:10], yt_proj[0:10])
test_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, xt_proj, yt_proj)
In [ ]:
test_knn(reducer_cnn, labelled_idx, xs_cnn, ys_cnn, xt_cnn, yt_cnn)
In [ ]:
def best_next_knn(reducer, idx, x_arr, y_arr):
# Convert the labelled points to the embedding, and prepare the knn
x_known, y_known = x_arr[idx], y_arr[idx]
emb_known = reducer.transform(x_known)
#print(emb_known, y_known) # Makes sense
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(emb_known)
# Go through training set:
emb_all = reducer.transform(x_arr)
near_dists, near_idxs = nbrs.kneighbors(emb_all) # measures to known label points
#near_dist = near_dists[:,0]
#near_idx = near_idxs[:,0]
#print(near_dist, near_idx)
# We are interested in :
# Points where dist[0]>0 (i.e. not known), and class[0]!=class[1] (may be confused)
# ?ranked by ( dist[1] - dist[0] ) (i.e. near a boundary)
# ?ranked by ( dist[0] ) (i.e. far from their nearest determiner)
near_class0 = y_known[ near_idxs[:,0] ]
near_class1 = y_known[ near_idxs[:,1] ]
importance_if_different = np.where(near_class0 == near_class1, 0.,
#near_dists[:,1] - near_dists[:,0])
near_dists[:,0]) # Closest point is furthest away
# Now return the indices that are most important to know
important_index = np.argmax(importance_if_different)
return important_index
#important_indices = np.argsort(importance_if_different)[-n:]
#return important_indices
def best_next_knns(reducer, idx, x_arr, y_arr, n=10):
idx_new = idx.copy()
for i in range(n):
idx_want = best_next_knn(reducer_proj, idx_new, xs_proj, ys_proj)
# let's assume we find out this label now...
idx_new.append(idx_want)
return idx_new
important_idx = best_next_knns(reducer_proj, labelled_idx, xs_proj, ys_proj, n=10)
important_idx
In [ ]:
#important_idx = best_next_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, n=8)
#important_idx = best_next_knn(reducer_proj, labelled_idx, xs_proj, ys_proj, n=8)
In [ ]:
plt.figure(figsize=(8,8))
plt.scatter(embedding_proj[:, 0], embedding_proj[:, 1], c=ys_proj, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.scatter(embedding_proj[labelled_idx, 0], embedding_proj[labelled_idx, 1], marker='^', s=200)
extra_idx = important_idx[len(labelled_idx):]
plt.scatter(embedding_proj[extra_idx, 0], embedding_proj[extra_idx, 1], marker='v', s=200)
plt.title('UMAP %d-D projection of raw MNIST images' % (proj_dim,), fontsize=24);
In [ ]:
labelled_idx = important_idx
In [ ]: