In [ ]:
# Record the Python version used for this run (provenance for reproducibility).
import sys
print(sys.version)

In [ ]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import time

import pandas as pd
import seaborn as sns

In [ ]:
# Make the project's code directory importable.  The path is relative, so the
# kernel must be started from the notebooks directory for these imports to work.
import sys
sys.path.append('../code/')

from mnist_helpers import mnist_training, mnist_testing
from k_means import KMeans
from pca import Pca, make_image

In [ ]:
! ls -l ./data/PCA_training_data.pickle

In [ ]:
import pickle
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles you produced yourself (presumably this one comes from an earlier
# PCA notebook run; confirm its provenance).
pca_training = pickle.load(file=open('./data/PCA_training_data.pickle', "rb"))

In [ ]:
# Load raw MNIST (unshuffled) plus precomputed 50-component PCA projections
# of the same data, for both the training and the test split.
X_train_untransformed, y_train = mnist_training(shuffled=False) 
X_train = np.load('../notebooks/data/X_transformed_by_50_components.npy')
print("X_train shape: {}.  y_train shape: {}".format(X_train.shape, y_train.shape))

X_test_untransformed, y_test = mnist_testing(shuffled=False)
X_test = np.load('../notebooks/data/X_test_transformed_by_50_components.npy')
print("X_test shape: {}.  y_test shape: {}".format(X_test.shape, y_test.shape))

In [ ]:
# Sanity check: both the raw and the PCA-transformed training sets should
# cover the full 60,000-image MNIST training split.  Messages added so a
# failure reports the offending size instead of a bare AssertionError.
N_points = 60000
assert X_train_untransformed.shape[0] == N_points, \
    "unexpected raw training-set size: {}".format(X_train_untransformed.shape[0])
assert X_train.shape[0] == N_points, \
    "unexpected transformed training-set size: {}".format(X_train.shape[0])

Q 5.1:

k = 16, MNIST data transformed by first 50 PCA components.


In [ ]:
# Fit k-means with k=16 on the PCA-transformed training data.
# NOTE(review): with max_iter=500 on 60k points this run can be slow;
# consider caching the fitted model to make Restart-and-Run-All feasible.
km_16 = KMeans(k=16, train_X=X_train, train_y=y_train, 
            pca_obj=pca_training,
            max_iter = 500, 
            test_X=X_test, test_y=y_test,
            verbose=False)
km_16.run()

(a) The squared reconstruction error vs iteration number.


In [ ]:
# Plot squared reconstruction error vs. iteration (5.1a) and save the figure.
km_16_reconstruction_error = km_16.plot_squared_reconstruction_error()
km_16_reconstruction_error.savefig('../figures/k_means/k16_reconstruction_error.pdf',
                                  bbox_inches='tight')

In [ ]:
# NOTE(review): this cell is an exact duplicate of the previous one -- safe to delete.
km_16_reconstruction_error = km_16.plot_squared_reconstruction_error()
km_16_reconstruction_error.savefig('../figures/k_means/k16_reconstruction_error.pdf',
                                  bbox_inches='tight')

In [ ]:
# Show the last two rows of the per-iteration results table.
km_16.results_df.tail(2)

In [ ]:
# Normalized squared reconstruction error vs. iteration.
# Fixed variable-name typo: "nromalized" -> "normalized".
km_16_reconstruction_error_normalized = \
    km_16.plot_squared_reconstruction_error_normalized()

In [ ]:
# Plot the 0/1 classification loss per iteration and save the figure.
km_16_0_1_loss = km_16.plot_0_1_loss()
km_16_0_1_loss.savefig('../figures/k_means/k16_loss_01.pdf',
                       bbox_inches='tight')

(b) Let us say that the number of assignments for a mean is the number of points assigned to that mean. Plot the number of assignments for each center in descending order.


In [ ]:
# 5.1(b): bar plot of how many points were assigned to each of the 16 centers.
km_16_assignment_bars = km_16.plot_num_assignments_for_each_center()
km_16_assignment_bars.savefig('../figures/k_means/k16_assignment_bars.pdf',
                             bbox_inches='tight')

(c) Visualize the 16 centers that you learned, and display them in an order that corresponds to the frequency with which they were assigned (if you use a grid, just describe the ordering).


In [ ]:
km_16.visualize_center(km_16.center_coordinates[0])

In [ ]:
km_16.visualize_n_centers(16, top=True)

Q 5.2

k = 250, MNIST data transformed by first 50 PCA components.


In [ ]:
# Fit k-means with k=250 on the same PCA-transformed data (Q 5.2).
# NOTE(review): substantially slower than the k=16 run; consider caching.
km_250 = KMeans(k=250, train_X=X_train, train_y=y_train, 
            pca_obj=pca_training,
            max_iter = 500,
            test_X=X_test, test_y=y_test,
            verbose=False)
km_250.run()

(a) The squared reconstruction error vs iteration number.


In [ ]:
# 5.2(a): squared reconstruction error vs. iteration for k=250; save figure.
km_250_reconstruction = km_250.plot_squared_reconstruction_error()
km_250_reconstruction.savefig('../figures/k_means/k250_reconstruction_error.pdf',
                             bbox_inches='tight')

In [ ]:
# Final iteration's fit statistics, transposed for readability.
km_250.results_df.tail(1).T

(b) Let us say that the number of assignments for a mean is the number of points assigned to that mean. Plot the number of assignments for each center in descending order.


In [ ]:
# 5.2(b): bar plot of assignment counts for each of the 250 centers.
km_250_assignment_bars = km_250.plot_num_assignments_for_each_center()
km_250_assignment_bars.savefig('../figures/k_means/k250_assignment_bars.pdf',
                              bbox_inches='tight')

(c) Visualize 16 of these centers, chosen randomly. Display them in an order that corresponds to the frequency with which they were assigned.


In [ ]:
# 5.2(c): 16 centers with top=True -- presumably the most-frequently-assigned
# ones (confirm in KMeans.visualize_n_centers).
km_250.visualize_n_centers(16, top=True)

In [ ]:
# Same view with top=False -- presumably the least-frequently-assigned centers.
km_250.visualize_n_centers(16, top=False)

In [ ]:
# just for fun
km_250_loss_01 = km_250.plot_0_1_loss()

5.2 Classification with K-means

  1. (4 points) For K = 16, what are your training and test 0/1 losses?

In [ ]:
import copy

In [ ]:
def assess_test_data(self):
    """Evaluate a fitted KMeans model on its held-out test split.

    Shallow-copies the trained model, points the copy at the test data,
    re-derives cluster assignments and predicted labels, and records fit
    statistics.  The copy is stashed on ``self.test_model`` for later
    inspection; the transposed results table is printed and returned.

    NOTE(review): ``copy.copy`` is shallow, so array/center attributes are
    shared with the trained model -- this assumes the re-run methods below
    rebind attributes rather than mutate them in place; confirm in KMeans.
    """
    evaluator = copy.copy(self)

    # Swap in the held-out data and its dimensions.
    evaluator.X = self.test_X
    evaluator.y = self.test_y
    evaluator.N, evaluator.d = evaluator.X.shape

    # Clear cached training-time results so they are recomputed on test data.
    evaluator.results_df = None
    # todo: rename
    evaluator.results_df_cluster_assignment_counts = None

    # Re-run the assignment / labeling / bookkeeping pipeline on the test set.
    evaluator.set_point_assignments()
    evaluator.set_centers_classes()
    evaluator.set_predicted_labels()
    evaluator.record_count_of_assignments_to_each_mean()
    evaluator.record_fit_statistics()

    print("test results:")
    print(evaluator.results_df.T)

    self.test_model = evaluator
    return evaluator.results_df.T

In [ ]:
# Evaluate k=16 on the test split using the prototype function defined above.
km_16_test_results = assess_test_data(km_16)

In [ ]:
# LaTeX table of the test-set results, for the report.
print(km_16_test_results.to_latex())

In [ ]:
km_16.assess_test_data().to
  1. (4 points) For K = 250, what are your training and test 0/1 losses?

In [ ]:
# Evaluate k=250 on the test split via the KMeans method.
km_250.assess_test_data()

In [ ]:
# NOTE(review): this repeats the evaluation above via the prototype function;
# one of the two cells is redundant.  LaTeX table emitted for the report.
km_250_test_results = assess_test_data(km_250)
print("")
print(km_250_test_results.to_latex())

Check out centers that appear to be a mix of digits.

For k = 16, center number 5 appears to be a blend of 5 and 8. What are its true labels?


In [ ]:
# Turn on verbose output for the follow-up inspection below.
km_16.verbose = True

In [ ]:
# Largest clusters first; inspect the six biggest.
km_16.clusters_by_num_in_cluster().head(6)

In [ ]:
# Re-derive each center's majority label -- with verbose on, this presumably
# prints the per-center label mix (confirm in KMeans.set_centers_classes).
km_16.set_centers_classes()

In [ ]: