imports


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

import env
from ml_mnist.knn import KNNClassifier
from ml_mnist.gp import GPClassifier
from ml_mnist.logreg import LogisticRegression
from ml_mnist.nn import NNClassifier, RBM
from ml_mnist.nn.layers import FullyConnected, Activation
from ml_mnist.nn.activations import leaky_relu
from ml_mnist.decomposition import PCA

from ml_mnist.preprocessing import StandardScaler
from ml_mnist.feature_selection import VarianceThreshold
from ml_mnist.model_selection import TrainTestSplitter, GridSearchCV
from ml_mnist.augmentation import RandomAugmentator
from ml_mnist.metrics import (accuracy_score,
                               zero_one_loss, 
                               confusion_matrix, 
                               plot_confusion_matrix)
from ml_mnist.utils import (one_hot, unhot,
                             Stopwatch, RNG, 
                             plot_greyscale_image, plot_rbm_filters)
from ml_mnist.utils.dataset import load_mnist
from ml_mnist.utils.read_write import load_model

%load_ext autoreload
%autoreload 2
%matplotlib inline

load dataset


In [2]:
X, y = load_mnist(mode='train', path='data/')
X.shape


Out[2]:
(60000, 784)

In [3]:
plot_greyscale_image(X[0], title="Label is {0}".format(y[0]));



In [4]:
plot_greyscale_image(X[42], title="Label is {0}".format(y[42]));


k-NN

load small subset of dataset


In [ ]:
def load_small(n_samples=5000):
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255.
    X_scaled = VarianceThreshold(0.1).fit_transform(X_scaled)
    X_scaled = StandardScaler(copy=False).fit_transform(X_scaled)
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    indices, _ = tts.split(y, train_ratio=n_samples/60000., stratify=True)
    return X_scaled[indices], y[indices] # 5000 -> 4994 training samples

Approach #1: remove (almost) constant features + standardize + (non-kernelized) k-NN

Scale data to [0, 1] range


In [6]:
X_scaled = X / 255.
print X_scaled.min(), X_scaled.max()
print X_scaled.shape


0.0 1.0
(60000, 784)

In [7]:
sns.heatmap(X_scaled[100:124, 100:124]); # lots of zeros ofc


Remove features with low variance (784 -> 444)


In [8]:
X_scaled = VarianceThreshold(0.1).fit_transform(X_scaled)
print X_scaled.min(), X_scaled.max()
print X_scaled.shape


0.0 1.0
(60000, 444)

Now perform mean-std standardization


In [9]:
X_scaled = StandardScaler(copy=False).fit_transform(X_scaled)
print X_scaled.min(), X_scaled.max()
print X_scaled.shape


-1.27420789208 9.7871062278
(60000, 444)

In [10]:
sns.heatmap(X_scaled[100:124, 100:124], cmap='RdYlGn'); # more interesting


Some benchmarks

As you can see, the brute-force algorithm needs no training time at all but has a longer prediction time compared to the k-d tree. The difference grows with the number of training samples, and it should become even larger once we use far fewer features after PCA (currently 444).


In [11]:
knn = KNNClassifier(algorithm='brute')
knn


Out[11]:
KNNClassifier(algorithm='brute', k=5, kd_tree_=None, kernel=None,
       kernel_params={}, leaf_size=30, metric=None, p=2.0,
       weights='uniform')

In [12]:
with Stopwatch(verbose=True) as s:
    knn.fit(X_scaled[:1000], y[:1000])


Elapsed time: 0.000 sec

In [13]:
with Stopwatch(True) as s:
    y_pred = knn.predict(X_scaled[1000:1100])
print zero_one_loss(y_pred, y[1000:1100])


Elapsed time: 2.041 sec
0.21

In [14]:
knn2 = KNNClassifier(algorithm='kd_tree', leaf_size=10)
knn2


Out[14]:
KNNClassifier(algorithm='kd_tree', k=5, kd_tree_=None, kernel=None,
       kernel_params={}, leaf_size=10, metric=None, p=2.0,
       weights='uniform')

In [15]:
with Stopwatch(True) as s:
    knn2.fit(X_scaled[:1000], y[:1000])


Elapsed time: 0.438 sec

In [16]:
with Stopwatch(True) as s:
    y_pred = knn2.predict(X_scaled[1000:1100])
print zero_one_loss(y_pred, y[1000:1100])


Elapsed time: 0.455 sec
0.21

GridSearchCV (uses stratified K-Fold CV)

This class will be used for convenient hyper-parameter grid search for simple models. Its design, like that of many other classes here, is inspired by scikit-learn, yet it has some extensions (such as model saving, which is supported for all models here, and the ability to specify the order in which parameters are explored).

One more feature is the refit parameter, which controls the order in which parameter combinations and train/test splits are traversed.

If set to True, then for each combination of parameters we refit the model on every train/test split, so the mean accuracy score for a given set of parameters becomes available as soon as possible. This makes sense for ML algorithms (typically parametric) with an explicit training procedure.

If set to False, then for each split we fit the model only once and then evaluate it on all combinations of parameters. This makes sense, and yields results significantly faster, for (typically non-parametric) models such as k-NN in particular.
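The difference is essentially the order of the two loops. Below is a minimal self-contained sketch of the idea (not the actual GridSearchCV internals; the split and parameter placeholders exist only to make it runnable):

n_splits = 4
param_combinations = [{'k': 2}, {'k': 3}, {'k': 4}]

# refit=True: parameters in the outer loop -- the mean CV score for each
# combination is available as early as possible, but the model is (re)fit
# n_splits times for every combination
n_fits = 0
for params in param_combinations:
    for split in range(n_splits):
        n_fits += 1                      # model.fit(...) would go here
    # aggregate the n_splits scores for `params` here
print "refit=True :", n_fits, "fits"     # 12 fits

# refit=False: splits in the outer loop -- the model is fit only once per
# split and then evaluated on all parameter combinations (cheap for lazy,
# non-parametric models such as k-NN)
n_fits = 0
for split in range(n_splits):
    n_fits += 1                          # model.fit(...) once per split
    for params in param_combinations:
        pass                             # model.evaluate(...) for `params`
print "refit=False:", n_fits, "fits"     # 4 fits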

Below is a small demo of the output for refit=True:


In [ ]:
param_grid = ({'weights': ['uniform', 'distance'], 'k': [2, 3]}, {'p': [1., np.inf], 'k': [2]})
grid_cv1 = GridSearchCV(model=KNNClassifier(algorithm='kd_tree', leaf_size=1), param_grid=param_grid, 
                        train_test_splitter_params=dict(shuffle=True, random_seed=1337), n_splits=4, 
                        refit=True, save_models=False, verbose=True)
grid_cv1.fit(X_scaled[:1000], y[:1000]); # rebuilding tree on each iteration

# Training KNNClassifier on 1000 samples x 444 features.
# 4-fold CV for each of 6 params combinations == 24 fits ...

# iter:  1/24 +--- elapsed:   1.026 sec   ...
# iter:  2/24 ++-- elapsed:   2.022 sec   ...
# iter:  3/24 +++- elapsed:   3.010 sec   ...
# iter:  4/24 ++++ elapsed:   4.012 sec - mean acc.: 0.7940 +/- 2 * 0.038
# iter:  5/24 +--- elapsed:   5.017 sec - best acc.: 0.7940 at {'k': 2, 'weights': 'uniform'}
# iter:  6/24 ++-- elapsed:   6.017 sec - best acc.: 0.7940 at {'k': 2, 'weights': 'uniform'}
# iter:  7/24 +++- elapsed:   7.042 sec - best acc.: 0.7940 at {'k': 2, 'weights': 'uniform'}
# iter:  8/24 ++++ elapsed:   8.054 sec - mean acc.: 0.8070 +/- 2 * 0.029
# iter:  9/24 +--- elapsed:   9.093 sec - best acc.: 0.8070 at {'k': 2, 'weights': 'distance'}
# iter: 10/24 ++-- elapsed:  10.105 sec - best acc.: 0.8070 at {'k': 2, 'weights': 'distance'}
# iter: 11/24 +++- elapsed:  11.138 sec - best acc.: 0.8070 at {'k': 2, 'weights': 'distance'}
# iter: 12/24 ++++ elapsed:  12.157 sec - mean acc.: 0.8209 +/- 2 * 0.024
# iter: 13/24 +--- elapsed:  13.198 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 14/24 ++-- elapsed:  14.308 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 15/24 +++- elapsed:  15.596 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 16/24 ++++ elapsed:  16.607 sec - mean acc.: 0.7811 +/- 2 * 0.029
# iter: 17/24 +--- elapsed:  17.706 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 18/24 ++-- elapsed:  18.770 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 19/24 +++- elapsed:  19.840 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 20/24 ++++ elapsed:  20.889 sec - mean acc.: 0.8140 +/- 2 * 0.031
# iter: 21/24 +--- elapsed:  21.866 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 22/24 ++-- elapsed:  22.843 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 23/24 +++- elapsed:  23.811 sec - best acc.: 0.8209 at {'k': 3, 'weights': 'uniform'}
# iter: 24/24 ++++ elapsed:  24.766 sec - mean acc.: 0.4880 +/- 2 * 0.018

and for refit=False (the difference is not big here because there are many features and only a few parameter combinations):


In [ ]:
grid_cv2 = GridSearchCV(model=KNNClassifier(algorithm='kd_tree', leaf_size=1), param_grid=param_grid, 
                        train_test_splitter_params=dict(shuffle=True, random_seed=1337), n_splits=4, 
                        refit=False, save_models=False, verbose=True)
grid_cv2.fit(X_scaled[:1000], y[:1000]); # building tree only on each 6-th iteration

# Training KNNClassifier on 1000 samples x 444 features.
# 4-fold CV for each of 6 params combinations == 24 fits ...

# iter:  1/24 +--- elapsed:   1.019 sec - best acc.: 0.8110  [1/4 splits] at {'k': 2, 'weights': 'uniform'}
# iter:  2/24 +--- elapsed:   1.834 sec - best acc.: 0.8228  [1/4 splits] at {'k': 2, 'weights': 'distance'}
# iter:  3/24 +--- elapsed:   2.645 sec - best acc.: 0.8386  [1/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter:  4/24 +--- elapsed:   3.448 sec - best acc.: 0.8386  [1/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter:  5/24 +--- elapsed:   4.277 sec - best acc.: 0.8386  [1/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter:  6/24 +--- elapsed:   5.058 sec - best acc.: 0.8386  [1/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter:  7/24 ++-- elapsed:   6.073 sec - best acc.: 0.8055  [2/4 splits] at {'k': 2, 'weights': 'uniform'}
# iter:  8/24 ++-- elapsed:   6.878 sec - best acc.: 0.8174  [2/4 splits] at {'k': 2, 'weights': 'distance'}
# iter:  9/24 ++-- elapsed:   7.672 sec - best acc.: 0.8353  [2/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 10/24 ++-- elapsed:   8.475 sec - best acc.: 0.8353  [2/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 11/24 ++-- elapsed:   9.336 sec - best acc.: 0.8353  [2/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 12/24 ++-- elapsed:  10.125 sec - best acc.: 0.8353  [2/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 13/24 +++- elapsed:  11.311 sec - best acc.: 0.7806  [3/4 splits] at {'k': 2, 'weights': 'uniform'}
# iter: 14/24 +++- elapsed:  12.127 sec - best acc.: 0.7980  [3/4 splits] at {'k': 2, 'weights': 'distance'}
# iter: 15/24 +++- elapsed:  12.918 sec - best acc.: 0.8166  [3/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 16/24 +++- elapsed:  13.722 sec - best acc.: 0.8166  [3/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 17/24 +++- elapsed:  14.576 sec - best acc.: 0.8166  [3/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 18/24 +++- elapsed:  15.538 sec - best acc.: 0.8166  [3/4 splits] at {'k': 3, 'weights': 'uniform'}
# iter: 19/24 ++++ elapsed:  16.519 sec - best acc.: 0.7940 +/- 2 * 0.038 at {'k': 2, 'weights': 'uniform'}
# iter: 20/24 ++++ elapsed:  17.322 sec - best acc.: 0.8070 +/- 2 * 0.029 at {'k': 2, 'weights': 'distance'}
# iter: 21/24 ++++ elapsed:  18.106 sec - best acc.: 0.8209 +/- 2 * 0.024 at {'k': 3, 'weights': 'uniform'}
# iter: 22/24 ++++ elapsed:  19.095 sec - best acc.: 0.8209 +/- 2 * 0.024 at {'k': 3, 'weights': 'uniform'}
# iter: 23/24 ++++ elapsed:  19.933 sec - best acc.: 0.8209 +/- 2 * 0.024 at {'k': 3, 'weights': 'uniform'}
# iter: 24/24 ++++ elapsed:  20.688 sec - best acc.: 0.8209 +/- 2 * 0.024 at {'k': 3, 'weights': 'uniform'}

The best model, as well as the other "best" attributes, is available:


In [ ]:
grid_cv2.best_model_

Finally, all results can be converted to a pandas.DataFrame and stored to Excel or any other format. For more details, see the docstrings in the code.


In [ ]:
df = grid_cv2.to_df()
df.to_excel('test.xlsx')
df

5-Fold CV on 5k images

load data


In [ ]:
X_knn_1, y_knn_1 = load_small()  # 5000 -> 4994 training samples (see load_small above)
param_grid = {'weights': ['uniform', 'distance'], 
              'k': range(2, 31),
              'p': [1., 2., 3., np.inf]}
param_order = ['k', 'weights', 'p']
grid_cv_knn_1 = GridSearchCV(model=KNNClassifier(algorithm='kd_tree', leaf_size=10), 
                             param_grid=param_grid,
                             param_order=param_order,
                             train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                             n_splits=5, 
                             refit=False, 
                             save_models=True,
                             dirpath='tmp/',
                             save_params=dict(
                                 params_mask=dict(kd_tree_=False), # do not save tree
                                 json_params=dict(indent=4)),
                             verbose=True)

In [ ]:
[params for params in grid_cv_knn_1.gen_params()][:10]

In [ ]:
grid_cv_knn_1.number_of_combinations()

In [ ]:
grid_cv_knn_1.fit(X_knn_1, y_knn_1);

# Training KNNClassifier on 4994 samples x 444 features.
# 5-fold CV for each of 232 params combinations == 1160 fits ...

# iter:    1/1160 +---- elapsed:  34.320 sec - best acc.: 0.9084  [1/5 splits] at {'p': 1.0, 'k': 2, 'weights': 'uniform'}
# iter:    2/1160 +---- elapsed:  49.252 sec - best acc.: 0.9203  [1/5 splits] at {'p': 1.0, 'k': 2, 'weights': 'distance'}
# iter:    3/1160 +---- elapsed:  63.681 sec - best acc.: 0.9203  [1/5 splits] at {'p': 1.0, 'k': 2, 'weights': 'distance'}
# ...
# iter:  925/1160 ++++- elapsed: 20728.7 sec - best acc.: 0.9217  [4/5 splits] at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter:  926/1160 ++++- elapsed: 20780.0 sec - best acc.: 0.9217  [4/5 splits] at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter:  927/1160 ++++- elapsed: 20794.5 sec - best acc.: 0.9217  [4/5 splits] at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter:  928/1160 ++++- elapsed: 20809.0 sec - best acc.: 0.9217  [4/5 splits] at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter:  929/1160 +++++ elapsed: 20843.3 sec - best acc.: 0.9091 +/- 2 * 0.007 at {'p': 1.0, 'k': 2, 'weights': 'uniform'}
# iter:  930/1160 +++++ elapsed: 20858.1 sec - best acc.: 0.9195 +/- 2 * 0.003 at {'p': 1.0, 'k': 2, 'weights': 'distance'}
# iter:  931/1160 +++++ elapsed: 20872.5 sec - best acc.: 0.9195 +/- 2 * 0.003 at {'p': 1.0, 'k': 2, 'weights': 'distance'}
# ...
# iter: 1158/1160 +++++ elapsed: 25924.2 sec - best acc.: 0.9209 +/- 2 * 0.004 at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter: 1159/1160 +++++ elapsed: 25939.9 sec - best acc.: 0.9209 +/- 2 * 0.004 at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# iter: 1160/1160 +++++ elapsed: 25955.6 sec - best acc.: 0.9209 +/- 2 * 0.004 at {'p': 1.0, 'k': 3, 'weights': 'uniform'}

In [ ]:
df = grid_cv_knn_1.to_df()
df.to_excel('knn_1_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(10).to_excel('knn_1_best.xlsx')

Approach #2: remove (almost) constant features + standardize + kernelized k-NN

3-Fold CV on 2.5k images

Unfortunately, k-d trees in scipy are only supported for l_p metrics and not for custom distance functions, so kernelized k-NN has to run its predictions in brute-force mode.
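For context, one standard way to define the distance for a kernelized k-NN is the distance in the feature space induced by the kernel; here is a small self-contained sketch of that idea (the actual KNNClassifier implementation may differ in details such as the default gamma):

import numpy as np

def rbf_kernel(x, y, gamma=1e-5):
    # k(x, y) = exp(-gamma * ||x - y||^2)
    return np.exp(-gamma * np.sum((x - y) ** 2))

def kernel_sq_distance(x, y, kernel=rbf_kernel):
    # squared distance in the kernel-induced feature space:
    # ||phi(x) - phi(y)||^2 = k(x, x) - 2 * k(x, y) + k(y, y)
    return kernel(x, x) - 2. * kernel(x, y) + kernel(y, y)

x = np.random.rand(444)
y = np.random.rand(444)
print kernel_sq_distance(x, y)

Since such a distance is only defined through pairwise kernel evaluations, there is no k-d tree structure to exploit, and every query point has to be compared against all training points.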


In [ ]:
param_grid_0 = [{'weights': ['uniform'], 'k': range(2, 12 + 1)},
                {'weights': ['distance'], 'k': (2, 3, 4)}]
param_grid = []
for d in param_grid_0:
    d1 = d.copy()
    d1.update({'kernel': ['rbf'],
               'kernel_params': [dict(gamma=gamma) for gamma in np.logspace(-7, 2, 10)]})
    param_grid.append(d1)
    d2 = d.copy()
    d2.update({'kernel': ['sigmoid'],
               'kernel_params': [dict(gamma=gamma) for gamma in (1e-4, 1e-2, 1.)]})
    param_grid.append(d2)
    d3 = d.copy()
    d3.update({'kernel': ['poly'],
               'kernel_params': [dict(degree=degree) for degree in (2, 3, 4)]})
    param_grid.append(d3)
param_order = [['kernel_params', 'k']] * len(param_grid)

grid_cv_knn_2 = GridSearchCV(model=KNNClassifier(algorithm='brute'), 
                             param_grid=param_grid,
                             param_order=param_order,
                             train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                             n_splits=3, 
                             refit=True, 
                             save_models=True,
                             dirpath='tmp/',
                             save_params=dict(json_params=dict(indent=4)),
                             verbose=True)

In [ ]:
[params for params in grid_cv_knn_2.gen_params()][:3]

In [ ]:
grid_cv_knn_2.number_of_combinations()

In [ ]:
X_knn_2, y_knn_2 = load_small(2500)
grid_cv_knn_2.fit(X_knn_2, y_knn_2)

# Training KNNClassifier on 2494 samples x 444 features.
# 3-fold CV for each of 224 params combinations == 672 fits ...

# iter:   1/672 +-- elapsed:  99.099 sec   ...
# iter:   2/672 ++- elapsed: 197.839 sec   ...
# iter:   3/672 +++ elapsed: 294.787 sec - mean acc.: 0.8693 +/- 2 * 0.009
# iter:   4/672 +-- elapsed: 390.949 sec - best acc.: 0.8693 at {'kernel_params': {'gamma': 9.9999999999999995e-08}, 'k': 2, 'weights': 'uniform', 'kernel': 'rbf'}
# iter:   5/672 ++- elapsed: 487.090 sec - best acc.: 0.8693 at {'kernel_params': {'gamma': 9.9999999999999995e-08}, 'k': 2, 'weights': 'uniform', 'kernel': 'rbf'}
# ...
# iter: 668/672 ++- elapsed: 56102.7 sec - best acc.: 0.8889 at {'kernel_params': {'gamma': 9.9999999999999995e-08}, 'k': 2, 'weights': 'distance', 'kernel': 'rbf'}
# iter: 669/672 +++ elapsed: 56140.9 sec - mean acc.: 0.3946 +/- 2 * 0.015
# iter: 670/672 +-- elapsed: 56179.3 sec - best acc.: 0.8889 at {'kernel_params': {'gamma': 9.9999999999999995e-08}, 'k': 2, 'weights': 'distance', 'kernel': 'rbf'}
# iter: 671/672 ++- elapsed: 56217.2 sec - best acc.: 0.8889 at {'kernel_params': {'gamma': 9.9999999999999995e-08}, 'k': 2, 'weights': 'distance', 'kernel': 'rbf'}
# iter: 672/672 +++ elapsed: 56253.5 sec - mean acc.: 0.3797 +/- 2 * 0.020

In [ ]:
df = grid_cv_knn_2.to_df()
df.to_excel('knn_2_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(25).to_excel('knn_2_best.xlsx')

Approach #3, #4: Same as above but with PCA (unwhitened/whitened)

interesting observation

$$ \mathbf{x}_{PCA}=W^T(\mathbf{x}-\pmb{\mu})=\left(\sqrt{n}W^TS^{-1}\right)\frac{1}{\sqrt{n}}S(\mathbf{x}-\pmb{\mu})= \left[\frac{1}{\sqrt{n}}S\mathbf{x}\right]_{PCA\;whitened}, $$

where $S$ is matrix with singular values of $X$, and even more interesting: $$ \mathbf{x}_{PCA}=W^T(\mathbf{x}-\pmb{\mu})= \frac{1}{\sqrt{n}}S \left(\sqrt{n}S^{-1}W^T\right)(\mathbf{x}-\pmb{\mu})= \frac{1}{\sqrt{n}}S\cdot\mathbf{x}_{PCA\;whitened}, $$

therefore, computing distances between vectors after applying PCA without whitening is the same as applying PCA whitening and then computing distances between vectors re-weighted by the respective singular values! (I wanted to try this as a separate approach, but it is identical to approach #3.)
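A quick self-contained numerical check of this identity on random data (using numpy's SVD directly rather than the PCA class from ml_mnist; the sqrt(n) factor follows the convention in the formulas above):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(200, 10)
A -= A.mean(axis=0)                    # center the data (the "x - mu" part)
n = A.shape[0]

U, s, Vt = np.linalg.svd(A, full_matrices=False)
W = Vt.T                               # principal axes as columns

X_pca = A.dot(W)                       # x_PCA = W^T (x - mu), row-wise
X_whiten = np.sqrt(n) * A.dot(W) / s   # x_PCA_whitened = sqrt(n) S^-1 W^T (x - mu)

# x_PCA == (1 / sqrt(n)) * S * x_PCA_whitened, so l_2 distances between
# unwhitened projections equal distances between whitened projections
# re-weighted by the singular values
print np.allclose(X_pca, X_whiten * s / np.sqrt(n))   # True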

compute & apply PCA on the full training set


In [ ]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
with Stopwatch(verbose=True) as s:
    pca = PCA().fit(X)
pca.save('models/pca_full.json') # ~13 Mb

load PCA model


In [ ]:
pca_full = load_model('models/pca_full.json'); pca_full

In [ ]:
sum(pca_full.explained_variance_ratio_[:154]) # <- to explain 95% of the variance we need 154 components

load small stratified subset of data


In [ ]:
def load_small2(n_samples):
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255. # only divide by 255
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    indices, _ = tts.split(y, train_ratio=n_samples/60000., stratify=True)
    return X_scaled[indices], y[indices]
X_sm, y_sm = load_small2(1000) # approx. 1000; for the 5k-image CV below this cell was re-run with load_small2(5000)

5k images 3-Fold CV for non-kernelized k-NN + number of PCA components


In [ ]:
param_grid = ({'weights': ['distance'],
               'k': [2, 3, 4],
               'p': [1, 2]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6, 9, 12, 15],
               'p': [1, 2]
              })
grid_search_params = dict(model=KNNClassifier(algorithm='kd_tree'), 
                          param_grid=param_grid,
                          # param_order=param_order,
                          train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                          n_splits=3, 
                          refit=False, 
                          # save_models=True,
                          # dirpath='tmp/',
                          # save_params=dict(json_params=dict(indent=4)),
                          verbose=True)

for n_components in xrange(5, 151, 5):
    print "[PCA n_components = {0}]\n\n".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=False).transform(X_sm)
    grid_cv_knn_pca_1 = GridSearchCV(**grid_search_params).fit(X_current, y_sm)
    df = grid_cv_knn_pca_1\
         .to_df()\
         .sort_values(by='mean_score', ascending=False)\
         .to_excel('cv_results/knn_3_pca_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv_knn_pca_1.best_score_))
    print "\n\n"
    
# [PCA n_components = 5]


# Training KNNClassifier on 4994 samples x 5 features.
# 3-fold CV for each of 20 params combinations == 60 fits ...

# iter:  1/60 +-- elapsed:   0.673 sec - best acc.: 0.6936  [1/3 splits] at {'p': 1, 'k': 2, 'weights': 'distance'}
# iter:  2/60 +-- elapsed:   1.340 sec - best acc.: 0.6990  [1/3 splits] at {'p': 2, 'k': 2, 'weights': 'distance'}
# iter:  3/60 +-- elapsed:   1.998 sec - best acc.: 0.6990  [1/3 splits] at {'p': 2, 'k': 2, 'weights': 'distance'}
# ...
# iter: 58/60 +++ elapsed:  41.769 sec - best acc.: 0.7369 +/- 2 * 0.003 at {'p': 2, 'k': 12, 'weights': 'uniform'}
# iter: 59/60 +++ elapsed:  42.429 sec - best acc.: 0.7369 +/- 2 * 0.003 at {'p': 1, 'k': 15, 'weights': 'uniform'}
# iter: 60/60 +++ elapsed:  43.073 sec - best acc.: 0.7369 +/- 2 * 0.003 at {'p': 1, 'k': 15, 'weights': 'uniform'}
# ...
# ...
# ...
# iter: 58/60 +++ elapsed: 133.416 sec - best acc.: 0.9381 +/- 2 * 0.004 at {'p': 2, 'k': 2, 'weights': 'distance'}
# iter: 59/60 +++ elapsed: 136.472 sec - best acc.: 0.9381 +/- 2 * 0.004 at {'p': 2, 'k': 2, 'weights': 'distance'}
# iter: 60/60 +++ elapsed: 138.300 sec - best acc.: 0.9381 +/- 2 * 0.004 at {'p': 2, 'k': 2, 'weights': 'distance'}



# [PCA n_components = 115]


# Training KNNClassifier on 4994 samples x 115 features.
# 3-fold CV for each of 20 params combinations == 60 fits ...

# iter:  1/60 +-- elapsed:   3.008 sec - best acc.: 0.9263  [1/3 splits] at {'p': 1, 'k': 2, 'weights': 'distance'}
# iter:  2/60 +-- elapsed:   4.943 sec - best acc.: 0.9394  [1/3 splits] at {'p': 2, 'k': 2, 'weights': 'distance'}

... same with whitening


In [ ]:
param_grid = ({'weights': ['distance'],
               'k': [2, 3, 4],
               'p': [1, 2]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6, 9, 12, 15],
               'p': [1, 2]
              })
grid_search_params = dict(model=KNNClassifier(algorithm='kd_tree'), 
                          param_grid=param_grid,
                          # param_order=param_order,
                          train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                          n_splits=3, 
                          refit=False, 
                          # save_models=True,
                          # dirpath='tmp/',
                          # save_params=dict(json_params=dict(indent=4)),
                          verbose=False)

for n_components in xrange(10, 151, 5):
    print "[PCA n_components = {0}]".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=True).transform(X_sm)
    grid_cv_knn_pca_1 = GridSearchCV(**grid_search_params).fit(X_current, y_sm)
    df = grid_cv_knn_pca_1\
         .to_df()\
         .sort_values(by='mean_score', ascending=False)\
         .to_excel('cv_results/knn_3_pca_whiten_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv_knn_pca_1.best_score_))
# [PCA n_components = 10]
# [PCA n_components = 15]
# [PCA n_components = 20]
# [PCA n_components = 25]
# [PCA n_components = 30]
# [PCA n_components = 35]
# [PCA n_components = 40]
# [PCA n_components = 45]
# [PCA n_components = 50]
# [PCA n_components = 55]
# [PCA n_components = 60]
# [PCA n_components = 65]
# [PCA n_components = 70]
# [PCA n_components = 75]

1k images 3-Fold CV for kernelized k-NN + number of PCA components


In [ ]:
param_grid = ({'weights': ['distance'],
               'k': [2, 3, 4],
               'kernel': ['rbf'],
               'kernel_params': [dict(gamma=x) for x in [1e-1, 1e-2, 1e-4, 1e-6]]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6, 9, 12],
               'kernel': ['rbf'],
               'kernel_params': [dict(gamma=x) for x in [1e-1, 1e-2, 1e-4, 1e-6]]
              },
              {'weights': ['distance'],
               'k': [2, 3, 4],
               'kernel': ['poly'],
               'kernel_params': [dict(degree=x) for x in [2, 3, 4]]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6],
               'kernel': ['poly'],
               'kernel_params': [dict(degree=x) for x in [2, 3, 4]]
              })
grid_search_params = dict(model=KNNClassifier(algorithm='brute'), 
                          param_grid=param_grid,
                          # param_order=param_order,
                          train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                          n_splits=3, 
                          refit=True, 
                          # save_models=True,
                          # dirpath='tmp/',
                          # save_params=dict(json_params=dict(indent=4)),
                          verbose=True)

for n_components in xrange(5, 151, 5):
    print "[PCA n_components = {0}]\n\n".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=False).transform(X_sm)
    grid_cv_knn_pca_2 = GridSearchCV(**grid_search_params).fit(X_current, y_sm)
    df = grid_cv_knn_pca_2\
         .to_df()\
         .sort_values(by='mean_score', ascending=False)\
         .to_excel('cv_results/knn_4_pca_krnl_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv_knn_pca_2.best_score_))
    print "\n"
# [PCA n_components = 5]


# Training KNNClassifier on 996 samples x 5 features.
# 3-fold CV for each of 57 params combinations == 171 fits ...

# iter:   1/171 +-- elapsed:  18.874 sec   ...
# iter:   2/171 ++- elapsed:  39.243 sec   ...
# iter:   3/171 +++ elapsed:  58.217 sec - mean acc.: 0.6879 +/- 2 * 0.029
# ...
# iter: 169/171 +-- elapsed: 2299.67 sec - best acc.: 0.7149 at {'kernel': 'rbf', 'k': 6, 'weights': 'uniform', 'kernel_params': {'gamma': 0.1}}
# iter: 170/171 ++- elapsed: 2306.23 sec - best acc.: 0.7149 at {'kernel': 'rbf', 'k': 6, 'weights': 'uniform', 'kernel_params': {'gamma': 0.1}}
# iter: 171/171 +++ elapsed: 2313.28 sec - mean acc.: 0.5814 +/- 2 * 0.011
# ...
# ...
# ...
# iter: 169/171 +-- elapsed: 1869.40 sec - best acc.: 0.8704 at {'kernel': 'rbf', 'k': 2, 'weights': 'distance', 'kernel_params': {'gamma': 0.1}}
# iter: 170/171 ++- elapsed: 1876.34 sec - best acc.: 0.8704 at {'kernel': 'rbf', 'k': 2, 'weights': 'distance', 'kernel_params': {'gamma': 0.1}}
# iter: 171/171 +++ elapsed: 1882.34 sec - mean acc.: 0.3715 +/- 2 * 0.043


# [PCA n_components = 95]


# Training KNNClassifier on 996 samples x 95 features.
# 3-fold CV for each of 57 params combinations == 171 fits ...

# iter:   1/171 +-- elapsed:  15.785 sec   ...
# iter:   2/171 ++- elapsed:  31.366 sec   ...
# iter:   3/171 +++ elapsed:  46.182 sec - mean acc.: 0.8674 +/- 2 * 0.024
# iter:   4/171 +-- elapsed:  60.642 sec - best acc.: 0.8674 at {'kernel': 'rbf', 'k': 2, 'weights': 'distance', 'kernel_params': {'gamma': 0.1}}

... same with whitening


In [ ]:
param_grid = ({'weights': ['distance'],
               'k': [2, 3, 4],
               'kernel': ['rbf'],
               'kernel_params': [dict(gamma=x) for x in [1e-1, 1e-2, 1e-4, 1e-6]]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6, 9, 12],
               'kernel': ['rbf'],
               'kernel_params': [dict(gamma=x) for x in [1e-1, 1e-2, 1e-4, 1e-6]]
              },
              {'weights': ['distance'],
               'k': [2, 3, 4],
               'kernel': ['poly'],
               'kernel_params': [dict(degree=x) for x in [2, 3, 4]]
              },
              {'weights': ['uniform'],
               'k': [2, 3, 4, 6],
               'kernel': ['poly'],
               'kernel_params': [dict(degree=x) for x in [2, 3, 4]]
              })
grid_search_params = dict(model=KNNClassifier(algorithm='brute'), 
                          param_grid=param_grid,
                          # param_order=param_order,
                          train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                          n_splits=3, 
                          refit=True, 
                          # save_models=True,
                          # dirpath='tmp/',
                          # save_params=dict(json_params=dict(indent=4)),
                          verbose=True)

for n_components in xrange(5, 151, 5):
    print "[PCA n_components = {0}]\n\n".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=True).transform(X_sm)
    grid_cv_knn_pca_2 = GridSearchCV(**grid_search_params).fit(X_current, y_sm)
    df = grid_cv_knn_pca_2\
         .to_df()\
         .sort_values(by='mean_score', ascending=False)\
         .to_excel('cv_results/knn_4_pca_krnl_whiten_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv_knn_pca_2.best_score_))
    print "\n"
# [PCA n_components = 5]


# Training KNNClassifier on 996 samples x 5 features.
# 3-fold CV for each of 57 params combinations == 171 fits ...

# iter:   1/171 +-- elapsed:  16.284 sec   ...
# iter:   2/171 ++- elapsed:  32.904 sec   ...
# iter:   3/171 +++ elapsed:  54.273 sec - mean acc.: 0.6939 +/- 2 * 0.018
# ...
# iter: 169/171 +-- elapsed: 2319.17 sec - best acc.: 0.7199 at {'kernel': 'rbf', 'k': 9, 'weights': 'uniform', 'kernel_params': {'gamma': 0.1}}
# iter: 170/171 ++- elapsed: 2325.68 sec - best acc.: 0.7199 at {'kernel': 'rbf', 'k': 9, 'weights': 'uniform', 'kernel_params': {'gamma': 0.1}}
# iter: 171/171 +++ elapsed: 2331.78 sec - mean acc.: 0.5984 +/- 2 * 0.013
# ...
# ...
# ...
# iter: 169/171 +-- elapsed: 2504.95 sec - best acc.: 0.7972 at {'kernel': 'rbf', 'k': 2, 'weights': 'distance', 'kernel_params': {'gamma': 0.1}}
# iter: 170/171 ++- elapsed: 2511.18 sec - best acc.: 0.7972 at {'kernel': 'rbf', 'k': 2, 'weights': 'distance', 'kernel_params': {'gamma': 0.1}}
# iter: 171/171 +++ elapsed: 2517.55 sec - mean acc.: 0.1124 +/- 2 * 0.001


# [PCA n_components = 85]


# Training KNNClassifier on 996 samples x 85 features.
# 3-fold CV for each of 57 params combinations == 171 fits ...

# iter:   1/171 +-- elapsed:  15.737 sec   ...
# iter:   2/171 ++- elapsed:  32.295 sec   ...
# iter:   3/171 +++ elapsed:  48.847 sec - mean acc.: 0.7892 +/- 2 * 0.035

Approach #5: artificially augment the dataset


In [18]:
X, y = load_mnist(mode='train', path='data/')
aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)\
      .add('RandomRotate', angle=(-10., 15.))\
      .add('Dropout', p=(0., 0.1))\
      .add('RandomGaussian', sigma=(0., 0.5))\
      .add('RandomShift', x_shift=(-2, 2), y_shift=(-2, 2))
for z in aug.transform(X[:2]/255., 3):
    plot_greyscale_image(z)



In [ ]:
pca_full = load_model('models/pca_full.json')

def load_big2():
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255. # only divide by 255
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    train, test = tts.split(y, train_ratio=50005./60000., stratify=True)
    return X_scaled[train], y[train], X_scaled[test], y[test] # 49999 train, 10001 val
X_train, y_train, X_test, y_test = load_big2()

X_train = X_train[:5000]
y_train = y_train[:5000]
X_test = X_test[:1000]
y_test = y_test[:1000]

N = 3
aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)
aug.add('RandomRotate', angle=(-7., 10.))
aug.add('RandomGaussian', sigma=(0., 0.5))
aug.add('RandomShift', x_shift=(-1, 1), y_shift=(-1, 1))
aug.add('Dropout', p=(0.8, 1.0))
X_train_aug = aug.transform(X_train, N)
y_train_aug = np.repeat(y_train, N + 1)
print X_train_aug.shape

pca_full.set_params(n_components=35, whiten=False)
X_train_aug = pca_full.transform(X_train_aug)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=2, p=2, weights='distance')
with Stopwatch(verbose=True) as s: knn.fit(X_train_aug, y_train_aug)
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test)
print accuracy_score(y_test, y_pred)

k-NN best models from all approaches

The full training set (60000) is split into ~50k train : ~10k validation

Approaches 1, 2


In [ ]:
def load_big():
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255.
    X_scaled = VarianceThreshold(0.1).fit_transform(X_scaled)
    X_scaled = StandardScaler(copy=False).fit_transform(X_scaled)
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    train, test = tts.split(y, train_ratio=50005./60000., stratify=True)
    return X_scaled[train], y[train], X_scaled[test], y[test] # 49999 train, 10001 val

In [ ]:
X_train, y_train, X_test, y_test = load_big()

In [ ]:
knns_best = []
# from approach 1
knns_best.append(KNNClassifier(algorithm='brute', k=3, p=1., weights='uniform'))
knns_best.append(KNNClassifier(algorithm='brute', k=2, p=1., weights='distance'))
# from approach 2
knns_best.append(KNNClassifier(algorithm='brute', k=2, weights='distance', kernel='rbf', kernel_params=dict(gamma=1e-5)))
knns_best.append(KNNClassifier(algorithm='brute', k=3, weights='uniform', kernel='rbf', kernel_params=dict(gamma=1e-5)))

In [ ]:
# -------------------------------------------
# def f(x):
#     return knn._predict_x(x)
# from joblib import Parallel, delayed
# p = Parallel(n_jobs=1, max_nbytes=None)
# print p(delayed(f)(x) for x in X_test[:2]) # <-- NOT WORKING, CANNOT PICKLE INSTANCE METHODS
# ----------------------------------------------
import pathos.multiprocessing as mp
pool = mp.ProcessingPool(4)
for knn in knns_best:
    knn.fit(X_train, y_train)
    y_pred = pool.map(knn._predict_x, X_test) # knn.predict(X_test) in parallel
    print accuracy_score(y_test, y_pred)
# 0.96650...
# 0.96400...
# 0.96110...
# 0.96150...

Approach 3 (w/o whitening)


In [36]:
pca_full = load_model('models/pca_full.json')

In [37]:
def load_big2():
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255. # only divide by 255
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    train, test = tts.split(y, train_ratio=50005./60000., stratify=True)
    return X_scaled[train], y[train], X_scaled[test], y[test] # 49999 train, 10001 val
X_train, y_train, X_test, y_test = load_big2()

In [21]:
pca_full.set_params(n_components=35)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=3, p=2, weights='uniform')
with Stopwatch(verbose=True) as s: knn.fit(X_train, y_train) # Elapsed time: 0.064 sec
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test) # Elapsed time: 18.823 sec <- FAST!
print accuracy_score(y_test, y_pred)
# 0.9754...


Elapsed time: 0.077 sec
Elapsed time: 19.782 sec
0.975402459754

In [23]:
C = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(C);



In [27]:
C = confusion_matrix(y_test, y_pred, normalize='cols')
plot_confusion_matrix(C, fmt=".2f");



In [ ]:
pca_full.set_params(n_components=35)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=2, p=2, weights='distance')
with Stopwatch(verbose=True) as s: knn.fit(X_train, y_train) # Elapsed time: 0.067 sec
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test) # Elapsed time: 17.848 sec
print accuracy_score(y_test, y_pred)
# 0.9751...

In [ ]:
pca_full.set_params(n_components=35)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=2, p=1, weights='distance')
with Stopwatch(verbose=True) as s: knn.fit(X_train, y_train) 
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test)
print accuracy_score(y_test, y_pred)
# 0.9747...

In [ ]:
pca_full.set_params(n_components=30)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=3, p=2, weights='uniform')
with Stopwatch(verbose=True) as s: knn.fit(X_train, y_train)
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test)
print accuracy_score(y_test, y_pred)
# 0.9746...

... with whitening


In [ ]:
pca_full.set_params(n_components=35, whiten=True)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='kd_tree', k=3, p=2, weights='uniform')
with Stopwatch(verbose=True) as s: knn.fit(X_train, y_train)
with Stopwatch(verbose=True) as t: y_pred = knn.predict(X_test)
print accuracy_score(y_test, y_pred)
# 0.9723...

Approach 4 (w/ and w/o whitening)


In [ ]:
pca_full.set_params(n_components=35, whiten=False)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='brute', k=3, weights='uniform', kernel='rbf', kernel_params=dict(gamma=1e-4))
knn.fit(X_train, y_train)
y_pred = []
for (i, x) in enumerate(X_test):
    y_pred.append(knn._predict_x(x))
    if (i + 1) % 10 == 0:
        print "computed {0}/{1} ... accuracy {2:.4f}".format(i + 1, len(X_test), accuracy_score(y_test[:len(y_pred)], y_pred))
print accuracy_score(y_test, y_pred)
# ...
# computed 2960/10001 ... accuracy 0.9743
# computed 2970/10001 ... accuracy 0.9744
# ...
# computed 3030/10001 ... accuracy 0.9743
# computed 3040/10001 ... accuracy 0.9743

In [ ]:
pca_full.set_params(n_components=20, whiten=True)
X_train = pca_full.transform(X_train)
X_test = pca_full.transform(X_test)
knn = KNNClassifier(algorithm='brute', k=3, weights='uniform', kernel='rbf', kernel_params=dict(gamma=1e-4))
knn.fit(X_train, y_train)
y_pred = []
for (i, x) in enumerate(X_test):
    y_pred.append(knn._predict_x(x))
    if (i + 1) % 10 == 0:
        print "computed {0}/{1} ... accuracy {2:.4f}".format(i + 1, len(X_test), accuracy_score(y_test[:len(y_pred)], y_pred))
print accuracy_score(y_test, y_pred)
# 0.9655...

Approach 6: exponential decay on normalized explained variance
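The idea, as implemented in the cells below: after whitened PCA, re-weight each component by an exponential of its normalized explained-variance ratio,

$$ \tilde{x}_j = x_j \cdot \exp\left(\alpha z_j\right), \qquad z_j = \frac{\lambda_j}{\sum_i \lambda_i}, $$

where $\lambda_j$ is the explained-variance ratio of the $j$-th of the 35 retained components and $\alpha$ is tuned on the validation set.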


In [17]:
pca_full = load_model('models/pca_full.json')
def load_big2(train_ratio=50005./60000.):
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255. # only divide by 255
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    train, test = tts.split(y, train_ratio=train_ratio, stratify=True)
    return X_scaled[train], y[train], X_scaled[test], y[test]
X_train_orig, y_train_orig, X_test_orig, y_test_orig = load_big2(57000./60000.)

In [ ]:
# train_ratio=50005./60000.
pca_full.set_params(n_components=35, whiten=True)
z = pca_full.explained_variance_ratio_[:35]
z /= sum(z)
# for alpha in (1e-6, 1e-4, 1e-2, 0.1, 1., 10.):
# for alpha in np.logspace(0.0, 5.0, num=11):
# for alpha in (5., 7., 8., 9., 11., 12., 14., 16.):
for alpha in np.arange(11.0, 13.0, 0.2):
    print "alpha =", alpha
    X_train = pca_full.transform(X_train_orig)
    X_test = pca_full.transform(X_test_orig)
    X_train *= np.exp(alpha * z)
    X_test  *= np.exp(alpha * z)
#     knn = KNNClassifier(algorithm='kd_tree', k=2, p=2, weights='distance')
#     knn.fit(X_train, y_train_orig)
#     print knn.evaluate(X_test, y_test_orig)
    knn = KNNClassifier(algorithm='kd_tree', k=3, p=2, weights='uniform')
    knn.fit(X_train, y_train_orig)
    print knn.evaluate(X_test, y_test_orig)
# alpha = 1e-06
# 0.971102889711
# 0.972302769723
# 0.971102889711
# alpha = 0.0001
# 0.971102889711
# 0.972302769723
# alpha = 0.01
# 0.971102889711
# 0.972302769723
# alpha = 0.1
# 0.971202879712
# 0.972302769723
# alpha = 1.0
# 0.97200279972
# 0.972802719728
# alpha = 10.0
# 0.973802619738
# 0.97700229977
# ...
# alpha = 5.0
# 0.973402659734
# 0.974802519748
# alpha = 7.0
# 0.97400259974
# 0.975602439756
# alpha = 8.0
# 0.974102589741
# 0.976202379762
# alpha = 9.0
# 0.973802619738
# 0.976302369763
# alpha = 11.0
# 0.97400259974
# 0.977302269773
# alpha = 12.0
# 0.974202579742
# 0.977502249775
# alpha = 14.0
# 0.973402659734
# 0.976602339766
# alpha = 16.0
# 0.972902709729
# 0.976202379762
# alpha = 11.2
# 0.977402259774
# alpha = 11.4
# 0.977602239776
# alpha = 11.6
# [*] 0.977802219778
# alpha = 11.8
# [*] 0.977802219778
# alpha = 12.0
# 0.977502249775
# alpha = 12.2
# 0.977402259774
# alpha = 12.4

In [ ]:
# train_ratio=57000./60000.
pca_full.set_params(n_components=35, whiten=True)
z = pca_full.explained_variance_ratio_[:35]
z /= sum(z)
alpha = 11.6

aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)
aug.add('RandomRotate', angle=(-7., 10.))
aug.add('RandomGaussian', sigma=(0., 0.5))
aug.add('RandomShift', x_shift=(-1, 1), y_shift=(-1, 1))
aug.add('Dropout', p=(0., 0.2))

for N in xrange(10): # augment by a factor of (1 + N)
    X_train = aug.transform(X_train_orig, N)
    y_train = np.repeat(y_train_orig, N + 1)
    
    X_train = pca_full.transform(X_train)
    X_test = pca_full.transform(X_test_orig)
    X_train *= np.exp(alpha * z)
    X_test  *= np.exp(alpha * z)
    
    knn = KNNClassifier(algorithm='kd_tree', k=3, p=2, weights='uniform')
    knn.fit(X_train, y_train)
    print "N = {0}, acc. = {1:.5f}".format(N, knn.evaluate(X_test, y_test_orig))
# N = 0, acc. = 0.97904
# N = 1, acc. = 0.98137
# N = 2, acc. = 0.98137
# N = 3, acc. = 0.98303
# N = 4, acc. = 0.98337
# N = 5, acc. = 0.98370
# N = 6, acc. = 0.98370
# N = 7, acc. = 0.98237
# [*] N = 8, acc. = 0.98536
# N = 9, acc. = 0.98436

Approach #NN: k-NN on features extracted by the neural network


In [ ]:
nn = load_model('tmp/16nn.json')
X_train, _ = load_mnist('train', 'data/')
X_train /= 255.
nn.forward_pass(X_train)
np.save('data/train_feats.npy', leaky_relu(nn.layers[13]._last_input))

In [ ]:
X = np.load('data/train_feats.npy')
_, y = load_mnist('train', 'data/')
tts = TrainTestSplitter(shuffle=True, random_seed=1337)
train, test = tts.split(y, train_ratio=50005./60000., stratify=True) # 49999 : 10001
param_grid = dict(
    k=[2, 3, 4, 5],
    p=[1., 2., 3.],
    weights=['uniform', 'distance']
)
grid_cv = GridSearchCV(None, param_grid=param_grid)
knn = KNNClassifier(algorithm='kd_tree')
knn.fit(X[train], y[train])
for params in grid_cv.gen_params():
    knn.reset_params().set_params(**params)
    acc = knn.evaluate(X[test], y[test])
    print "{0:.4f} at {1}".format(acc, params)
# (Sorted)
# 0.9906 at {'p': 1.0, 'k': 5, 'weights': 'distance'}
# 0.9912 at {'p': 2.0, 'k': 5, 'weights': 'distance'}
# 0.9919 at {'p': 3.0, 'k': 5, 'weights': 'distance'}
# 0.9926 at {'p': 1.0, 'k': 4, 'weights': 'distance'}
# 0.9929 at {'p': 2.0, 'k': 4, 'weights': 'distance'}
# 0.9934 at {'p': 3.0, 'k': 4, 'weights': 'distance'}
# 0.9943 at {'p': 2.0, 'k': 3, 'weights': 'distance'}
# 0.9945 at {'p': 1.0, 'k': 3, 'weights': 'distance'}
# 0.9950 at {'p': 3.0, 'k': 3, 'weights': 'distance'}
# 0.9957 at {'p': 2.0, 'k': 2, 'weights': 'uniform'}
# 0.9958 at {'p': 3.0, 'k': 2, 'weights': 'uniform'}
# 0.9959 at {'p': 1.0, 'k': 2, 'weights': 'uniform'}
# 0.9960 at {'p': 2.0, 'k': 2, 'weights': 'distance'}
# 0.9962 at {'p': 3.0, 'k': 2, 'weights': 'distance'}
# 0.9963 at {'p': 1.0, 'k': 2, 'weights': 'distance'}
# 0.9964 at {'p': 2.0, 'k': 4, 'weights': 'uniform'}
# 0.9965 at {'p': 3.0, 'k': 4, 'weights': 'uniform'}
# 0.9967 at {'p': 1.0, 'k': 4, 'weights': 'uniform'}
# 0.9968 at {'p': 3.0, 'k': 3, 'weights': 'uniform'}
# 0.9969 at {'p': 1.0, 'k': 3, 'weights': 'uniform'}
# 0.9969 at {'p': 1.0, 'k': 5, 'weights': 'uniform'}
# 0.9970 at {'p': 2.0, 'k': 5, 'weights': 'uniform'}
# 0.9970 at {'p': 3.0, 'k': 5, 'weights': 'uniform'}
# [*] 0.9971 at {'p': 2.0, 'k': 3, 'weights': 'uniform'}

Logistic Regression

Approach #1: no preprocessing


In [ ]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=0.85)
y = one_hot(y)
logreg = LogisticRegression(n_batches=10, 
                            random_seed=1337, 
                            optimizer_params=dict(
                                max_epochs=100, 
                                learning_rate=1e-3)
                            )
logreg.fit(X[train], y[train], X_val=X[test], y_val=y[test])
y_pred = logreg.predict(X[test])
print accuracy_score(y_pred, y[test])
# 0.92755...

In [ ]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=0.85)
y = one_hot(y)
logreg = LogisticRegression(n_batches=10, 
                            random_seed=1337, 
                            optimizer_params=dict(
                                max_epochs=100, 
                                learning_rate=1e-3)
                            )
logreg.fit(X[train], y[train], X_val=X[test], y_val=y[test])
y_pred = logreg.predict(X[test])
print accuracy_score(y_pred, y[test])
# 0.92766...

Validation for L2, learning rate


In [ ]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=0.85)
y = one_hot(y)

for lr in (5 * 1e-5, 1e-4, 2 * 1e-4, 5 * 1e-4, 1e-3, 2 * 1e-3, 5 * 1e-3, 1e-2):
    for L2 in (0., 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1.):
        logreg = LogisticRegression(L2=L2,
                                    n_batches=10, 
                                    random_seed=1337, 
                            optimizer_params=dict(
                                max_epochs=600,
                                early_stopping=50,
#                                 verbose=True,
                                learning_rate=lr,
                                plot=False)
                            )
        logreg.fit(X[train], y[train], X_val=X[test], y_val=y[test])
        acc = logreg.evaluate(X[test], y[test])
        print "{0:.4f}, lr = {1}, L2 = {2}".format(acc, lr, L2)
# 0.9051, lr = 1e-05, L2 = 1e-06
# 0.9051, lr = 1e-05, L2 = 1e-05
# 0.9051, lr = 1e-05, L2 = 0.0001
# 0.9051, lr = 1e-05, L2 = 0.001
# 0.9049, lr = 1e-05, L2 = 0.01
# 0.9046, lr = 1e-05, L2 = 0.1
# 0.9009, lr = 1e-05, L2 = 1.0
# 0.9250, lr = 2e-05, L2 = 1e-06
# 0.9250, lr = 2e-05, L2 = 1e-05
# 0.9250, lr = 2e-05, L2 = 0.0001
# 0.9251, lr = 2e-05, L2 = 0.001
# 0.9248, lr = 2e-05, L2 = 0.01
# 0.9248, lr = 2e-05, L2 = 0.1
# 0.9268, lr = 5e-05, L2 = 0.0
# 0.9268, lr = 5e-05, L2 = 1e-06
# 0.9268, lr = 5e-05, L2 = 1e-05
# 0.9267, lr = 5e-05, L2 = 0.0001
# 0.9268, lr = 5e-05, L2 = 0.001
# 0.9270, lr = 5e-05, L2 = 0.01
# 0.9262, lr = 5e-05, L2 = 0.1
# 0.9216, lr = 5e-05, L2 = 1.0
# 0.9264, lr = 0.0001, L2 = 0.0
# 0.9264, lr = 0.0001, L2 = 1e-06
# 0.9266, lr = 0.0001, L2 = 1e-05
# 0.9266, lr = 0.0001, L2 = 0.0001
# 0.9266, lr = 0.0001, L2 = 0.001
# 0.9268, lr = 0.0001, L2 = 0.01
# 0.9262, lr = 0.0001, L2 = 0.1
# 0.9220, lr = 0.0001, L2 = 1.0
# 0.9267, lr = 0.0002, L2 = 0.0
# 0.9267, lr = 0.0002, L2 = 1e-06
# 0.9266, lr = 0.0002, L2 = 1e-05
# 0.9266, lr = 0.0002, L2 = 0.0001
# 0.9276, lr = 0.0002, L2 = 0.001
# 0.9264, lr = 0.0002, L2 = 0.01
# 0.9262, lr = 0.0002, L2 = 0.1
# 0.9218, lr = 0.0002, L2 = 1.0
# 0.9281, lr = 0.0005, L2 = 0.0
# 0.9281, lr = 0.0005, L2 = 1e-06
# 0.9282, lr = 0.0005, L2 = 1e-05
# 0.9280, lr = 0.0005, L2 = 0.0001
# 0.9278, lr = 0.0005, L2 = 0.001
# 0.9274, lr = 0.0005, L2 = 0.01
# 0.9264, lr = 0.0005, L2 = 0.1
# 0.9212, lr = 0.0005, L2 = 1.0
# 0.9276, lr = 0.001, L2 = 0.0
# 0.9277, lr = 0.001, L2 = 1e-06
# 0.9277, lr = 0.001, L2 = 1e-05
# 0.9277, lr = 0.001, L2 = 0.0001
# 0.9281, lr = 0.001, L2 = 0.001
# 0.9271, lr = 0.001, L2 = 0.01
# 0.9260, lr = 0.001, L2 = 0.1
# 0.9224, lr = 0.001, L2 = 1.0
# 0.9299, lr = 0.002, L2 = 0.0
# 0.9293, lr = 0.002, L2 = 1e-06
# 0.9292, lr = 0.002, L2 = 1e-05
# 0.9297, lr = 0.002, L2 = 0.0001
# 0.9292, lr = 0.002, L2 = 0.001
# 0.9291, lr = 0.002, L2 = 0.01
# 0.9281, lr = 0.002, L2 = 0.1
# 0.9232, lr = 0.002, L2 = 1.0
# 0.9294, lr = 0.005, L2 = 0.0
# [*] 0.9301, lr = 0.005, L2 = 1e-06
# 0.9294, lr = 0.005, L2 = 1e-05
# 0.9293, lr = 0.005, L2 = 0.0001
# 0.9294, lr = 0.005, L2 = 0.001
# 0.9299, lr = 0.005, L2 = 0.01
# 0.9277, lr = 0.005, L2 = 0.1
# 0.9227, lr = 0.005, L2 = 1.0
# 0.9274, lr = 0.01, L2 = 0.0
# 0.9266, lr = 0.01, L2 = 1e-06
# 0.9276, lr = 0.01, L2 = 1e-05
# 0.9286, lr = 0.01, L2 = 0.0001
# 0.9274, lr = 0.01, L2 = 0.001
# 0.9291, lr = 0.01, L2 = 0.01
# 0.9261, lr = 0.01, L2 = 0.1
# 0.9201, lr = 0.01, L2 = 1.0

Approach #2: PCA


In [ ]:
logregs = []
for i, n_components in enumerate(xrange(301, 401, 20)):
    pca_full = load_model('models/pca_full.json')
    pca_full.set_params(n_components=n_components, whiten=False)

    X, y = load_mnist(mode='train', path='data/')
    X /= 255.
    X = pca_full.transform(X)

    train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=0.85)
    y = one_hot(y)

    logreg = LogisticRegression(n_batches=10, 
                                random_seed=1337, 
                                optimizer_params=dict(
                                    max_epochs=500, 
                                    learning_rate=1e-3, 
                                    plot=False)
                                )
    logregs.append(logreg)
#     logregs[i].set_params(optimizer_params=dict(max_epochs=100, learning_rate=1e-3, plot=False))
    logreg.fit(X[train], y[train], X_val=X[test], y_val=y[test])
    y_pred = logreg.predict(X[test])
    print "PCA {0} --- {1:.4f}".format(n_components, accuracy_score(y_pred, y[test]))

#   W/O whitening    | with
# ---------------------------
# PCA 15  --- 0.8441 | 
# PCA 20  --- 0.8783 |
# PCA 25  --- 0.8874 |
# PCA 30  --- 0.8936 | 0.8931
# PCA 35  --- 0.9027 | 0.9029
# PCA 40  --- 0.9056 | 0.9051
# PCA 45  --- 0.9076 | 0.9077
# PCA 50  --- 0.9087 | 0.9083
# PCA 55  --- 0.9132 | 0.9134
# PCA 60  --- 0.9129 | 0.9129
# PCA 65  --- 0.9133 | 0.9129
# PCA 70  --- 0.9176 | 0.9173
# PCA 75  --- 0.9189 | 0.9186
# PCA 80  --- 0.9206 | 0.9200
# PCA 85  --- 0.9207 | 0.9207
# PCA 90  --- 0.9213 | 0.9212
# PCA 95  --- 0.9203 | 0.9198
# PCA 100 --- 0.9184 | 0.9188
# PCA 105 --- 0.9203 | 0.9198
# PCA 110 --- 0.9209 | 0.9202
# PCA 115 --- 0.9210 | 0.9209
# PCA 120 --- 0.9217 | 0.9212
# PCA 125            | 0.9228 [*]
# PCA 130            | 0.9210
# PCA 135            | 0.9220
# PCA 140            | 0.9211
# PCA 145            | 0.9202
# PCA 150            | 0.9208
# PCA 155            | 0.9223
# PCA 165          --- 0.9210
# PCA 170          --- 0.9207
# PCA 175          --- 0.9214
# PCA 180          --- 0.9211
# PCA 185          --- 0.9208
# PCA 190 -         -- 0.9208
# PCA 195          --- 0.9204
# PCA 200          --- 0.9208
# ...
# PCA 220          --- 0.9209
# PCA 230          --- 0.9214
# PCA 240          --- 0.9207
# ...
# PCA 301          --- 0.9207
# PCA 321          --- 0.9204
# ...

Approach #3: augment data (x5) for logreg and save to file

save data


In [4]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.  
X = X.astype(np.float32)
aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)
aug.add('RandomRotate', angle=(-5., 7.))
aug.add('RandomGaussian', sigma=(0., 0.5))
aug.add('RandomShift', x_shift=(-1, 1), y_shift=(-1, 1))
aug.add('Dropout', p=(0., 0.2))
X_aug = aug.transform(X, 4)
y_aug = np.repeat(y, 5)
y_aug = one_hot(y_aug)
np.save('data/X_aug_logreg.npy', X_aug)
np.save('data/y_aug_logreg.npy', y_aug)

load data


In [5]:
X = np.load('data/X_aug_logreg.npy')
y = np.load('data/y_aug_logreg.npy')
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=29./30.)

grid searches


In [ ]:
X = np.load('data/X_aug_logreg.npy')#[:25000]
y = np.load('data/y_aug_logreg.npy')#[:25000]
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=29./30.)

for lr in reversed([1e-2, 1e-3, 1e-4, 1e-5, 1e-6]):
    for L2 in (1e-8, 1e-6, 1e-4, 1e-2, 1.):
        plot = (L2 == 1e-8)
        logreg = LogisticRegression(L2=L2,
                                    n_batches=64,
                                    # n_batches=10,
                                    random_seed=1337, 
                                    optimizer_params=dict(
                                        max_epochs=800,
                                        # max_epochs=20,
                                        early_stopping=50, 
                                        learning_rate=lr, 
                                        plot=plot,
                                        plot_dirpath='learning_curves_logreg_{0}/'.format(lr)
                                    ))
        logreg.fit(X[train], y[train], X_val=X[test], y_val=y[test])
        acc = logreg.evaluate(X[test], y[test])
        print "{0:.4f}, lr = {1}, L2 = {2}".format(acc, lr, L2)

        s = '{0:.4f}'.format(acc).replace('.', '_')
        t = 'models/logreg/logreg_{0}_{1}_{2}.json'.format(s, lr, L2)
        logreg.save(t)
        logreg_loaded = load_model(t)#.fit([[0.]], [[1]])

        print "{0:.4f}".format(logreg.evaluate(X[test], y[test]))
        
# 0.7843, lr = 1e-06, L2 = 1e-08
# 0.7843, lr = 1e-06, L2 = 1e-06
# 0.7843, lr = 1e-06, L2 = 0.0001
# 0.7843, lr = 1e-06, L2 = 0.01
# 0.7855, lr = 1e-06, L2 = 1.0
# 0.8754, lr = 1e-05, L2 = 1e-08
# 0.8754, lr = 1e-05, L2 = 1e-06
# ...
# 0.8805, lr = 1e-4, L2 whatever
# ...
# 0.86.., lr = 1e-3, L2 whatever

Approach #4: Exponential decay on singular values


In [15]:
pca_full = load_model('models/pca_full.json')
def load_big2():
    X, y = load_mnist(mode='train', path='data/')
    X_scaled = X / 255. # only divide by 255
    tts = TrainTestSplitter(shuffle=True, random_seed=1337)
    train, test = tts.split(y, train_ratio=50005./60000., stratify=True)
    return X_scaled[train], y[train], X_scaled[test], y[test] # 49999 train, 10001 val
X_train_orig, y_train, X_test_orig, y_test = load_big2()
y_train = one_hot(y_train)
y_test = one_hot(y_test)

In [ ]:
pca_full.set_params(n_components=35, whiten=True)
z = pca_full.explained_variance_ratio_[:35]
z /= sum(z)
for alpha in (1e-6, 1e-4, 1e-2, 0.1, 1., 2., 5., 10., 16., 25., 100.):
    print "alpha =", alpha
    X_train = pca_full.transform(X_train_orig)
    X_test = pca_full.transform(X_test_orig)
    X_train *= np.exp(alpha * z)
    X_test *= np.exp(alpha * z)
    logreg = LogisticRegression(L2=1e-6,
                                n_batches=10, 
                                random_seed=1337, 
                                optimizer_params=dict(
                                    max_epochs=600,
                                    early_stopping=50,
#                                 verbose=True,
                                    learning_rate=0.005,
                                    plot=False)
                                )
    logreg.fit(X_train, y_train, X_val=X_test, y_val=y_test)
    print logreg.evaluate(X_test, y_test)
# alpha = 1e-06
# 0.90800919908
# alpha = 0.0001
# 0.90800919908
# alpha = 0.01
# 0.90800919908
# alpha = 0.1
# 0.90800919908
# alpha = 1.0
# 0.90800919908
# alpha = 2.0
# 0.908109189081
# alpha = 5.0
# 0.907709229077
# alpha = 10.0
# 0.907809219078
# alpha = 16.0
# 0.907209279072
# alpha = 25.0
# 0.906409359064
# alpha = 100.0
# 0.505749425057

Approach #NN: logistic regression on features extracted by the neural network


In [ ]:
X_train = np.load('data/train_feats.npy')    
_, y_train = load_mnist('train', 'data/')

tts = TrainTestSplitter(shuffle=True, random_seed=1337)
train, val = tts.split(y_train, train_ratio=50005./60000., stratify=True) # 49999 : 10001

param_grid = dict(
    L2=[0] + np.logspace(-4., 1., 11).tolist(),
)
logreg_params = dict(n_batches=32,
                     random_seed=1337,
                     optimizer_params=dict(
                         max_epochs=750,
                         learning_rate=0.001,
                         early_stopping=50,
                         plot=False,
                         verbose=False
                     ))
for params in GridSearchCV(param_grid=param_grid).gen_params():
    logreg = LogisticRegression(**logreg_params).set_params(**params)
    logreg.fit(X_train[train], one_hot(y_train[train]), X_val=X_train[val], y_val=one_hot(y_train[val]))
    acc = logreg.evaluate(X_train[val], one_hot(y_train[val]))
    print "{0:.5f} at {1}".format(acc, val_acc, params)
    
# (Sorted)
#     0.99590 val at {'learning_rate': 0.005, 'L2': 3.1622776601683795}
#     0.99610 val at {'learning_rate': 0.005, 'L2': 0.31622776601683794}
#     0.99710 val at {'learning_rate': 0.005, 'L2': 0.0031622776601683794}
#     0.99710 val at {'learning_rate': 0.005, 'L2': 10.0}
#     0.99730 val at {'learning_rate': 0.005, 'L2': 0.031622776601683791}
#     0.99760 val at {'learning_rate': 0.005, 'L2': 0.0}
#     0.99770 val at {'learning_rate': 0.005, 'L2': 0.0001}
#     0.99780 val at {'learning_rate': 0.005, 'L2': 0.001}
#     0.99780 val at {'learning_rate': 0.005, 'L2': 0.01}
#     0.99790 val at {'learning_rate': 0.005, 'L2': 0.1}
#     0.99790 val at {'learning_rate': 0.005, 'L2': 1.0}
# [*] 0.99810 val at {'learning_rate': 0.005, 'L2': 0.00031622776601683794}

RBM

params


In [ ]:
X, y = load_mnist('train', 'data/')
indices, _ = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=4.005/60., stratify=True)
X = X[indices]
X = X[:4000]
X /= 255.
param_grid = dict(
    n_hidden=[128, 256, 384],
    learning_rate=[0.05, 0.01, 0.005, '0.05->0.005', '0.01->0.001'],
    k=[1, 4],
    random_seed=[1337, 42],
)
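# note: string values such as '0.05->0.005' or '0.5->0.99' appear to encode schedules
# that anneal the parameter from the first value to the second over training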
rbm = RBM(persistent=True,
          n_epochs=40,
          early_stopping=12,
          momentum='0.5->0.99',
          batch_size=10,
          verbose=False)
done = 0
for thr in (False, True):
    if thr:
        X = (X > 0.5).astype(np.float32)
    for params in GridSearchCV(param_grid=param_grid).gen_params(): # 60 combinations per pass
        done += 1
        rbm.reset_params().set_params(**params)
        rbm.fit(X)
        mse = rbm.best_recon
        dirpath = 'tmp/rbm_ge0.5/' if thr else 'tmp/rbm/'
        rbm.save(dirpath + '{0:.5f}.json'.format(mse))
        print "mse {0:.5f} [{1}/120] at {2}!".format(mse, done, params)
# (Sorted)
# [*] mse 0.06684 [25/120] at {'k': 1, 'random_seed': 1337, 'learning_rate': '0.01->0.001', 'n_hidden': 256}!
# ...

In [3]:
rbm = load_model('models/rbm.json')

In [4]:
plot_rbm_filters(rbm.best_W)
plt.savefig('rbm_filters.png')


nudge and try again


In [13]:
# non-random nudging in all directions

X, y = load_mnist('train', 'data/')
X /= 255.
indices, _ = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=4.005/60., stratify=True)
X = X[indices]
X = X[:4000]

X_aug = []
# the four deterministic one-pixel shifts; each (x_shift, y_shift) range is collapsed
# to a single value, so RandomShift is not actually random here
shifts = [((-1, -1), ( 0,  0)),
          (( 1,  1), ( 0,  0)),
          (( 0,  0), ( 1,  1)),
          (( 0,  0), (-1, -1))]
for x in X:
    X_aug.append(x)
    for x_shift, y_shift in shifts:
        for t in RandomAugmentator(transform_shape=(28, 28), out_shape=(784,))\
                 .add('RandomShift', x_shift=x_shift, y_shift=y_shift)\
                 .transform_x(x, 1):
            X_aug.append(t)
X_aug = np.asarray(X_aug)
np.save('data/X_rbm_small.npy', X_aug)
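For intuition, such a one-pixel "nudge" amounts to translating the 28x28 image and zero-filling the exposed border; a plain-numpy sketch of that operation (an illustration only, not the RandomShift implementation):

import numpy as np

def shift_image(flat_img, dx, dy, shape=(28, 28)):
    # translate a flattened greyscale image by (dx, dy) pixels, zero-filling the border
    img = flat_img.reshape(shape)
    h, w = shape
    shifted = np.zeros_like(img)
    shifted[max(0, dy):min(h, h + dy), max(0, dx):min(w, w + dx)] = \
        img[max(0, -dy):min(h, h - dy), max(0, -dx):min(w, w - dx)]
    return shifted.reshape(-1)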

In [6]:
X_aug = np.load('data/X_rbm_small.npy')

In [ ]:
param_grid = dict(
    learning_rate=['0.01->0.005', '0.05->0.001', '0.05->0.005', '0.01->0.001'],
    batch_size=[5, 10, 20, 40],
    random_seed=[1337, 42],
)
rbm = RBM(n_hidden=256,
          k=1,
          persistent=True,
          n_epochs=60,
          early_stopping=12,
          momentum='0.5->0.99',
          verbose=True)
done = 0
GS = GridSearchCV
for params in GS(param_grid=param_grid).gen_params():
    done += 1
    if done <= 16:
        continue
    rbm.reset_params().set_params(**params)
    rbm.fit(X_aug)
    mse = rbm.best_recon
    rbm.save('tmp/rbm_{0:.5f}.json'.format(mse))
    print "mse {0:.5f} [{1}/40] at {2}!".format(mse, done, params)
# (Sorted:) 
# [*] mse 0.06809 [19/40] at {'learning_rate': '0.05->0.001', 'random_seed': 1337, 'batch_size': 20}!
# ...

extract and save features


In [ ]:
rbm = load_model('models/rbm.json')
X, _ = load_mnist('train', 'data/')
X /= 255.
F = np.dot(X, rbm.best_W) + rbm.hb # rbm.propup(X)
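# F = the raw (linear) hidden-layer activations X.dot(best_W) + hidden bias,
# standardized below before being used as features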
# F.min(), F.max(), F.mean() --> -3773.89447221 2.30920675476 -140.968359014
F = StandardScaler().fit_transform(F)
np.save('data/rbm_train.npy', F)

Approach #RBM


In [ ]:
X_train = np.load('data/rbm_train.npy')    
_, y_train = load_mnist('train', 'data/')

tts = TrainTestSplitter(shuffle=True, random_seed=1337)
train, val = tts.split(y_train, train_ratio=50005./60000., stratify=True) # 49999 : 10001

param_grid = dict(
    L2=np.logspace(-6., 1., 15),
)
logreg_params = dict(n_batches=32,
                     random_seed=1337,
                     optimizer_params=dict(
                         max_epochs=750,
                         learning_rate=0.001,
                         early_stopping=50,
                         plot=False,
                         verbose=False
                     ))
for params in GridSearchCV(param_grid=param_grid).gen_params():
    logreg = LogisticRegression(**logreg_params).set_params(**params)
    logreg.fit(X_train[train], one_hot(y_train[train]), X_val=X_train[val], y_val=one_hot(y_train[val]))
    acc = logreg.evaluate(X_train[val], one_hot(y_train[val]))
    print "{0:.5f} at {1}".format(acc, val_acc, params)
    
# 0.91800 test 0.92251 val at {'L2': 9.9999999999999995e-07}
# 0.91760 test 0.92241 val at {'L2': 3.1622776601683792e-06}
# ... D:

Neural Network

#1 augment data (x5) for NN and save to file


In [7]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
X = X.astype(np.float32)
aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)
aug.add('RandomRotate', angle=(-5., 7.))
aug.add('RandomGaussian', sigma=(0., 0.5))
aug.add('RandomShift', x_shift=(-1, 1), y_shift=(-1, 1))
X_aug = aug.transform(X, 4)
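# transform(X, 4) presumably returns each original image plus 4 augmented copies (5 rows per sample),
# matching np.repeat(y, 5) below and the "(x5)" in the heading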
y_aug = np.repeat(y, 5)
y_aug = one_hot(y_aug)
np.save('data/X_aug_nn.npy', X_aug)
np.save('data/y_aug_nn.npy', y_aug)

load data


In [2]:
X = np.load('data/X_aug_nn.npy')
y = np.load('data/y_aug_nn.npy')
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=29./30.)

NN models


In [ ]:
print "Loading data ..."
X = np.load('data/X_aug_nn.npy')#[:30000]
y = np.load('data/y_aug_nn.npy')#[:30000]
train, test = TrainTestSplitter(shuffle=True, random_seed=1337).split(y, train_ratio=29./30.)

nn = NNClassifier(layers=[
                  FullyConnected(512),
                  Activation('leaky_relu'),
                  FullyConnected(256),
                  Activation('leaky_relu'),
                  FullyConnected(128),
                  Activation('leaky_relu'),
                  FullyConnected(32),
                  Activation('leaky_relu'),
                  FullyConnected(10),
                  Activation('softmax')
              ],
              n_batches=1024,
              shuffle=True,
              random_seed=1337,
              optimizer_params=dict(
                  max_epochs=100,
                  early_stopping=20,
                  verbose=True,
                  plot=True,
                  plot_dirpath='learning_curves_NN/',
                  learning_rate=1e-4
              ))
print "Initializing NN ..."
nn.fit(X[train], y[train], X_val=X[test], y_val=y[test])
print nn.evaluate(X[train], y[train], 'accuracy_score')

#  1) validation accuracy --> 0.9929
#  2) 512-256-128-32-10   Dropout(0.1)   --> 0.9906
#  3) 512-256-128-32-10   Dropout(0.2)   --> 0.9897
#  4) 600-300-128-32-10                  --> 0.9879
#  5) 600-300-128-32-10   Dropout(0.1)   --> 0.9914
#  6) 600-300-128-32-10   Dropout(0.12)  --> 0.9895
#  7) 800-400-200-100-10  Dropout(0.12)  --> 0.9929
#  8) 1024-512-256-128-10 Dropout(0.12)  --> 0.9944
#  9) 1024-D.05-768-D.1-256-128-10       --> 0.9905
# 10.a) 1024-768-256-128-10 Dropout(0.1) --> 0.9923
# 10.b) 1024-768-256-128-10 Dropout(0.2) --> 0.9892
# 10.c) 1024-768-256-128-10 Dropout(1/4) --> 0.9857
# 10.d) 1024-768-256-128-10 Dropout(0.5) --> 0.9686
# (...)

#2 more thorough augmentation


In [ ]:
X, y = load_mnist(mode='train', path='data/')
X = X / 255.
X = X.astype(np.float32)

tts = TrainTestSplitter(shuffle=False, random_seed=1337)
train, val = tts.split(y, train_ratio=55005.98/60000., stratify=True) # 55k : 5k
X_train, y_train, X_val, y_val = X[train], y[train], X[val], y[val]

y_val = one_hot(y_val)

np.save('data/nn_X_val.npy', X_val)
np.save('data/nn_y_val.npy', y_val)

aug = RandomAugmentator(transform_shape=(28, 28), random_seed=1337)
aug.add('RandomRotate', angle=(-5., 7.))
aug.add('RandomGaussian', sigma=(0., 0.5))
aug.add('RandomShift', x_shift=(-1, 1), y_shift=(-1, 1))

X_train = aug.transform(X_train, 4)
y_train = np.repeat(y_train, 5)
y_train = one_hot(y_train)

np.save('data/nn_X_train.npy', X_train)
np.save('data/nn_y_train.npy', y_train)

In [2]:
# 1.a) 1024-D.05-768-D.1-256-128-10  --> 0.9880
# 1.b) 1024-D.05-768-D.05-256-128-10 --> 0.9868
# 1.c) 1024-768-256-128-10           --> 0.9896
# 2)   1000-800-800-500-250-10       --> 0.9824
# ... --> 0.9838
# (...)
# WORSE!

back to #1


In [ ]:
#     11) 800-1024-512-256-128              --> 0.9933
#     12) 1337-911-666-128                  --> 0.9923
#     13) 800-D.05-1024-D.1-512-256-128     --> 0.9936
#     14) 800-D.05-1024-D.1-512-D.1-256-128 --> 0.9928
#     15) 1337-D.05-911-D.1-666-128         --> 0.9939
# [*] 16) 1337-D.05-911-D.1-666-333-128     --> 0.9948
#     17) 1337-D.1-911-D.2-666-333-128      --> 0.9887
#     18) ...                               --> 0.9930
#     19) ...                               --> 0.9935
#     20) 2048-D.1-1337-D.2-666-333         --> 0.9896
#     21) 2048-D.15-1337-D.25-666-333       --> 0.9723
#     22) 2048-D.05-1337-D.1-666-333        --> 0.9936
#     23) 2048-D.1-1337-D.2-666-333-128     --> 0.9892
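For reference, the winning shorthand '1337-D.05-911-D.1-666-333-128' (#16) maps onto a layer list roughly like the sketch below. The Dropout class (taking the drop probability as its single argument) and the trailing 10-way softmax output are assumptions read off the shorthand, not something shown elsewhere in this notebook:

# sketch only: the Dropout import is an assumption, see the note above
from ml_mnist.nn.layers import FullyConnected, Activation, Dropout

layers_16 = [
    FullyConnected(1337), Activation('leaky_relu'), Dropout(0.05),
    FullyConnected(911),  Activation('leaky_relu'), Dropout(0.1),
    FullyConnected(666),  Activation('leaky_relu'),
    FullyConnected(333),  Activation('leaky_relu'),
    FullyConnected(128),  Activation('leaky_relu'),
    FullyConnected(10),   Activation('softmax'),
]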

Gaussian Processes

some benchmarks

exact linear-system solving


In [8]:
X, y = load_mnist('train', 'data/')
X /= 255.
y = one_hot(y)
gp = GPClassifier(algorithm='exact')
gp


Out[8]:
GPClassifier(K_=None, algorithm='exact', cg_max_iter=None, cg_tol=1e-05,
       f_=None, kernel='rbf', kernel_params={}, lml_=None, max_iter=100,
       n_samples=1000, pi_=None, random_seed=None, sigma_n=0.0, tol=1e-05)

In [ ]:
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:10], y[:10])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:100], y[:100])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:1000], y[:1000])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:2000], y[:2000])
# Elapsed time: 0.046 sec
# Elapsed time: 0.518 sec
# Elapsed time: 59.686 sec
# Elapsed time: 298.424 sec

via CG


In [ ]:
gp = GPClassifier(algorithm='cg')
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:10], y[:10])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:100], y[:100])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:1000], y[:1000])
gp.reset_K()
with Stopwatch(verbose=True):
    gp.fit(X[:2000], y[:2000])
# Elapsed time: 0.044 sec
# Elapsed time: 0.262 sec
# Elapsed time: 50.412 sec
# Elapsed time: 259.823 sec
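The two modes differ in how the linear systems involving the kernel matrix are solved: a direct solve versus conjugate gradients run to a tolerance. A toy numpy/scipy comparison of that trade-off (an illustration only, not the GPClassifier internals):

import numpy as np
from scipy.sparse.linalg import cg

rng = np.random.RandomState(0)
A = rng.randn(500, 500)
K = A.dot(A.T) + 500. * np.eye(500)   # SPD matrix standing in for a (regularized) kernel matrix
b = rng.randn(500)

x_direct = np.linalg.solve(K, b)      # direct solve: ~O(n^3), no tolerance to tune
x_cg, info = cg(K, b, tol=1e-7)       # CG: only matrix-vector products, stops at the tolerance
print np.allclose(x_direct, x_cg, atol=1e-6), info   # info == 0 means CG converged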

Approach #1. Raw data
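The grids below sweep the RBF kernel's sigma and gamma, with gamma tied to a length scale via gamma = 0.5 / length_scale**2. Assuming the usual squared-exponential form k(x, x') = sigma^2 * exp(-gamma * ||x - x'||^2) (the exact parameterization inside GPClassifier may differ), a reference sketch:

import numpy as np

def rbf_kernel(X1, X2, sigma=1., gamma=0.5):
    # squared Euclidean distances between all pairs of rows
    sq_dists = (np.sum(X1 ** 2, axis=1)[:, None]
                + np.sum(X2 ** 2, axis=1)[None, :]
                - 2. * np.dot(X1, X2.T))
    return sigma ** 2 * np.exp(-gamma * np.maximum(sq_dists, 0.))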


In [2]:
sigma_n = np.concatenate(([0], np.logspace(-8., -4., 2)))
length_scale = np.logspace(-1., 2., 19)
gamma = 0.5/length_scale**2
# sigma_f = np.logspace(-2., 2., 7)
param_grid = ({'sigma_n': sigma_n, 
              'kernel_params': [dict(sigma=1., gamma=gamma_) for gamma_ in gamma]},
             {'sigma_n': sigma_n, 
              'kernel_params': [dict(sigma=0.1, gamma=gamma_) for gamma_ in gamma]},
             {'sigma_n': sigma_n, 
              'kernel_params': [dict(sigma=10., gamma=gamma_) for gamma_ in gamma]})
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', random_seed=1337, tol=1e-8, cg_tol=1e-7, n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
print grid_cv.number_of_combinations()


171

In [3]:
[params for params in grid_cv.gen_params()][:3]


Out[3]:
[{'kernel_params': {'gamma': 49.999999999999993, 'sigma': 1.0},
  'sigma_n': 0.0},
 {'kernel_params': {'gamma': 49.999999999999993, 'sigma': 1.0},
  'sigma_n': 1e-08},
 {'kernel_params': {'gamma': 49.999999999999993, 'sigma': 1.0},
  'sigma_n': 0.0001}]

In [4]:
X, y = load_mnist(mode='train', path='data/')
X /= 255.
st = StandardScaler(copy=False, with_mean=True, with_std=False)
X = st.fit_transform(X)
tts = TrainTestSplitter(random_seed=1337, shuffle=True)
indices, _ = tts.split(y, train_ratio=0.02, stratify=True) # 1195 samples
X = X[indices]
y = y[indices]

In [ ]:
grid_cv.fit(X, y);
# Training GPClassifier on 1195 samples x 784 features.
# 2-fold CV for each of 171 params combinations == 342 fits ...

# iter:   1/342 +- elapsed:  21.159 sec   ...
# iter:   2/342 ++ elapsed:  35.444 sec - mean acc.: 0.1113 +/- 2 * 0.014
# iter:   3/342 +- elapsed:  49.669 sec - best acc.: 0.1113 at {'kernel_params': {'sigma': 1.0, 'gamma': 49.999999999999993}, 'sigma_n': 0.0}
# ...
# ...
# ...
# iter: 340/342 ++convergence is not reached
#  elapsed: 16914.8 sec - mean acc.: 0.1046 +/- 2 * 0.001
# iter: 341/342 +-convergence is not reached
#  elapsed: 17005.8 sec - best acc.: 0.6686 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.049999999999999989}, 'sigma_n': 0.0}
# iter: 342/342 ++convergence is not reached
#  elapsed: 17083.8 sec - mean acc.: 0.1046 +/- 2 * 0.001

In [7]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_raw_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(25).to_excel('cv_results/gp_raw_best.xlsx')

Approach #2. PCA

load data


In [26]:
pca_full = load_model('models/pca_full.json')
X, y = load_mnist(mode='train', path='data/')
X /= 255.
# st = StandardScaler(copy=False, with_mean=True, with_std=False)
# X = st.fit_transform(X)
tts = TrainTestSplitter(random_seed=1337, shuffle=True)
indices, _ = tts.split(y, train_ratio=0.02, stratify=True) # 1195 samples
X = X[indices]
y = y[indices]

PCA w/o whitening


In [ ]:
# for n_components in xrange(5, 151, 5):
for n_components in xrange(12, 25):
    gamma = np.array([0.3, 0.6, 1.5, 3.0, 5.1]) / n_components
    param_grid = {'sigma_n': [1e-8],
                  'kernel_params': [dict(sigma=0.1, gamma=gamma_) for gamma_ in gamma]}
    grid_cv_params = dict(model=GPClassifier(algorithm='cg', random_seed=1337, tol=1e-8, cg_tol=1e-7, n_samples=1500), 
                           param_grid=param_grid,
                           train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                           n_splits=2, 
                           refit=True,
                           verbose=True)
    print "[PCA n_components = {0}]\n\n".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=False).transform(X)
    grid_cv = GridSearchCV(**grid_cv_params).fit(X_current, y)
    grid_cv.to_df()\
           .sort_values(by='mean_score', ascending=False)\
           .to_excel('cv_results/gp_pca_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv.best_score_))
    print "\n\n"
# [PCA n_components = 5]


# Training GPClassifier on 1195 samples x 5 features.
# 2-fold CV for each of 5 params combinations == 10 fits ...

# iter:  1/10 +- elapsed:  11.139 sec   ...
# iter:  2/10 ++ elapsed:  22.108 sec - mean acc.: 0.5940 +/- 2 * 0.040
# iter:  3/10 +- elapsed:  32.947 sec - best acc.: 0.5940 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.059999999999999998}, 'sigma_n': 1e-08}
# iter:  4/10 ++ elapsed:  41.796 sec - mean acc.: 0.6384 +/- 2 * 0.034
# iter:  5/10 +- elapsed:  49.648 sec - best acc.: 0.6384 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.12}, 'sigma_n': 1e-08}
# iter:  6/10 ++ elapsed:  56.744 sec - mean acc.: 0.6728 +/- 2 * 0.018
# iter:  7/10 +- elapsed:  63.334 sec - best acc.: 0.6728 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.29999999999999999}, 'sigma_n': 1e-08}
# iter:  8/10 ++ elapsed:  70.164 sec - mean acc.: 0.6410 +/- 2 * 0.012
# iter:  9/10 +- elapsed:  75.789 sec - best acc.: 0.6728 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.29999999999999999}, 'sigma_n': 1e-08}
# iter: 10/10 ++ elapsed:  81.808 sec - mean acc.: 0.5172 +/- 2 * 0.003
# ...
# ...
# ...
# iter:  8/10 ++ elapsed:  80.497 sec - mean acc.: 0.7422 +/- 2 * 0.032
# iter:  9/10 +- elapsed:  85.950 sec - best acc.: 0.7481 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.042857142857142858}, 'sigma_n': 1e-08}
# iter: 10/10 ++ elapsed:  91.393 sec - mean acc.: 0.5288 +/- 2 * 0.012



# [PCA n_components = 40]


# Training GPClassifier on 1195 samples x 40 features.
# 2-fold CV for each of 5 params combinations == 10 fits ...

# iter:  1/10 +- elapsed:  14.542 sec   ...
# iter:  2/10 ++ elapsed:  28.153 sec - mean acc.: 0.5832 +/- 2 * 0.016
# iter:  3/10 +- elapsed:  39.689 sec - best acc.: 0.5832 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.0074999999999999997}, 'sigma_n': 1e-08}

more thoroughly


In [ ]:
n_components = 20
whiten = False

X = pca_full.set_params(n_components=n_components, whiten=whiten).transform(X)
X = StandardScaler(copy=False, with_mean=True, with_std=False).fit_transform(X)

sigma_n = [0., 1e-8, 1e-6, 1e-4, 1e-2]
sigma_f = np.logspace(-2., 1., 6)
gamma = np.linspace(0.04, 0.12, 16, True)
param_grid = [{'sigma_n': sigma_n, 'kernel_params': [dict(sigma=sigma, gamma=gamma_) for gamma_ in gamma]} for sigma in sigma_f]
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', 
                                          random_seed=1337, 
                                          max_iter=200, 
                                          tol=1e-8, 
                                          cg_tol=1e-7, 
                                          n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
grid_cv.number_of_combinations() # 480
grid_cv.fit(X, y);
# Training GPClassifier on 1195 samples x 20 features.
# 2-fold CV for each of 480 params combinations == 960 fits ...

# iter:   1/960 +- elapsed:   1.409 sec   ...
# iter:   2/960 ++ elapsed:   2.614 sec - mean acc.: 0.6368 +/- 2 * 0.023
# iter:   3/960 +- elapsed:   3.875 sec - best acc.: 0.6368 at {'kernel_params': {'sigma': 0.01, 'gamma': 0.040000000000000001}, 'sigma_n': 0.0}
# ...
# ...
# ...
# iter: 958/960 ++convergence is not reached
#  elapsed: 9239.23 sec - mean acc.: 0.2006 +/- 2 * 0.075
# iter: 959/960 +-convergence is not reached
#  elapsed: 9253.45 sec - best acc.: 0.8677 at {'kernel_params': {'sigma': 0.63095734448019303, 'gamma': 0.082666666666666666}, 'sigma_n': 0.0}
# iter: 960/960 ++convergence is not reached
#  elapsed: 9267.46 sec - mean acc.: 0.7169 +/- 2 * 0.094

In [30]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_2_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(64).to_excel('cv_results/gp_2_best.xlsx')

PCA whitening


In [ ]:
for n_components in xrange(5, 151, 5):
    gamma = np.array([0.3, 0.6, 1.5, 3.0, 5.1]) / n_components
    param_grid = {'sigma_n': [1e-8],
                  'kernel_params': [dict(sigma=0.1, gamma=gamma_) for gamma_ in gamma]}
    grid_cv_params = dict(model=GPClassifier(algorithm='cg', random_seed=1337, tol=1e-8, cg_tol=1e-7, n_samples=1500), 
                           param_grid=param_grid,
                           train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                           n_splits=2, 
                           refit=True,
                           verbose=True)
    print "[PCA n_components = {0}]\n\n".format(n_components)
    X_current = pca_full.set_params(n_components=n_components, whiten=True).transform(X)
    grid_cv = GridSearchCV(**grid_cv_params).fit(X_current, y)
    grid_cv.to_df()\
           .sort_values(by='mean_score', ascending=False)\
           .to_excel('cv_results/gp_pca_whiten_{0}_{1:.4f}.xlsx'.format(n_components, grid_cv.best_score_))
    print "\n\n"
# the best is 0.79.. <-- worse

Approach #3. Exponential decay on the normalized explained variance


In [ ]:
pca_full = load_model('models/pca_full.json')
X, y = load_mnist(mode='train', path='data/')
X /= 255.
tts = TrainTestSplitter(random_seed=1337, shuffle=True)
indices, _ = tts.split(y, train_ratio=0.03, stratify=True) # 1794 samples
X = X[indices]
y = y[indices]
X = pca_full.set_params(n_components=20, whiten=True).transform(X)
X = StandardScaler(copy=False, with_mean=True, with_std=False).fit_transform(X)
z = pca_full.explained_variance_ratio_[:20]
z /= sum(z)
train, test = tts.split(y, train_ratio=0.5, stratify=True)
# for alpha in np.logspace(-6., 2., 9):
# for alpha in np.logspace(-3., 1.2, 9):
# for alpha in np.arange(1.4, 9.8, 0.4):
# for alpha in np.arange(5.4, 6.6, 0.1):
for alpha in np.arange(6.05, 6.15, 0.01):
    X_train = X[train] * np.exp(alpha * z)
    X_test  = X[test]  * np.exp(alpha * z)
    gp = GPClassifier(algorithm='cg',
                      sigma_n=1e-8,
                      kernel_params=dict(sigma=0.1, gamma=0.075),
                      n_samples=1500,
                      tol=1e-7,
                      max_iter=200,
                      random_seed=1337,
                      cg_tol=1e-7)
    gp.fit(X_train, y[train])
    acc = gp.evaluate(X_test, y[test])
    print "{0:.4f}, alpha = {1}".format(acc, alpha)
# 0.8122, alpha = 0.01
# 0.8111, alpha = 0.1
# 0.8211, alpha = 1.0
# -----
# 0.8111, alpha = 0.125892541179
# 0.8122, alpha = 0.421696503429
# 0.8244, alpha = 1.41253754462
# 0.8511, alpha = 4.73151258961
# 0.4056, alpha = 15.8489319246
# -----
# 0.8478, alpha = 4.6
# 0.8500, alpha = 5.0
# 0.8433, alpha = 5.4
# 0.8578, alpha = 5.8
# 0.8544, alpha = 6.2
# 0.8500, alpha = 6.6
# ----
# 0.8578, alpha = 5.9
# 0.8578, alpha = 6.0
# 0.8589, alpha = 6.1
# 0.8544, alpha = 6.2
# ----
# 0.8556, alpha = 6.08
# 0.8589, alpha = 6.09
# 0.8589, alpha = 6.1
# 0.8556, alpha = 6.11

more thoroughly


In [18]:
pca_full = load_model('models/pca_full.json')
X, y = load_mnist(mode='train', path='data/')
X /= 255.
# st = StandardScaler(copy=False, with_mean=True, with_std=False)
# X = st.fit_transform(X)
tts = TrainTestSplitter(random_seed=1337, shuffle=True)
indices, _ = tts.split(y, train_ratio=0.02, stratify=True) # 1195 samples
X = X[indices]
y = y[indices]

scale by exp(alpha * z) first, then mean-center


In [ ]:
n_components = 20
whiten = True
X = pca_full.set_params(n_components=n_components, whiten=whiten).transform(X)
alpha = 6.1

z = pca_full.explained_variance_ratio_[:20]
z /= sum(z)
X *= np.exp(alpha * z)
X = StandardScaler(copy=False, with_mean=True, with_std=False).fit_transform(X)

sigma_n = [0., 1e-4, 1e-2]
sigma_f = [0.1, 0.3, 0.5, 0.7, 0.9]
gamma = np.linspace(0.08, 0.11, 7, True)
param_grid = [{'sigma_n': sigma_n, 'kernel_params': [dict(sigma=sigma, gamma=gamma_) for gamma_ in gamma]} for sigma in sigma_f]
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', 
                                          random_seed=1337, 
                                          max_iter=200, 
                                          tol=1e-8, 
                                          cg_tol=1e-7, 
                                          n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
print grid_cv.number_of_combinations() # 105
grid_cv.fit(X, y);
# Training GPClassifier on 1195 samples x 20 features.
# 2-fold CV for each of 105 params combinations == 210 fits ...

# iter:   1/210 +-convergence is not reached
#  elapsed:   9.902 sec   ...
# iter:   2/210 ++convergence is not reached
#  elapsed:  19.138 sec - mean acc.: 0.7798 +/- 2 * 0.030
# iter:   3/210 +-convergence is not reached
#  elapsed:  28.945 sec - best acc.: 0.7798 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.080000000000000002}, 'sigma_n': 0.0}
# ...
# ...
# ...
# iter: 208/210 ++convergence is not reached
#  elapsed: 2135.44 sec - mean acc.: 0.7606 +/- 2 * 0.022
# iter: 209/210 +-convergence is not reached
#  elapsed: 2145.34 sec - best acc.: 0.8702 at {'kernel_params': {'sigma': 0.7, 'gamma': 0.080000000000000002}, 'sigma_n': 0.01}
# iter: 210/210 ++convergence is not reached
#  elapsed: 2155.47 sec - mean acc.: 0.7615 +/- 2 * 0.022

In [12]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_3_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(64).to_excel('cv_results/gp_3_best.xlsx')

Approach #4. RQ Kernel

4.1 find reasonable ranges for params for PCA-20
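Assuming the standard rational quadratic form k(x, x') = sigma^2 * (1 + ||x - x'||^2 / (2 * alpha * l^2))^(-alpha), which matches the sigma / alpha / l parameters swept below (again, the exact parameterization inside GPClassifier may differ), a reference sketch:

import numpy as np

def rq_kernel(X1, X2, sigma=0.1, alpha=1., l=1.):
    sq_dists = (np.sum(X1 ** 2, axis=1)[:, None]
                + np.sum(X2 ** 2, axis=1)[None, :]
                - 2. * np.dot(X1, X2.T))
    return sigma ** 2 * (1. + np.maximum(sq_dists, 0.) / (2. * alpha * l ** 2)) ** (-alpha)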


In [ ]:
n_components = 20
whiten = False

X = pca_full.set_params(n_components=n_components, whiten=whiten).transform(X)
X = StandardScaler(copy=False, with_mean=True, with_std=False).fit_transform(X)

sigma_n = [0., 1e-8, 1e-6]
l = np.logspace(-1., 2., 12)
alpha = np.logspace(0., 2., 5)

param_grid = [{'sigma_n': sigma_n, 
               'kernel_params': [dict(sigma=0.1, alpha=alpha_, l=l_) for alpha_ in alpha]} for l_ in l]
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', 
                                          kernel='RationalQuadratic',
                                          random_seed=1337, 
                                          max_iter=200, 
                                          tol=1e-8, 
                                          cg_tol=1e-7, 
                                          n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
grid_cv.number_of_combinations() # 180
grid_cv.fit(X, y);
# Training GPClassifier on 1195 samples x 20 features.
# 2-fold CV for each of 180 params combinations == 360 fits ...

# iter:   1/360 +-convergence is not reached
#  elapsed:   8.978 sec   ...
# iter:   2/360 ++convergence is not reached
#  elapsed:  17.204 sec - mean acc.: 0.1138 +/- 2 * 0.002
# iter:   3/360 +-convergence is not reached
#  elapsed:  26.859 sec - best acc.: 0.1138 at {'kernel_params': {'alpha': 1.0, 'sigma': 0.1, 'l': 0.10000000000000001}, 'sigma_n': 0.0}
# ...
# ...
# ...
# iter: 358/360 ++convergence is not reached
#  elapsed: 2948.18 sec - mean acc.: 0.1121 +/- 2 * 0.000
# iter: 359/360 +-convergence is not reached
#  elapsed: 2959.30 sec - best acc.: 0.8025 at {'kernel_params': {'alpha': 100.0, 'sigma': 0.1, 'l': 2.3101297000831593}, 'sigma_n': 0.0}
# iter: 360/360 ++convergence is not reached
#  elapsed: 2971.48 sec - mean acc.: 0.1121 +/- 2 * 0.000

In [20]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_rq_full.xlsx')
df.sort_values(by='mean_score', ascending=False).head(64).to_excel('cv_results/gp_rq_best.xlsx')

[discarded] 4.2 PCA components (with or without whitening)

[discarded] 4.3 Exponential decay ...

Approach #NN


In [ ]:
X = np.load('data/train_feats.npy')
_, y = load_mnist('train', 'data/')
tts = TrainTestSplitter(shuffle=True, random_seed=1337)
indices, _ = tts.split(y, train_ratio=1300./60000., stratify=True)
y = y[indices]
y = one_hot(y)
X = X[indices]

# sigma_n = [0., 1e-4, 1e-2]
sigma_n = [0, 1e-8, 1e-6]
# sigma_f = [0.1, 1., 10.]
# sigma_f = np.logspace(-1., 1., 5) 
sigma_f = np.logspace(-0.9, -0.2, 5)
# length_scale = np.logspace(-1., 2., 19)
# gamm = 0.5/length_scale**2
# gamma = np.logspace(-4., -2.1, 19) 
gamma = np.logspace(-3.7, -3., 11)
param_grid = [{'sigma_n': sigma_n, 
               'kernel_params': [dict(sigma=sigma, gamma=gamma_) for gamma_ in gamma]} for sigma in sigma_f]
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', 
                                          random_seed=1337, 
                                          max_iter=200, 
                                          tol=1e-8, 
                                          cg_tol=1e-7, 
                                          n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
grid_cv.number_of_combinations() # 165 for the uncommented grid above
grid_cv.fit(X, y);

# Training GPClassifier on 1295 samples x 128 features.
# 2-fold CV for each of 171 params combinations == 342 fits ...

# iter:   1/342 +- elapsed:   3.584 sec   ...
# ...
# ...
# ...
# iter: 226/342 ++ elapsed: 4405.15 sec - mean acc.: 0.1042 +/- 2 * 0.001
# iter: 227/342 +- elapsed: 4432.84 sec - best acc.: 0.9846 at {'kernel_params': {'sigma': 0.1, 'gamma': 0.00050000000000000001}, 'sigma_n': 0.0}
# iter: 228/342 ++ elapsed: 4460.72 sec - mean acc.: 0.1042 +/- 2 * 0.001

# --------------------------------------------------------------

# Training GPClassifier on 1295 samples x 128 features.
# 2-fold CV for each of 285 params combinations == 570 fits ...

# iter:   1/570 +- elapsed:  28.242 sec   ...
# iter:   2/570 ++ elapsed:  51.745 sec - mean acc.: 0.9799 +/- 2 * 0.005
# ...
# ...
# ...
# iter: 418/570 ++ elapsed: 9589.25 sec - mean acc.: 0.5370 +/- 2 * 0.119
# iter: 419/570 +- elapsed: 9605.40 sec - best acc.: 0.9861 at {'kernel_params': {'sigma': 0.31622776601683794, 'gamma': 0.00033711476775509616}, 'sigma_n': 0.0}
# iter: 420/570 ++ elapsed: 9620.88 sec - mean acc.: 0.6041 +/- 2 * 0.119

# ---------------------------------------------------------------

# Training GPClassifier on 1096 samples x 128 features.
# 2-fold CV for each of 165 params combinations == 330 fits ...

# iter:   1/330 +-convergence is not reached
#  elapsed:  21.402 sec   ...
# iter:   2/330 ++convergence is not reached
#  elapsed:  41.644 sec - mean acc.: 0.9845 +/- 2 * 0.006
# iter:   3/330 +-convergence is not reached
#  elapsed:  59.386 sec - best acc.: 0.9845 at {'kernel_params': {'sigma': 0.12589254117941673, 'gamma': 0.00019952623149688788}, 'sigma_n': 0}
# ...
# ...
# ...
# iter: 328/330 ++convergence is not reached
#  elapsed: 7163.06 sec - mean acc.: 0.8219 +/- 2 * 0.129
# iter: 329/330 +-convergence is not reached
#  elapsed: 7184.70 sec - best acc.: 0.9899 at {'kernel_params': {'sigma': 0.42169650342858211, 'gamma': 0.00085113803820237679}, 'sigma_n': 0}
# iter: 330/330 ++convergence is not reached
#  elapsed: 7208.78 sec - mean acc.: 0.8219 +/- 2 * 0.129

In [ ]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_nn_full.xlsx')
df.sort_values(by='mean_score', ascending=False).to_excel('cv_results/gp_nn_best.xlsx')

Approach #RBM


In [ ]:
X = np.load('data/rbm_train.npy')
_, y = load_mnist('train', 'data/')
tts = TrainTestSplitter(shuffle=True, random_seed=1337)
indices, _ = tts.split(y, train_ratio=1100./60000., stratify=True) 
X = X[indices]
y = y[indices]
y = one_hot(y)

sigma_n = [0.]
sigma_f = [0.1]
length_scale = np.logspace(-1., 2., 13)
# gamma = np.logspace(-3.7, -3., 11)
gamma = np.logspace(-5., -0., 19) 
param_grid = [{'sigma_n': sigma_n, 
               'kernel_params': [dict(sigma=sigma, gamma=gamma_) for gamma_ in gamma]} for sigma in sigma_f]
grid_cv = GridSearchCV(model=GPClassifier(algorithm='cg', 
                                          random_seed=1337, 
                                          max_iter=200, 
                                          tol=1e-8, 
                                          cg_tol=1e-7, 
                                          n_samples=1500), 
                       param_grid=param_grid,
                       train_test_splitter_params=dict(shuffle=True, random_seed=1337), 
                       n_splits=2, 
                       refit=True,
                       verbose=True)
print grid_cv.number_of_combinations()
grid_cv.fit(X, y)
#
# [*] 0.683... D:

In [ ]:
df = grid_cv.to_df()
df.to_excel('cv_results/gp_rbm_full.xlsx')
df.sort_values(by='mean_score', ascending=False).to_excel('cv_results/gp_rbm_best.xlsx')

Misc

Plot confusion matrices for final models


In [2]:
_, y_test = load_mnist('test', 'data/')
# knn_pred = np.load('data/knn_pred.npy')
# nn_pred = unhot(np.load('data/nn_pred.npy'))
# logreg_pred = unhot(np.load('data/logreg_pred.npy'))
gp_pred = unhot(np.load('data/gp_pred.npy'))

In [3]:
# C = confusion_matrix(y_test, knn_pred)
# ax = plot_confusion_matrix(C)
# plt.title("Confusion matrix for k-NN model", fontsize=18)
# plt.savefig('confusion_matrix_knn.png', dpi=144)

# C = confusion_matrix(y_test, nn_pred)
# ax = plot_confusion_matrix(C)
# plt.title("Confusion matrix for NN model", fontsize=18)
# plt.savefig('confusion_matrix_nn.png', dpi=144)

# C = confusion_matrix(y_test, logreg_pred)
# ax = plot_confusion_matrix(C)
# plt.title("Confusion matrix for LogReg model", fontsize=18)
# plt.savefig('confusion_matrix_logreg.png', dpi=144)

C = confusion_matrix(y_test, gp_pred)
ax = plot_confusion_matrix(C)
plt.title("Confusion matrix for GP model", fontsize=18)
plt.savefig('confusion_matrix_gp.png', dpi=144)