In [ ]:
import os, gc
os.chdir( '/Users/innazarov/ownCloud/' )

In [ ]:
import warnings ; warnings.simplefilter( "ignore" )
import numpy as np, pandas as pd, flex as fl, scipy as sp

%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
mnist_npz = '/Users/innazarov/Github/study_notes/year_14_15/spring_2015/machine_learning/data/mldata/mnist_scikit.npz'
assert( os.path.exists( mnist_npz ) )
with np.load( mnist_npz, 'r' ) as npz :
    mnist_labels, mnist_data = np.asarray( npz[ 'labels' ], np.int ), npz[ 'data' ] / 255.0
mnist_classes = np.unique( mnist_labels )

In [ ]:
from sklearn import *

In [ ]:
def fast_plot( data, title, **kwargs ) :
    ## Render a batch of flattened 28x28 images on a single large axis.
    axis = plt.figure( figsize = ( 16, 9 ) ).add_subplot( 111 )
    axis.set_title( title )
    fl.plot( axis, data, shape = ( 28, -1 ), cmap = plt.cm.hot, interpolation = "nearest", **kwargs )
    plt.show( )

In [ ]:
def confusion( actual, predicted ) :
    ## Tally ( predicted, actual ) pairs into a contingency table; the explicit
    ## shape keeps the table 10x10 even if some class is never predicted.
    n_ = len( mnist_classes )
    tbl_ = sp.sparse.coo_matrix( ( np.ones_like( predicted ), ( predicted, actual ) ),
                                 shape = ( n_, n_ ) )
    tbl = pd.DataFrame( tbl_.todense( ), index = mnist_classes, columns = mnist_classes )
    tbl.index.name = "Predicted"
    tbl.columns.name = "Actual"
    return tbl


In [ ]:
random_state = np.random.RandomState( None )

In [ ]:
split_ = cross_validation.train_test_split( mnist_data, mnist_labels, test_size = 0.75,
                                            random_state = random_state )
X_train, X_test, y_train, y_test = split_

In [ ]:
print X_train.shape
print X_test.shape

Draw and plot some random subset of the train dataset.


In [ ]:
index_perm_ = np.random.permutation( X_train.shape[ 0 ] )
X = X_train[ index_perm_[ : 400 ] ]
fast_plot( X, u"200 random digits from the MNIST train dataset", n = 200 )

Simple average pattern classifier

Generate averaged images


In [ ]:
train_digits_ = { label_ : X_train[ np.flatnonzero( y_train == label_ ) ]
                           for label_ in mnist_classes }

avg_patterns_ = { label_ : samples_.mean( axis = 0 )[ np.newaxis ]
                  for label_, samples_ in train_digits_.iteritems() }

## Construct an aligned pair of arrays
average_digits = np.concatenate( [ array_ for array_ in avg_patterns_.itervalues() ], axis = 0 )
average_labels = np.array( [ label_ for label_ in avg_patterns_.iterkeys() ], dtype = np.int )
del avg_patterns_

## Sort them
order_ = np.argsort( average_labels )
average_labels = average_labels[ order_ ]
average_digits = average_digits[ order_ ]

In [ ]:
## Plot the patterns
fast_plot( average_digits, u"Averaged digits on train" )

A simple minimal-error classifier based on mean patterns. $\newcommand{\argmin}{\mathop{\mathtt{argmin}}} \newcommand{\Dcal}{\mathcal{D}}$ Basically, this is a simple Gaussian model: $$ \log \mathcal{L} = - \frac{|\Dcal|}{2} \log 2\pi - \frac{|\Dcal|}{2} \log \sigma^2 - \frac{1}{2 \sigma^2} \sum_{x\in \Dcal} ( x - \mu_{k_x} )^2 \,. $$

Classification is done with this rule: \begin{align*} \hat{y}(x) & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M ( x_{nm} - \mu_{knm} )^2 \\ & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M - 2 x_{nm} \mu_{knm} + \mu_{knm}^2 \\ & = \argmin_{k=1,\ldots,K} \, \|\mu_k\|^2 - 2 \langle x, \mu_k \rangle \,, \end{align*} where $$ \langle f, g \rangle = \sum_{n=1}^N \sum_{m=1}^M f_{nm} g_{nm}\,. $$


In [ ]:
## Compute the class-dependent part of the squared error, ||mu_k||^2 - 2 <x, mu_k> ...
yp_ = - 2 * np.tensordot( X_test, average_digits, axes = [ -1, -1 ] ) \
      + np.sum( average_digits ** 2, axis = 1 )[ np.newaxis ]
## ... and based on it find the closest label.
pred_labels_ = average_labels[ yp_.argmin( axis = 1 ) ]

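As a sanity check (a small add-on, not in the original notebook), brute-force the squared distances for a handful of test digits and confirm that the vectorized rule above picks the same labels.


In [ ]:
## Brute-force squared distances to every mean pattern for 32 test digits.
X_check_ = X_test[ : 32 ]
dist_ = ( ( X_check_[ :, np.newaxis ] - average_digits ) ** 2 ).sum( axis = 2 )
print np.all( average_labels[ dist_.argmin( axis = 1 ) ] == pred_labels_[ : 32 ] )
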
Construct the confusion matrix


In [ ]:
confusion( y_test, pred_labels_ )

Compute the accuracy


In [ ]:
print "Achieved accuracy is %.3f%%" % ( np.mean( y_test == pred_labels_ ) * 100.0, )

Random Forest FTW!

Initialize and fit a random forest to the train dataset.


In [ ]:
rfc_ = ensemble.RandomForestClassifier( n_estimators = 50, n_jobs = -1,
                                        random_state = random_state ).fit( X_train, y_train )

Predict on the test dataset.


In [ ]:
pred_labels_ = rfc_.classes_[ rfc_.predict_proba( X_test ).argmax( axis = 1 ) ]

Display the accuracy ...


In [ ]:
print "Achieved accuracy is %.3f%%" % ( np.mean( y_test == pred_labels_ ) * 100.0, )

... and the confusion matrix.


In [ ]:
confusion( y_test, pred_labels_ )

Show the pixel-feature importances of the random forest.


In [ ]:
fast_plot( rfc_.feature_importances_[ np.newaxis ],
           u"Random Forest feature importances",
           n_row = 1, n_col = 1 )

PCA

Let's use PCA to learn a linear manifold from the data. SciPy's SVD returns $U$, the singular values $\Sigma$ (as a 1-D array), and $V'$ (not $V$).


In [ ]:
scl_ = preprocessing.StandardScaler( ).fit( X_train )
U, S, V = sp.linalg.svd( scl_.transform( X_train ), full_matrices = False )
order_ = S.argsort( )[::-1]
U, S, V = U[:,order_], S[ order_ ], V[order_]

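A quick sanity check (a sketch, not in the original notebook): the factors should reproduce the standardised training data, and the rows of $V$ should be orthonormal.


In [ ]:
## Verify the factorisation Z = U S V' and the orthonormality of the rows of V.
Z_ = scl_.transform( X_train )
print np.allclose( np.dot( U * S, V ), Z_ )
print np.allclose( np.dot( V, V.T ), np.eye( V.shape[ 0 ] ) )
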
Compute variance within each principal direction.


In [ ]:
var_ = S**2
leverage_ = np.cumsum( var_ ) / np.sum( var_ )
n_components = np.min( np.flatnonzero( leverage_ > 0.95 ) ) + 1

print """The least number of principal components required to """ \
    + """guarantee at least 95%% reconstruction is %d.""" % ( n_components, )

plt.plot( leverage_ )
plt.axhline( y = 0.95, color = 'k', lw = 2 )

Choose the number of components.


In [ ]:
n_components = 256

Compute the principal component scores (the projections of the data onto the leading principal directions).


In [ ]:
pc_ = U[:,:n_components] * S[ :n_components ]

Embed the projections back into the original space.


In [ ]:
X_ = scl_.inverse_transform( np.dot( pc_, V[:n_components] ) )

The low-rank reconstructions of the training digits look as follows:


In [ ]:
fast_plot( X_, u"Low-rank reconstruction of the train dataset", n = 512 )

See how well it fares on the test dataset: reconstruct ...


In [ ]:
pc_ = np.dot( scl_.transform( X_test ), V[:n_components].T )
X_ = scl_.inverse_transform( np.dot( pc_, V[:n_components] ) )

... and display


In [ ]:
fast_plot( np.abs(X_-X_test), u"Low rank reconstruction error of the test dataset", n = 32 )

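A single-number summary (a small add-on, not in the original notebook): the average per-pixel squared reconstruction error on the test set.


In [ ]:
## Mean per-pixel squared reconstruction error of the rank-256 approximation.
print "Mean squared reconstruction error: %.5f" % ( np.mean( ( X_ - X_test ) ** 2 ), )
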
Naive Bayes

Let's use a Gaussian Naive Bayes classifier. $\newcommand\given{\:\vert\:}$

The Naive Bayes classifier assumes that the features are conditionally independent, provided the class label $y$ is known: $$ p\bigl( x_1,\ldots,x_d \:\big\vert\: y, \Theta\bigr) = \prod_{i=1}^d p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,, $$ whence, using Bayes' rule: \begin{align*} p\bigl( y \:\big\vert\: x, \Theta\bigr) &= p\bigl( x \:\big\vert\: y, \Theta\bigr) \frac{p(y\:\big\vert\: \Theta)}{p(x\:\big\vert\: \Theta)} \\ &\propto p\bigl(y\:\big\vert\: \Theta\bigr) \prod_{i=1}^d p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,, \end{align*} where $\Theta$ is the parameter vector of the model.

Prediction is done using the maximum posterior probability principle: $\newcommand{\argmax}{\mathop{\mathtt{argmax}}}$ $$ \hat{y}(x) = \argmax_{y} \, \log p\bigl(y\:\big\vert\: \Theta\bigr) + \sum_{i=1}^d \log p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,. $$

Fitting the Naive Bayes classifier is done using the maximum likelihood (ML) approach: $$ \hat{\Theta} = \argmax_{\Theta} \, \sum_{j=1}^J \log p\bigl(y_j\:\big\vert\: \Theta\bigr) + \sum_{j=1}^J \sum_{i=1}^d \log p\bigl( x_{ji} \:\big\vert\: y_j, \Theta\bigr) \,. $$

The Gaussian NB classifier assumes that $$ p\bigl( x_i \:\big\vert\: y, \Theta\bigr) = \bigl(2\pi \sigma^2_{yi}\bigr)^{-\frac{1}{2}} \mathtt{exp}\biggl( -\frac{1}{2\sigma^2_{yi}} (x_i - \mu_{yi} )^2 \biggr) \,. $$


In [ ]:
gnb_ = naive_bayes.GaussianNB( ).fit( X_train, y_train )

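Before predicting, a small sanity check (a sketch, not part of the original notebook): recompute the class log-posteriors of a few test digits directly from the estimated parameters `theta_`, `sigma_` and `class_prior_`, and verify that the argmax agrees with `predict`.


In [ ]:
## Hand-computed joint log-likelihood: log p(y) + sum_i log N( x_i | theta_yi, sigma_yi ).
X_check_ = X_test[ : 16 ]
jll_ = np.log( gnb_.class_prior_ ) \
     - 0.5 * np.sum( np.log( 2 * np.pi * gnb_.sigma_ ), axis = 1 ) \
     - 0.5 * np.sum( ( X_check_[ :, np.newaxis ] - gnb_.theta_ ) ** 2 / gnb_.sigma_, axis = 2 )
print np.all( gnb_.classes_[ jll_.argmax( axis = 1 ) ] == gnb_.predict( X_check_ ) )
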
Predict the labels ...


In [ ]:
pred_labels_ = gnb_.classes_[ gnb_.predict_proba( X_test ).argmax( axis = 1 ) ]

... and report the accuracy.


In [ ]:
print np.mean( pred_labels_ == y_test )

Display the confusion matrix


In [ ]:
confusion( y_test, pred_labels_ )

The estimated class-conditional means of the pixels form the following patterns:


In [ ]:
fast_plot( gnb_.theta_, u"Estimated mean values of each pixel feature given a class", n = 10 )

They seem identical to the average patterns and indeed they are:


In [ ]:
np.allclose( gnb_.theta_, average_digits )

Let's see the estimated variability of each pixel.


In [ ]:
fast_plot( gnb_.sigma_, u"GNB estimated variance of individual pixels given each class", n = 10 )

Logistic Regression : $l_2$

Let's employ logistic regression to classify the digits.


In [ ]:
X_train_ = scl_.transform( X_train )
lorl2_ = linear_model.LogisticRegression( C = 1.0, penalty = "l2" ).fit( X_train_, y_train )

Predict on the test.


In [ ]:
pred_labels_ = lorl2_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )

Display a confusion matrix


In [ ]:
confusion( y_test, pred_labels_ )

Let's visualize the weights of the estimated logistic regression.


In [ ]:
fast_plot( lorl2_.coef_, u"Weights of the $l_2$ logistic regression", n = 10 )

Now try $l_1$ regularization.


In [ ]:
X_train_ = scl_.transform( X_train )
lorl1_ = linear_model.LogisticRegression( C = 1.0, penalty = "l1" ).fit( X_train_, y_train )

Report accuracy


In [ ]:
pred_labels_ = lorl1_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )

confusion matrix


In [ ]:
confusion( y_test, pred_labels_ )

and display the weights


In [ ]:
fast_plot( lorl1_.coef_, u"Weights of the $l_1$ logistic regression", n = 10 )

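The point of the $l_1$ penalty is sparsity. A quick comparison (a small add-on, not in the original notebook) of how many pixel weights are exactly zero under each penalty:


In [ ]:
## Fraction of exactly-zero coefficients per penalty.
for name_, model_ in [ ( "l2", lorl2_ ), ( "l1", lorl1_ ) ] :
    print "%s : %.1f%% zero weights" % ( name_, 100.0 * np.mean( model_.coef_ == 0.0 ) )
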
FFT

As a detour, let's look at the digits in the frequency domain: the 2-D FFT turns convolution in the pixel domain into element-wise multiplication in the frequency domain.


In [ ]:
X_train_2d_ = np.reshape( X_train_, ( -1, 28, 28 ) )
fft2_2d_ = np.fft.fft2( X_train_2d_ )

In [ ]:
fft2_2d_.shape

In [ ]:
fast_plot( fft2_2d_.real, u"Real part of the 2D FFT of some MNIST digits", n = 400 )

In [ ]:
fast_plot( fft2_2d_.imag, u"Imaginary part of the 2D FFT of some MNIST digits", n = 400 )

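A small illustrative check of the convolution theorem (not part of the original analysis; the $3\times3$ box kernel is an arbitrary choice): blur one digit via the FFT and via explicit circular shifts, and compare.


In [ ]:
## Circular convolution via the FFT ...
digit_ = X_train_2d_[ 0 ]
kernel_ = np.zeros( ( 28, 28 ) )
kernel_[ : 3, : 3 ] = 1.0 / 9
conv_fft_ = np.real( np.fft.ifft2( np.fft.fft2( digit_ ) * np.fft.fft2( kernel_ ) ) )
## ... and directly, as a weighted sum of circularly shifted copies.
conv_direct_ = np.zeros_like( digit_ )
for i in range( 3 ) :
    for j in range( 3 ) :
        conv_direct_ += kernel_[ i, j ] * np.roll( np.roll( digit_, i, axis = 0 ), j, axis = 1 )
print np.allclose( conv_fft_, conv_direct_ )
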
Try a weaker $l_1$ regularization (in scikit-learn a larger $C$ means a lighter penalty).


In [ ]:
X_train_ = scl_.transform( X_train )
lorl1_3_ = linear_model.LogisticRegression( C = 2.0, penalty = "l1" ).fit( X_train_, y_train )

Report


In [ ]:
pred_labels_ = lorl1_3_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
print confusion( y_test, pred_labels_ )
fast_plot( lorl1_3_.coef_, u"Weights of the $l_1$ logistic regression", n = 10 )

Run a grid search over the penalty type and the degree of regularization.


In [ ]:
g_lor_ = grid_search.GridSearchCV( linear_model.LogisticRegression( ),
                                   param_grid = {
                                        "C" : np.logspace( -2, 2, num = 5 ),
                                        "penalty" : [ "l2", "l1" ],
                                  }, cv = 7, n_jobs = -1, verbose = 50 ).fit( X_train_, y_train )

Let's see which model the cross-validation considers the best.


In [ ]:
g_lor_.best_params_, g_lor_.best_score_

PCA + $k$-nearest neighbour classifier

Reduce the dimensionality to 15 with PCA.


In [ ]:
scl_ = preprocessing.StandardScaler( ).fit( X_train )
X_train_ = scl_.transform( X_train )
pca_ = decomposition.PCA( n_components = 15 ).fit( X_train_ )

The $k$-nn implementation in Scikit-Learn does not accept the number of neighbours at prediction time, so we fit the model once and set `n_neighbors` on the estimator just before each call.


In [ ]:
## Don't set the number of neighbours just yet.
PC_train = pca_.transform( X_train_ )
knn_ = neighbors.KNeighborsClassifier( n_neighbors = None, n_jobs = -1 ).fit( PC_train, y_train )

def knn_call( knn, method, k, *args, **kwargs ) :
    knn.n_neighbors = k
    return getattr( knn, method )( *args, **kwargs )

Predict the labels using the majority voting procedure.


In [ ]:
n_neighbors = np.asarray( np.linspace( 1, 13, num = 5 ), np.int )

PC_test = pca_.transform( scl_.transform( X_test ) )
pred_labels_ = { k : knn_call( knn_, "predict", k = k, X = PC_test )
                   for k in n_neighbors }

Display the accuracy of each $k$-nn classifier together with its confusion matrix.


In [ ]:
for k in n_neighbors :
    tbl = confusion( y_test, pred_labels_[ k ] )
    print "%d-nn : accuracy %0.3f%%\n" % ( k, 100*np.mean( pred_labels_[ k ] == y_test ), ), tbl, "\n"

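A quick visual summary (a small add-on, not in the original notebook): plot the test accuracy against the number of neighbours.


In [ ]:
## Test accuracy as a function of k.
acc_ = [ np.mean( pred_labels_[ k ] == y_test ) for k in n_neighbors ]
plt.figure( figsize = ( 8, 5 ) )
plt.plot( n_neighbors, acc_, "o-" )
plt.xlabel( "number of neighbours" )
plt.ylabel( "test accuracy" )
plt.show( )
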

In [ ]:
pipe_ = pipeline.Pipeline( [
            ( "scaler1", preprocessing.StandardScaler( ) ),
            ( "pca", decomposition.PCA( n_components = 100 ) ),
            ( "scaler2", preprocessing.StandardScaler( ) ),
            ( "lore", linear_model.LogisticRegression( C = 1.0, penalty = "l2" ) ),
        ] ).fit( X_train, y_train )

Now test the pipeline.


In [ ]:
pred_labels_ = pipe_.predict( X_test )

Display the summary scores.


In [ ]:
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
confusion( y_test, pred_labels_ )


In [ ]:
import lasagne, theano, theano.tensor as T

Implement a simple convolutional neural network with Lasagne.


In [ ]:
input_var = T.tensor4( 'inputs' )
target_var = T.ivector( 'targets' )

In [ ]:
## The input layer
network = lasagne.layers.InputLayer( shape = ( None, 1, 28, 28 ),
                                     input_var = input_var )

## The 2D 5x5 conv layer with ReLU and 2x2 max-pooling
## valid convolution: 28 - 5 + 1 = 24, halved to 12 by the pooling
network = lasagne.layers.Conv2DLayer( network, num_filters = 32, filter_size = ( 5, 5 ),
                                      nonlinearity = lasagne.nonlinearities.rectify,
                                      border_mode = "valid", W = lasagne.init.GlorotUniform( ) )
network = lasagne.layers.MaxPool2DLayer( network, pool_size = ( 2, 2 ) )

## The 2D 3x3 conv layer with ReLU and 2x2 max-pooling
network = lasagne.layers.Conv2DLayer( lasagne.layers.dropout( network, p = .2 ),
                                      num_filters = 64, filter_size = ( 3, 3 ),
                                      nonlinearity = lasagne.nonlinearities.rectify,
                                      border_mode = "valid", W = lasagne.init.GlorotUniform( ) )
network = lasagne.layers.MaxPool2DLayer( network, pool_size = ( 2, 2 ) )

## FC layer with dropout
network = lasagne.layers.DenseLayer( lasagne.layers.dropout( network, p = .5 ),
                                     num_units = 256, nonlinearity = lasagne.nonlinearities.rectify )

network = lasagne.layers.DenseLayer( lasagne.layers.dropout( network, p = .5 ),
                                     num_units = 10, nonlinearity = lasagne.nonlinearities.softmax )

In [ ]:
prediction = lasagne.layers.get_output( network )
loss = lasagne.objectives.categorical_crossentropy( prediction, target_var ).mean( )

In [ ]:
params = lasagne.layers.get_all_params( network, trainable = True )
updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate = 0.01, momentum = 0.9 )

In [ ]:
test_prediction = lasagne.layers.get_output( network, deterministic = True )
test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var ).mean( )

test_acc = T.mean( T.eq( T.argmax( test_prediction, axis = 1 ), target_var ), dtype = theano.config.floatX )

In [ ]:
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
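
The graph is now complete, but nothing has been trained yet. Below is a possible training loop (a sketch, not from the original notebook): it assumes the images are reshaped to `( batch, 1, 28, 28 )` in `floatX` and the labels cast to `int32`; the batch size, the number of epochs and the 2048-sample evaluation subset are arbitrary illustrative choices.


In [ ]:
## Prepare 4-D inputs and integer targets for the theano functions above.
X_train_4d_ = np.asarray( X_train.reshape( -1, 1, 28, 28 ), dtype = theano.config.floatX )
X_test_4d_ = np.asarray( X_test.reshape( -1, 1, 28, 28 ), dtype = theano.config.floatX )
y_train_, y_test_ = np.asarray( y_train, np.int32 ), np.asarray( y_test, np.int32 )

batch_size_, n_epochs_ = 256, 5
for epoch_ in range( n_epochs_ ) :
    ## One pass of minibatch SGD over a shuffled training set.
    perm_ = np.random.permutation( X_train_4d_.shape[ 0 ] )
    batch_losses_ = [ ]
    for start_ in range( 0, len( perm_ ), batch_size_ ) :
        batch_ = perm_[ start_ : start_ + batch_size_ ]
        batch_losses_.append( train_fn( X_train_4d_[ batch_ ], y_train_[ batch_ ] ) )
    ## Evaluate on a fixed 2048-sample subset of the test set to keep it quick.
    test_loss_, test_acc_ = val_fn( X_test_4d_[ : 2048 ], y_test_[ : 2048 ] )
    print "epoch %d: train loss %.4f, test loss %.4f, test accuracy %.3f%%" % (
        epoch_, np.mean( batch_losses_ ), test_loss_, 100.0 * test_acc_ )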