In [ ]:
import os, gc
os.chdir( '/Users/innazarov/ownCloud/' )
In [ ]:
import warnings ; warnings.simplefilter( "ignore" )
import numpy as np, pandas as pd, flex as fl, scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
mnist_npz = '/Users/innazarov/Github/study_notes/year_14_15/spring_2015/machine_learning/data/mldata/mnist_scikit.npz'
assert( os.path.exists( mnist_npz ) )
with np.load( mnist_npz, 'r' ) as npz :
mnist_labels, mnist_data = np.asarray( npz[ 'labels' ], np.int ), npz[ 'data' ] / 255.0
mnist_classes = np.unique( mnist_labels )
In [ ]:
from sklearn import *
In [ ]:
def fast_plot( data, title, **kwargs ) :
axis = plt.figure( figsize = ( 16, 9 ) ).add_subplot( 111 )
axis.set_title( title )
fl.plot( axis, data, shape = ( 28, -1 ), cmap = plt.cm.hot, interpolation = "nearest", **kwargs )
plt.show( )
In [ ]:
def confusion( actual, predicted ) :
tbl_ = sp.sparse.coo_matrix( ( np.ones_like( predicted ), ( predicted, actual ) ) )
tbl = pd.DataFrame( tbl_.todense( ), index = mnist_classes, columns = mnist_classes )
tbl.index.name = "Predicted"
tbl.columns.name = "Actual"
return tbl
In [ ]:
random_state = np.random.RandomState( None )
In [ ]:
split_ = cross_validation.train_test_split( mnist_data, mnist_labels, test_size = 0.75,
random_state = random_state )
X_train, X_test, y_train, y_test = split_
In [ ]:
print X_train.shape
print X_test.shape
Draw and plot some random subset of the train dataset.
In [ ]:
index_perm_ = np.random.permutation( X_train.shape[ 0 ] )
X = X_train[ index_perm_[ : 200 ] ]
fast_plot( X, u"200 random digits from the MNIST train dataset", n = 200 )
Generate averaged images
In [ ]:
train_digits_ = { label_ : X_train[ np.flatnonzero( y_train == label_ ) ]
for label_ in mnist_classes }
avg_patterns_ = { label_ : samples_.mean( axis = 0 )[ np.newaxis ]
for label_, samples_ in train_digits_.iteritems() }
## Construct an aligned pair of arrays
average_digits = np.concatenate( [ array_ for array_ in avg_patterns_.itervalues() ], axis = 0 )
average_labels = np.array( [ label_ for label_ in avg_patterns_.iterkeys() ], dtype = np.int )
del avg_patterns_
## Sort them
order_ = np.argsort( average_labels )
average_labels = average_labels[ order_ ]
average_digits = average_digits[ order_ ]
In [ ]:
## Plot the patterns
fast_plot( average_digits, u"Averaged digits on train" )
A simple minimum-error classifier based on the mean patterns. $\newcommand{\argmin}{\mathop{\mathtt{argmin}}} \newcommand{\Dcal}{\mathcal{D}}$ Essentially this is a simple Gaussian model with a single shared variance $\sigma^2$: $$ \log \mathcal{L} = - \frac{|\Dcal|}{2} \log 2\pi - \frac{|\Dcal|}{2} \log \sigma^2
- \frac{1}{2 \sigma^2} \sum_{x\in \Dcal} ( x - \mu_{k_x} )^2 \,. $$
Classification is done with this rule: \begin{align*} \hat{y}(x) & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M ( x_{nm} - \mu_{knm} )^2 \\ & = \argmin_{k=1,\ldots,K} \, \sum_{n=1}^N \sum_{m=1}^M - 2 x_{nm} \mu_{knm} + \mu_{knm}^2 \\ & = \argmin_{k=1,\ldots,K} \, \|\mu_k\|^2 - 2 \langle x, \mu_k \rangle \,, \end{align*} where $$ \langle f, g \rangle = \sum_{n=1}^N \sum_{m=1}^M f_{nm} g_{nm}\,. $$
In [ ]:
## Compute the squared error ...
yp_ = - 2 * np.tensordot( X_test, average_digits, axes = [ -1, -1 ] ) \
      + np.sum( average_digits * average_digits, axis = 1 )
## ... and based on it find the closest label.
pred_labels_ = average_labels[ yp_.argmin(axis = 1) ]
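A quick sanity check (a minimal sketch on the first 100 test digits): the expanded rule should pick exactly the same labels as brute-force squared Euclidean distances to the class means.
In [ ]:
## Brute-force squared distances to each class mean for the first 100 test digits ...
dist_ = ( ( X_test[ :100, np.newaxis ] - average_digits )**2 ).sum( axis = -1 )
## ... should yield the same labels as the expanded rule above.
print np.all( average_labels[ dist_.argmin( axis = 1 ) ] == pred_labels_[ :100 ] )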
Construct the confusion matrix
In [ ]:
confusion( y_test, pred_labels_ )
Compute the accuracy
In [ ]:
print "Achieved accuracy is %.3f%%" % ( np.mean( y_test == pred_labels_ ) * 100.0, )
Initialize and fit a random forest on the train dataset.
In [ ]:
rfc_ = ensemble.RandomForestClassifier( n_estimators = 50, n_jobs = -1,
random_state = random_state ).fit( X_train, y_train )
Predict on the test dataset.
In [ ]:
pred_labels_ = rfc_.classes_[ rfc_.predict_proba( X_test ).argmax( axis = 1 ) ]
Display the accuracy ...
In [ ]:
print "Achieved accuracy is %.3f%%" % ( np.mean( y_test == pred_labels_ ) * 100.0, )
... and the confusion matrix.
In [ ]:
confusion( y_test, pred_labels_ )
Show the pixel-feature importances of the random forest.
In [ ]:
fast_plot( rfc_.feature_importances_[ np.newaxis ],
u"Random Forest feature importances",
n_row = 1, n_col = 1 )
Let's use PCA to learn a linear manifold from the data. SciPy's SVD returns $U$, the vector of singular values, and $V'$ (not $V$).
In [ ]:
scl_ = preprocessing.StandardScaler( ).fit( X_train )
U, S, V = sp.linalg.svd( scl_.transform( X_train ), full_matrices = False )
order_ = S.argsort( )[::-1]
U, S, V = U[:,order_], S[ order_ ], V[order_]
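A quick consistency check (a small sketch): the returned factors should reconstruct the standardized train data, and the rows of the returned $V'$ should be orthonormal.
In [ ]:
## U scaled by the singular values times the rows of V' should give back the standardized data ...
print np.allclose( ( U * S ).dot( V ), scl_.transform( X_train ) )
## ... and the principal directions (rows of V') should be orthonormal.
print np.allclose( V.dot( V.T ), np.eye( V.shape[ 0 ] ) )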
Compute the variance explained by each principal direction and the cumulative explained-variance ratio.
In [ ]:
var_ = S**2
leverage_ = np.cumsum( var_ ) / np.sum( var_ )
n_components = np.min( np.flatnonzero( leverage_ > 0.95 ) )
print """The least number of principal components required to""" \
+ """guarantee at least 95%% recostruction is %d.""" % ( n_components, )
plt.plot( leverage_ )
plt.axhline( y = 0.95, color = 'k', lw = 2 )
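As a cross-check (a sketch), scikit-learn's own PCA should produce the same cumulative explained-variance curve.
In [ ]:
## The cumulative explained_variance_ratio_ of a full PCA fit on the
## standardized train data should coincide with the curve computed above.
pca_check_ = decomposition.PCA( ).fit( scl_.transform( X_train ) )
print np.allclose( np.cumsum( pca_check_.explained_variance_ratio_ ), leverage_ )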
Choose the number of components.
In [ ]:
n_components = 256
Extract the principal components
In [ ]:
pc_ = U[:,:n_components] * S[ :n_components ]
Embed the components back in the original space.
In [ ]:
X_ = scl_.inverse_transform( np.dot( pc_, V[:n_components] ) )
The low-rank reconstructions of the training digits look as follows:
In [ ]:
fast_plot( X_, u"Principal components of train dataset", n = 512 )
See how well it fares on the test dataset: reconstruct ...
In [ ]:
pc_ = np.dot( scl_.transform( X_test ), V[:n_components].T )
X_ = scl_.inverse_transform( np.dot( pc_, V[:n_components] ) )
... and display
In [ ]:
fast_plot( np.abs(X_-X_test), u"Low rank reconstruction error of the test dataset", n = 32 )
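The reconstruction quality can also be summarized numerically (a small sketch): the mean squared per-pixel error over the whole test set.
In [ ]:
## Average squared reconstruction error per pixel on the test dataset.
print "Mean squared reconstruction error on test: %.6f" % np.mean( ( X_ - X_test )**2 )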
Let's use a Gaussian Naive Bayes classifier. $\newcommand\given{\:\vert\:}$
The Naive Bayes classifier assumes that the features are conditionally independent given the class label $y$: $$ p\bigl( x_1,\ldots,x_d \:\big\vert\: y, \Theta\bigr) = \prod_{i=1}^d p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,, $$ whence, using Bayes' rule: \begin{align*} p\bigl( y \:\big\vert\: x, \Theta\bigr) &= p\bigl( x \:\big\vert\: y, \Theta\bigr) \frac{p(y\:\big\vert\: \Theta)}{p(x\:\big\vert\: \Theta)} \\ &\propto p\bigl(y\:\big\vert\: \Theta\bigr) \prod_{i=1}^d p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,, \end{align*} where $\Theta$ is the parameter vector of the model.
Prediction is done using the maximum posterior probability principle: $\newcommand{\argmax}{\mathop{\mathtt{argmax}}}$ $$ \hat{y}(x) = \argmax_{y} \log p\bigl(y\:\big\vert\: \Theta\bigr)
+ \sum_{i=1}^d \log p\bigl( x_i \:\big\vert\: y, \Theta\bigr) \,. $$
Fitting the Naive Bayes classifier is done using the maximum-likelihood approach: $$ \hat{\Theta} = \argmax_\Theta \sum_{j=1}^J \log p\bigl(y_j\:\big\vert\: \Theta\bigr)
+ \sum_{j=1}^J \sum_{i=1}^d \log p\bigl( x_{ji} \:\big\vert\: y_j, \Theta\bigr) \,. $$
The Gaussian NB classifier further assumes that $$ p\bigl( x_i \:\big\vert\: y, \Theta\bigr) = \bigl(2\pi \sigma^2_{yi}\bigr)^{-\frac{1}{2}} \exp\biggl( -\frac{1}{2\sigma^2_{yi}} (x_i - \mu_{yi} )^2 \biggr) \,. $$
In [ ]:
gnb_ = naive_bayes.GaussianNB( ).fit( X_train, y_train )
Predict the labels ...
In [ ]:
pred_labels_ = gnb_.classes_[ gnb_.predict_proba( X_test ).argmax( axis = 1 ) ]
... and report the accuracy.
In [ ]:
print np.mean( pred_labels_ == y_test )
Display the confusion matrix
In [ ]:
confusion( y_test, pred_labels_ )
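To make the MAP rule above concrete, here is a minimal hand-rolled sketch on the first 200 test digits, using the fitted attributes of gnb_; it should reproduce the classifier's labels.
In [ ]:
## Gaussian log-likelihoods per class:
##   log p(x | y) = -1/2 sum_i [ log( 2 pi sigma^2_{yi} ) + ( x_i - mu_{yi} )^2 / sigma^2_{yi} ].
x_ = X_test[ :200, np.newaxis ]
log_lik_ = -0.5 * ( np.log( 2 * np.pi * gnb_.sigma_ )
                    + ( x_ - gnb_.theta_ )**2 / gnb_.sigma_ ).sum( axis = -1 )
## Add the log-priors and take the maximum a posteriori class.
manual_labels_ = gnb_.classes_[ ( log_lik_ + np.log( gnb_.class_prior_ ) ).argmax( axis = 1 ) ]
## The agreement rate with scikit-learn's predictions should be 1 (or very close).
print np.mean( manual_labels_ == pred_labels_[ :200 ] )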
The estimated means of the pixels form the following patterns:
In [ ]:
fast_plot( gnb_.theta_, u"Estimated mean values of each pixel feature given a class", n = 10 )
They seem identical to the average patterns and indeed they are:
In [ ]:
np.allclose( gnb_.theta_, average_digits )
Let's look at the estimated variability of each pixel.
In [ ]:
fast_plot( gnb_.sigma_, u"GNB estimated variance of individual pixels", n = 10 )
Let's employ logistic regression to classify the digits.
In [ ]:
X_train_ = scl_.transform( X_train )
lorl2_ = linear_model.LogisticRegression( C = 1.0, penalty = "l2" ).fit( X_train_, y_train )
Predict on the test.
In [ ]:
pred_labels_ = lorl2_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
Display a confusion matrix
In [ ]:
confusion( y_test, pred_labels_ )
Let's visualize the weights of the estimated logistic regression.
In [ ]:
fast_plot( lorl2_.coef_, u"Weights of the $l_2$ logistic regression", n = 10 )
Try $l_1$ regularization instead.
In [ ]:
X_train_ = scl_.transform( X_train )
lorl1_ = linear_model.LogisticRegression( C = 1.0, penalty = "l1" ).fit( X_train_, y_train )
Report the accuracy ...
In [ ]:
pred_labels_ = lorl1_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
... the confusion matrix ...
In [ ]:
confusion( y_test, pred_labels_ )
... and display the weights.
In [ ]:
fast_plot( lorl1_.coef_, u"Weights of the $l_1$ logistic regression", n = 10 )
Let's move towards convolutions by inspecting the 2D Fourier transforms of the digits.
In [ ]:
X_train_2d_ = np.reshape( X_train_, ( -1, 28, 28 ) )
fft2_2d_ = np.fft.fft2( X_train_2d_ )
In [ ]:
fft2_2d_.shape
In [ ]:
fast_plot( fft2_2d_.real, u"Real part of the 2D FFT of some MNIST digits", n = 400 )
In [ ]:
fast_plot( fft2_2d_.imag, u"Imaginary part of the 2D FFT of some MNIST digits", n = 400 )
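To connect this to actual convolutions, here is a small sketch on a single digit with a Sobel-like edge kernel (chosen purely for illustration): by the convolution theorem, direct spatial convolution and FFT-based convolution should agree.
In [ ]:
from scipy import signal
## A small edge-detecting kernel, used only for illustration.
kernel_ = np.array( [ [ 1., 0., -1. ],
                      [ 2., 0., -2. ],
                      [ 1., 0., -1. ] ] )
digit_ = X_train_2d_[ 0 ]
## Direct 2D convolution with zero padding ...
conv_direct_ = signal.convolve2d( digit_, kernel_, mode = "same", boundary = "fill" )
## ... and FFT-based convolution should agree up to rounding.
conv_fft_ = signal.fftconvolve( digit_, kernel_, mode = "same" )
print np.allclose( conv_direct_, conv_fft_ )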
Try a larger value of $C$, i.e. weaker regularization.
In [ ]:
X_train_ = scl_.transform( X_train )
lorl1_3_ = linear_model.LogisticRegression( C = 2.0, penalty = "l1" ).fit( X_train_, y_train )
Report
In [ ]:
pred_labels_ = lorl1_3_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
print confusion( y_test, pred_labels_ )
fast_plot( lorl1_3_.coef_, u"Weights of the $l_1$ logistic regression", n = 10 )
Run a grid search over the penalty type and the regularization strength.
In [ ]:
g_lor_ = grid_search.GridSearchCV( linear_model.LogisticRegression( ),
param_grid = {
"C" : np.logspace( -2, 2, num = 5 ),
"penalty" : [ "l2", "l1" ],
}, cv = 7, n_jobs = -1, verbose = 50 ).fit( X_train_, y_train )
Let's see which model the cross-validation considers the best.
In [ ]:
print g_lor_.best_params_, g_lor_.best_score_
g_lor_.grid_scores_
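As a follow-up sketch, the refit best estimator can be scored on the held-out test set.
In [ ]:
## The grid search refits the best parameter combination on the full train data,
## so it can be scored directly on the (standardized) test set.
pred_labels_ = g_lor_.best_estimator_.predict( scl_.transform( X_test ) )
print "Accuracy score is %.3f%%" % ( 100 * np.mean( pred_labels_ == y_test ), )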
Reduce the dimensionality to 15 components with PCA.
In [ ]:
scl_ = preprocessing.StandardScaler( ).fit( X_train )
X_train_ = scl_.transform( X_train )
pca_ = decomposition.PCA( n_components = 15 ).fit( X_train_ )
The $k$-nn implementation in Scikit-Learn does not accept the number of neighbours at prediction time, so we reset the n_neighbors attribute of the fitted classifier between calls.
In [ ]:
## Don't set the number of neighbours just yet.
PC_train = pca_.transform( X_train_ )
knn_ = neighbors.KNeighborsClassifier( n_neighbors = None, n_jobs = -1 ).fit( PC_train, y_train )
def knn_call( knn, method, k, *args, **kwargs ) :
knn.n_neighbors = k
return getattr( knn, method )( *args, **kwargs )
Predict the labels using the majority voting procedure.
In [ ]:
n_neighbors = np.asarray( np.linspace( 1, 13, num = 5 ), np.int )
PC_test = pca_.transform( scl_.transform( X_test ) )
pred_labels_ = { k : knn_call( knn_, "predict", k = k, X = PC_test )
for k in n_neighbors }
Display accuracy of the $k$-nn classifiers and the confusion matrix.
In [ ]:
for k in n_neighbors :
tbl = confusion( y_test, pred_labels_[ k ] )
print "%d-nn : accuracy %0.3f%%\n" % ( k, 100*np.mean( pred_labels_[ k ] == y_test ), ), tbl, "\n"
In [ ]:
pipe_ = pipeline.Pipeline( [
( "scaler1", preprocessing.StandardScaler( ) ),
( "pca", decomposition.PCA( n_components = 100 ) ),
( "scaler2", preprocessing.StandardScaler( ) ),
( "lore", linear_model.LogisticRegression( C = 1.0, penalty = "l2" ) ),
] ).fit( X_train, y_train )
Now test the pipeline.
In [ ]:
pred_labels_ = pipe_.predict( X_test )
Display the summary scores.
In [ ]:
print "Accuracy score is %.3f%%" % ( 100*np.mean( pred_labels_ == y_test ), )
confusion( y_test, pred_labels_ )
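The pipeline can also be treated as a single estimator and cross-validated as a whole (a sketch); every step is re-fit inside each fold, so no test information leaks into the scaler or the PCA.
In [ ]:
## 3-fold cross-validation of the whole scale-PCA-scale-logistic pipeline.
cv_scores_ = cross_validation.cross_val_score( pipe_, X_train, y_train, cv = 3, n_jobs = -1 )
print "CV accuracy: %.3f%% +/- %.3f%%" % ( 100 * cv_scores_.mean( ), 100 * cv_scores_.std( ) )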
In [ ]:
import theano, theano.tensor as T, lasagne
Implement a simple convolutional neural network with Lasagne.
In [ ]:
input_var = T.tensor4( 'inputs' )
target_var = T.ivector( 'targets' )
In [ ]:
## The input layer
network = lasagne.layers.InputLayer( shape = ( None, 1, 28, 28 ),
input_var = input_var )
## The 2D 5x5 conv layer with ReLU and 2x2 max-pooling:
## valid convolution gives ( 28 - 5 + 1 ) = 24, halved to 12 by the pooling.
network = lasagne.layers.Conv2DLayer( network, num_filters = 32, filter_size = ( 5, 5 ),
nonlinearity = lasagne.nonlinearities.rectify,
border_mode = "valid", W = lasagne.init.GlorotUniform( ) )
network = lasagne.layers.MaxPool2DLayer( network, pool_size = ( 2, 2 ) )
## The 2D 3x3 conv layer with ReLU and 2x2 max-pooling:
## valid convolution gives ( 12 - 3 + 1 ) = 10, halved to 5 by the pooling.
network = lasagne.layers.Conv2DLayer( lasagne.layers.dropout( network, p = .2 ),
num_filters = 64, filter_size = ( 3, 3 ),
nonlinearity = lasagne.nonlinearities.rectify,
border_mode = "valid", W = lasagne.init.GlorotUniform( ) )
network = lasagne.layers.MaxPool2DLayer( network, pool_size = ( 2, 2 ) )
## FC layer with dropout
network = lasagne.layers.DenseLayer( lasagne.layers.dropout( network, p = .5 ),
num_units = 256, nonlinearity = lasagne.nonlinearities.rectify )
network = lasagne.layers.DenseLayer( lasagne.layers.dropout( network, p = .5 ),
num_units = 10, nonlinearity = lasagne.nonlinearities.softmax )
In [ ]:
prediction = lasagne.layers.get_output( network )
loss = lasagne.objectives.categorical_crossentropy( prediction, target_var ).mean( )
In [ ]:
params = lasagne.layers.get_all_params( network, trainable = True )
updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate = 0.01, momentum = 0.9 )
In [ ]:
test_prediction = lasagne.layers.get_output( network, deterministic = True )
test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var ).mean( )
test_acc = T.mean( T.eq( T.argmax( test_prediction, axis = 1 ), target_var ), dtype = theano.config.floatX )
In [ ]:
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
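The section stops at compiling the Theano functions; a minimal training-loop sketch (the batch size and the number of epochs below are arbitrary choices) reshapes the flat 784-pixel rows into ( batch, 1, 28, 28 ) images, runs the Nesterov-momentum updates on random minibatches, and tracks the validation accuracy.
In [ ]:
## Cast the data into the shapes and dtypes the Theano graph expects.
X_train_4d_ = X_train.reshape( -1, 1, 28, 28 ).astype( theano.config.floatX )
X_test_4d_ = X_test.reshape( -1, 1, 28, 28 ).astype( theano.config.floatX )
y_train_, y_test_ = y_train.astype( np.int32 ), y_test.astype( np.int32 )
batch_size, n_epochs = 500, 5
for epoch_ in range( n_epochs ) :
    perm_ = np.random.permutation( X_train_4d_.shape[ 0 ] )
    train_loss_, n_batches_ = 0.0, 0
    for start_ in range( 0, len( perm_ ), batch_size ) :
        batch_ = perm_[ start_ : start_ + batch_size ]
        train_loss_ += train_fn( X_train_4d_[ batch_ ], y_train_[ batch_ ] )
        n_batches_ += 1
    ## Validate on a slice of the test set to keep memory usage modest.
    val_loss_, val_acc_ = val_fn( X_test_4d_[ :5000 ], y_test_[ :5000 ] )
    print "epoch %d: train loss %.4f, validation accuracy %.3f%%" % (
        epoch_, train_loss_ / n_batches_, 100 * val_acc_ )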