In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

In [3]:
# Load the digit data either from mldata.org, or once downloaded to data_home, from disk. The data is about 53MB so this cell
# should take a while the first time your run it.
mnist = fetch_mldata('MNIST original', data_home='~/datasets/mnist')
X, Y = mnist.data, mnist.target

# Rescale grayscale values to [0,1].
X = X / 255.0

# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

print 'data shape: ', X.shape
print 'label shape:', Y.shape

# Set some variables to hold test, dev, and training data.
test_data, test_labels = X[61000:], Y[61000:]
dev_data, dev_labels = X[60000:61000], Y[60000:61000]
train_data, train_labels = X[:60000], Y[:60000]
mini_train_data, mini_train_labels = X[:1000], Y[:1000]


data shape:  (70000, 784)
label shape: (70000,)

In [7]:
# Credit where due... some inspiration drawn from:
# https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/fig/mnist.py

# example_as_pixel_matrix():
#   transforms a 784 element pixel into a 28 x 28 pixel matrix
def example_as_pixel_matrix(example):
    return np.reshape(example, (-1, 28))

# add_example_to_figure():
#   given an existing figure, number of rows, columns, and position,
#   adds a subplot with the example to the figure
def add_example_to_figure(example, 
                     figure, 
                     subplot_rows, 
                     subplot_cols, 
                     subplot_number):
    matrix = example_as_pixel_matrix(example)

    subplot = figure.add_subplot(subplot_rows, subplot_cols, subplot_number)
    subplot.imshow(matrix, cmap='Greys', interpolation='Nearest')
    # disable tick marks
    subplot.set_xticks(np.array([]))
    subplot.set_yticks(np.array([]))

# plot_examples():
#   given a matrix of examples (digit, example#) => example, 
#   plots it with digits as rows and examples as columns
def plot_examples(examples):
    
    figure = plt.figure()
    
    shape = np.shape(examples)
    rows = shape[0]
    columns = shape[1]
    
    subplot_index = 1
    
    for digit, examples_for_digit in enumerate(examples):
        for example_index, example in enumerate(examples_for_digit):
            add_example_to_figure(example, 
                                  figure, 
                                  rows, 
                                  columns, 
                                  subplot_index
                                 )
            subplot_index = subplot_index + 1
    
    figure.tight_layout()
    plt.show()

# plot_one_example():
#   given an example, plots only that example, typically
#   for debugging or diagnostics
def plot_one_example(example):  
    examples = [ [ example ] ]
    plot_examples(examples)

# select_indices_of_digit():
#   given an array of digit lables, selects the indices of
#   labels that match a desired digit
def select_indices_of_digit(labels, digit):
    return [i for i, label in enumerate(labels) if label == digit]

# take_n_from():
#   code readability sugar for taking a number of elements from an array
def take_n_from(count, array):
    return array[:count]

# take_n_examples_by_digit():
#   given a data set of examples, a label set, and a parameter n,
#   creates a matrix where the rows are the digits 0-9, and the
#   columns are the first n examples of each digit
def take_n_examples_by_digit(data, labels, n):
    examples = [
        data[take_n_from(n, select_indices_of_digit(labels, digit))]
        for digit in range(10)
    ]
    return examples

In [10]:
import itertools

def blur(image):
    pixel_matrix = example_as_pixel_matrix(image)
    blurred_image = []
    rows, columns = np.shape(pixel_matrix)
    
    for row in range(rows):
        for column in range(columns):
            # take the mean of the 9-pixel neighborhood (in clause)
            # but guard against running off the edges of the matrix (if clause)
            value = np.mean(list( 
                pixel_matrix[i][j] 
                for i, j
                in itertools.product(
                    range(row - 1, row + 2), 
                    range(column - 1, column + 2)
                )
                if (i >= 0) and (j >= 0) and (i < rows) and (j < columns)
            ))
            
            blurred_image.append(value)
    
    return blurred_image

def blur_images(images): 
    blurred = [ blur(image) for image in images ]
    return blurred

In [8]:
blurred_train_data = [ blur(datum) for datum in train_data ]


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-67fe8fa5c499> in <module>()
----> 1 blurred_train_data = [ blur(datum) for datum in train_data ]

<ipython-input-5-cd3806ff7859> in blur(image)
     15                 in itertools.product(
     16                     range(row - 1, row + 2),
---> 17                     range(column - 1, column + 2)
     18                 )
     19                 if (i >= 0) and (j >= 0) and (i < rows) and (j < columns)

//anaconda/lib/python2.7/site-packages/numpy/core/fromnumeric.pyc in mean(a, axis, dtype, out, keepdims)
   2733 
   2734     return _methods._mean(a, axis=axis, dtype=dtype,
-> 2735                             out=out, keepdims=keepdims)
   2736 
   2737 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):

//anaconda/lib/python2.7/site-packages/numpy/core/_methods.pyc in _mean(a, axis, dtype, out, keepdims)
     52 
     53 def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
---> 54     arr = asanyarray(a)
     55 
     56     rcount = _count_reduce_items(arr, axis)

//anaconda/lib/python2.7/site-packages/numpy/core/numeric.pyc in asanyarray(a, dtype, order)
    512 
    513     """
--> 514     return array(a, dtype, copy=False, order=order, subok=True)
    515 
    516 def ascontiguousarray(a, dtype=None):

KeyboardInterrupt: 

In [11]:
train_data_0k = train_data[:10000]
blurred_train_data_0k = blur_images(train_data_0k)

In [15]:
train_data_1k = train_data[10000:20000]
blurred_train_data_1k = blur_images(train_data_1k)

In [13]:
blurred_batches = [ blurred_train_data_0k, blurred_train_data_0k, blurred_train_data_0k, blurred_train_data_0k ]

In [14]:
np.shape(blurred_batches)


Out[14]:
(4, 10000, 784)