Pre-Download Data

To save you the hassle of repeated downloads, it's easier so save the files in a shared folder.

To run all cells, select Cell > Run All. You can also run cells one at a time, select Cell > Run cells in the menu above.


In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

# Some of these are hard to distinguish.
# Check https://quickdraw.withgoogle.com/data for examples
zoo = ['frog', 'horse', 'lion', 'monkey', 'octopus', 'owl', 'rhinoceros', 
       'snail', 'tiger', 'zebra']

# Mapping between category names and ids
animal2id = dict((c,i) for i,c in enumerate(zoo))
id2animal = dict((i,c) for i,c in enumerate(zoo))
for i, animal in id2animal.items():
    print("Class {}: {}".format(i, animal))

In [ ]:
from six.moves.urllib.request import urlretrieve
import os

DATA_DIR = 'data/'

def maybe_download(url, data_dir):
    filename = url.split('/')[-1]
    file_path = os.path.join(data_dir, filename)

    # Check if the file already exists.
    if not os.path.exists(file_path):
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        print("Downloading {} to {}".format(url, file_path))
        file_path, _ = urlretrieve(url=url, filename=file_path)
    else:
        print("Using previously downloaded file: {}".format(file_path))
    return file_path

def load_data(file_path, max_examples=2000, example_name=''):
    d = np.load(open(file_path, 'r'))
    d = d[:max_examples,:] # limit number of instances to save memory
    print("Loaded {} {} examples of dimension {} from {}".format(
            d.shape[0], example_name, d.shape[1], file_path))
    return d

data= []
labels =[]

for animal in zoo:
    url = "https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/{}.npy".format(animal)
    file_path = maybe_download(url, DATA_DIR)
    data.append(load_data(file_path, max_examples = 1000, example_name = animal))
    labels.extend([animal2id[animal]]*data[-1].shape[0])

data = np.concatenate(data, axis=0)
labels = np.array(labels)
print("Final shape of data: {}".format(data.shape))

The data is fun to look at. Compared to MNIST the classes seem much harder to distinguish


In [ ]:
import matplotlib.pyplot as plt

n_samples = 10
random_indices = np.random.permutation(data.shape[0])

for i in random_indices[:n_samples]:
    print(i, labels[i])
    print("Category {}: {}".format(labels[i], id2animal[labels[i]]))

    # We'll show the image and its pixel value histogram side-by-side.

    # To interpret the values as a 28x28 image, we need to reshape
    # the numpy array, which is one dimensional.
    image = data[i, :]

    _, (ax1, ax2) = plt.subplots(1, 2)
    ax1.imshow(image.reshape(28, 28), cmap=plt.cm.Greys, interpolation='nearest')
    ax2.hist(image, bins=20)
    ax1.grid(False)
    plt.show()

In [ ]:
if data.dtype == 'uint8':  # avoid doing this twice
    data = data.astype(np.float32)
    data = (data - (255 / 2.0)) / 255

Our labels are 0,1,2,..,10 right now. We convert to a one-hot representation


In [ ]:
random_indices = np.random.permutation(labels.shape[0])

print("Labels before:")
print(labels[random_indices[:5]])

def one_hot(labels, n_classes):
    n_labels = len(labels)
    one_hot_labels = np.zeros((n_labels, n_classes))
    one_hot_labels[np.arange(n_labels), labels] = 1
    return one_hot_labels

labels_one_hot = one_hot(labels, len(zoo))

print("Labels after:")
print(labels_one_hot[random_indices[:5]])

Finally, let's split the data into random train and test partitions


In [ ]:
n_test_examples = 1000

random_indices = np.random.permutation(data.shape[0])
test_data = data[random_indices[:n_test_examples],:]
test_labels = labels_one_hot[random_indices[:n_test_examples],:]
train_data = data[random_indices[n_test_examples:],:]
train_labels = labels_one_hot[random_indices[n_test_examples:],:]
print("Data shapes: ", test_data.shape, test_labels.shape, train_data.shape, train_labels.shape)

Save data for other experiments


In [ ]:
outfile_name = os.path.join(DATA_DIR, "zoo.npz")
with open(outfile_name, 'w') as outfile:
    np.savez(outfile, train_data, train_labels, test_data, test_labels)
print ("Saved train/test data to {}".format(outfile_name))