The code for this demo comes from the MIT Deep Learning course: https://deeplearning.mit.edu/
as well as Google's TensorBoard tutorial: https://www.tensorflow.org/tensorboard/r1/overview/
The high-resolution digits were generated this way: http://blog.otoro.net/2016/04/01/generating-large-images-from-latent-vectors/
In [ ]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
# Logging for TensorBoard
from time import time
from tensorflow.keras.callbacks import TensorBoard # if you want to use TensorBoard, make sure you have a log directory
# Commonly used modules
import numpy as np
import os
import sys
# Images, plots, display, and visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import cv2 # installed via the opencv-python package
import IPython
from six.moves import urllib
print(tf.__version__)
In [2]:
(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()
# reshape images to specify that it's a single channel
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)
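As a quick sanity check, the shapes should now carry the explicit channel dimension:
print(train_images.shape) # (60000, 28, 28, 1)
print(test_images.shape)  # (10000, 28, 28, 1)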
In [3]:
def preprocess_images(imgs): # works for either a single image or multiple images
    sample_img = imgs if len(imgs.shape) == 2 else imgs[0]
    assert sample_img.shape in [(28, 28, 1), (28, 28)], sample_img.shape # make sure images are 28x28 and single-channel (grayscale)
    return ??? # normalize to [0,1] here
train_images = ???
test_images = ???
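If you get stuck, here is a minimal way to fill in the blanks, assuming the normalization is a plain division by the maximum pixel value (255):
def preprocess_images(imgs): # works for either a single image or multiple images
    sample_img = imgs if len(imgs.shape) == 2 else imgs[0]
    assert sample_img.shape in [(28, 28, 1), (28, 28)], sample_img.shape
    return imgs / 255.0 # scale pixel values from [0, 255] down to [0, 1]

train_images = preprocess_images(train_images)
test_images = preprocess_images(test_images)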
Let's make sure we didn't break anything and display some images
In [ ]:
plt.figure(figsize=(10,2))
for i in range(5):
    plt.subplot(1,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i].reshape(28, 28), cmap=plt.cm.binary)
    plt.xlabel(train_labels[i])
In [ ]:
# I recommend a CNN
model = ???
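If you need a starting point, here is one possible CNN, a sketch built only from the layers imported at the top; the filter counts and layer sizes are my assumptions, not a prescribed architecture:
model = keras.Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu',
           input_shape=(28, 28, 1)),       # 32 3x3 filters over the single-channel input
    MaxPooling2D(pool_size=(2, 2)),        # downsample feature maps by 2x
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),                             # flatten to a vector for the dense layers
    Dense(128, activation='relu'),
    Dropout(0.5),                          # regularization to reduce overfitting
    Dense(10, activation='softmax')        # one probability per digit class (0-9)
])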
Before we fit our model, we need to add a few more things:
In [6]:
model.compile(???)
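One reasonable way to compile, assuming the Adam optimizer and a loss that matches the integer labels mnist.load_data() returns:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', # labels are integers, not one-hot vectors
              metrics=['accuracy'])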
In [9]:
tensorboard = TensorBoard(log_dir="./logs/{}".format(time()))
To see what we get, open a terminal and run the following:
tensorboard --logdir=./logs/
where --logdir is the relative path to your logs folder.
In [14]:
# This is just me making sure that my logs go to the right place
os.chdir('C:/Users/jpa84/Documents/Python Scripts/Lunch_Bytes/')
!pwd
For the model that I made, each epoch took about 80 seconds to complete. I wouldn't recommend more than two or three epochs (to keep this tutorial moving).
In [15]:
training = model.???
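One way to complete this cell, assuming two epochs (per the note above) and the TensorBoard callback defined earlier:
training = model.fit(train_images, train_labels,
                     epochs=2,                # keep this low so the tutorial moves along
                     callbacks=[tensorboard]) # send training metrics to the log directory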
In [ ]:
print(test_images.shape) # check that the dimensions of the test data match the training data in case of any errors
test_loss, test_acc = model.???
print('Test loss:', test_loss)
print('Test accuracy:', test_acc*100.0, '%')
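A minimal completion for the blank above, assuming model.evaluate, which returns the loss followed by each metric passed to compile (here, just accuracy):
test_loss, test_acc = model.evaluate(test_images, test_labels)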
Cool! The model is about 99% accurate (in my test run-through).
Test data doesn't represent data "from the wild" exactly, so let's read in some high-resolution images and see how the model performs. This part of the notebook doesn't involve much TensorFlow, so I've left the code intact.
This is meant to be an example of a more interesting application of our model.
In [ ]:
mnist_dream_path = 'images/mnist_dream.mp4'
mnist_prediction_path = 'images/mnist_dream_predicted.mp4'
# download the video if you haven't already
if not os.path.isfile(mnist_dream_path):
    print('downloading the sample video...')
    vid_url = 'https://github.com/lexfridman/mit-deep-learning/raw/master/tutorial_deep_learning_basics' + '/' + mnist_dream_path
    mnist_dream_path = urllib.request.urlretrieve(vid_url)[0]
def cv2_imshow(img):
    ret = cv2.imencode('.png', img)[1].tobytes()
    img_ip = IPython.display.Image(data=ret)
    IPython.display.display(img_ip)
cap = cv2.VideoCapture(mnist_dream_path)
vw = None
frame = -1 # counter for debugging (mostly), 0-indexed
# go through all the frames and run our classifier on the high res MNIST images as they morph from number to number
while True: # should be 481 frames
    frame += 1
    ret, img = cap.read()
    if not ret: break

    assert img.shape[0] == img.shape[1] # should be a square
    if img.shape[0] != 720:
        img = cv2.resize(img, (720, 720))

    # preprocess the image for prediction
    img_proc = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_proc = cv2.resize(img_proc, (28, 28))
    img_proc = preprocess_images(img_proc)
    img_proc = 1 - img_proc # invert, since the training set is white digits on a black background
    net_in = np.expand_dims(img_proc, axis=0) # expand dimension to specify batch size of 1
    net_in = np.expand_dims(net_in, axis=3) # expand dimension to specify number of channels

    preds = model.predict(net_in)[0]
    guess = np.argmax(preds)
    perc = np.rint(preds * 100).astype(int)

    img = 255 - img
    pad_color = 0
    img = np.pad(img, ((0,0), (0,1280-720), (0,0)), mode='constant', constant_values=(pad_color))

    line_type = cv2.LINE_AA
    font_face = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.3
    thickness = 2
    x, y = 740, 60
    color = (255, 255, 255)

    text = "Neural Network Output:"
    cv2.putText(img, text=text, org=(x, y), fontScale=font_scale, fontFace=font_face, thickness=thickness,
                color=color, lineType=line_type)

    text = "Input:"
    cv2.putText(img, text=text, org=(30, y), fontScale=font_scale, fontFace=font_face, thickness=thickness,
                color=color, lineType=line_type)

    y = 130
    for i, p in enumerate(perc):
        if i == guess: color = (255, 218, 158)
        else: color = (100, 100, 100)

        rect_width = 0
        if p > 0: rect_width = int(p * 3.3)

        rect_start = 180
        cv2.rectangle(img, (x+rect_start, y-5), (x+rect_start+rect_width, y-20), color, -1)

        text = '{}: {:>3}%'.format(i, int(p))
        cv2.putText(img, text=text, org=(x, y), fontScale=font_scale, fontFace=font_face, thickness=thickness,
                    color=color, lineType=line_type)
        y += 60

    # if you don't want to save the output as a video, set this to False
    save_video = True

    if save_video:
        if vw is None:
            codec = cv2.VideoWriter_fourcc(*'DIVX')
            vid_width_height = img.shape[1], img.shape[0]
            vw = cv2.VideoWriter(mnist_prediction_path, codec, 30, vid_width_height)
        # 15 fps doesn't work robustly, so we write each frame twice at 30 fps
        vw.write(img)
        vw.write(img)

    # scale down image for display
    img_disp = cv2.resize(img, (0,0), fx=0.5, fy=0.5)
    cv2_imshow(img_disp)
    IPython.display.clear_output(wait=True)
cap.release()
if vw is not None:
    vw.release()