Machine Learning Engineer Capstone

Preprocessing

The preprocessing routine searches the ./data/videos directory and creates the datasets used by the subsequent learning tasks. It breaks each video into one-second clips and, from those clips, generates frames, spectrograms, and InceptionV3 feature vectors. Currently the clips and frames aren't fed to the model directly and the spectrograms aren't used at all, but they are kept for labeling, debugging, and future model upgrades; the feature vectors are the only output used for learning. Feature vectors are computed during preprocessing because doing so drastically reduces training time, allowing a faster model iteration cycle. This methodology was inspired by this repo and its associated blog post: https://github.com/harvitronix/five-video-classification-methods
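
For reference, the assets written by this step follow the layout documented in the helper docstrings below:

    ./data/videos/:video_id.mp4                             source videos
    ./data/clips/:video_id/:clip_id.mp4                     one-second clips
    ./data/frames/:video_id/:clip_id/:frame_id.jpg          extracted frames (299x299)
    ./data/audio/:video_id/:clip_id.png                     spectrograms
    ./data/features/:video_id/:clip_id/:frame_id.txt.gz     InceptionV3 feature vectors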


In [3]:
import glob
import subprocess
import json
import os
from tqdm import tqdm_notebook
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
import numpy as np

# Adapted from https://github.com/harvitronix/five-video-classification-methods
class Extractor():
    """Extractor builds an inception model without the top classification 
    layers and extracts a feature array from an image."""
    
    def __init__(self):
        # Get model with pretrained weights.
        base_model = InceptionV3(
            weights='imagenet',
            include_top=True
        )

        # We'll extract features at the final pool layer.
        self.model = Model(
            inputs=base_model.input,
            outputs=base_model.get_layer('avg_pool').output
        )

    def extract(self, image_path):
        img = image.load_img(image_path, target_size=(299, 299))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        # Run the image through the truncated network and return the feature vector.
        features = self.model.predict(x)
        return features[0]
        return features

def video_length(path):
    """returns the length of the video in secs"""
    cmd = "ffprobe -i " + path + " -show_entries format=duration -v quiet -of json"
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout
    output = pipe.read()
    d = json.loads(output)
    s = d["format"]["duration"]
    return int(float(s))

def video_id(path):
    """returns the id of a video from a path in this format: ./data/videos/:video_id"""
    return path.split("/")[3].split(".")[0]

def clip_dir_path(path):
    """returns the path to dir containing all clips for a video ./data/clips/:video_id"""
    vid_id = video_id(path)
    return "./data/clips/" + vid_id

def create_clips(path):
    """given a path to a video create_clips writes one sec video segments to disk 
    in the following format ./data/clips/:video_id/:clip_id.mp4"""
    
    # create clip dir
    dir_path = clip_dir_path(path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create one sec clips from src
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Clips for " + video_id(path)):
        clip_path = dir_path + "/" + '%05d' % i + ".mp4"    
        if not os.path.exists(clip_path):
            cmd = "ffmpeg -v error -y -i " + path + " -ss " + str(i) + " -t 1 " + clip_path
            os.system(cmd)

def create_frames(path):
    """given a path to a video create_frames writes frames from previous generated 
    clips.  create_clips must be run before create_frames.  Frames are saved in the 
    following format ./data/frames/:video_id/:clip_id/:frame_id.jpg"""
        
    # create frame dir
    vid_id = video_id(path)
    dir_path = "./data/frames/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create frames from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Frames for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        frame_dir_path = dir_path + "/" + '%05d' % i
        if not os.path.exists(frame_dir_path):
            os.makedirs(frame_dir_path)
            cmd = "ffmpeg -v error -y -i " + clip_path + " -r 5.0 " + frame_dir_path + "/%5d.jpg"
            os.system(cmd)
            
            # resize frames to 299x299 for InceptionV3
            frame_paths = glob.glob(frame_dir_path + "/*.jpg")
            for frame_path in frame_paths:
                # resize, preserving aspect ratio
                cmd = "convert " + frame_path + " -resize 299x299 " + frame_path
                os.system(cmd)
                # then pad to exactly 299x299 with a black background
                cmd = "convert " + frame_path + " -gravity center -background black -extent 299x299 " + frame_path
                os.system(cmd)

def create_spectrograms(path):
    """given a path to a video create_spectrograms writes spectrograms from previous generated 
    clips.  create_clips must be run before create_spectrograms.  Spectrograms are saved in the 
    following format ./data/audio/:video_id/:clip_id.png"""
    
    # create audio dir
    vid_id = video_id(path)
    dir_path = "./data/audio/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create spectrogram from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Spectrograms for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        spec_path = dir_path + "/" + '%05d' % i + ".png"
        if not os.path.exists(spec_path):
            cmd = "ffmpeg -v error -y -i " + clip_path + " -lavfi showspectrumpic=s=32x32:legend=false " + spec_path
            os.system(cmd)


extractor = Extractor()

def create_features(path):
    """given a path to a video create_features writes inceptionV3 feature outputs from previous generated 
    clips.  create_clips and create_frames must be run before create_features.  Feature outputs are saved 
    in the following format ./data/features/:video_id/:clip_id/:frame_id.txt.gz"""
        
    # create feature dir
    vid_id = video_id(path)
    dir_path = "./data/features/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)    
    
    # save feature array for every frame
    video_len = video_length(path)    
    with tqdm_notebook(total=video_len, desc="Features for " + vid_id) as pbar:
        for root, dirs, files in os.walk('./data/frames/'+ vid_id):
            for f in files:
                if f.endswith(".jpg"):
                    frame_path = root + "/" + f
                    feature_path = frame_path.replace("frames", "features").replace("jpg", "txt.gz")
                    feature_dir = root.replace("frames", "features")
                    if not os.path.exists(feature_dir):
                        os.makedirs(feature_dir)
                    if not os.path.exists(feature_path):
                        features = extractor.extract(frame_path)
                        np.savetxt(feature_path, features)
            pbar.update(1)

# create assets from folder of videos.  This takes a LONG TIME.
video_paths = glob.glob("./data/videos/*.mp4")
videos_len = len(video_paths)
for i in tqdm_notebook(xrange(videos_len), desc="Preprocessing Videos"):
    path = video_paths[i]
    create_clips(path)
    create_frames(path)
    create_spectrograms(path)
    create_features(path)
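
Once the loop above has finished, a quick sanity check (a minimal sketch, assuming the directory layout described earlier) is to count how many per-video asset directories were created for each output type:

for kind in ["clips", "frames", "audio", "features"]:
    print kind + ": " + str(len(glob.glob("./data/" + kind + "/*")))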



Create Labels

Labels are generated from the labelmaker's CSV export of its internal SQLite database. Each row pairs a video id and clip id with an integer class label (0 = forehand, 1 = backhand, 2 = volley, 3 = serve, matching the titles list used in the prediction step below). Labels are shuffled and divided into training, validation, and test sets at a ratio of roughly 3:1:1.


In [2]:
import pandas as pd
import glob
import numpy as np

# read in and shuffle data
labels = pd.read_csv("./labelmaker/labels.csv").as_matrix()
print "Labels Shape: {}".format(labels.shape)
np.random.seed(0)
np.random.shuffle(labels)

# split labels into train, validation, and test sets
div = len(labels) // 5
train_labels = labels[0:div*3,:]
val_labels = labels[div*3:div*4,:]
test_labels = labels[div*4:,:]

print "Trainging Labels Shape: {}".format(train_labels.shape)
print "Validation Labels Shape: {}".format(val_labels.shape)
print "Test Labels Shape: {}".format(test_labels.shape)


Labels Shape: (926, 3)
Training Labels Shape: (555, 3)
Validation Labels Shape: (185, 3)
Test Labels Shape: (186, 3)

Model

The Keras model is a Sequential model with two LSTM layers, which capture the temporal ordering of the frames, followed by two Dense layers and a softmax output layer. The input shape of (7, 2048) represents seven frames per clip, each encoded as a 2048-dimensional feature vector generated by InceptionV3. The final 4-element output vector holds the predicted probability of each shot category.
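
Spelling out the tensor shapes implied by the layer stack (batch dimension omitted; flattening the 7 x 512 LSTM output gives 3584 values):

    input                          (7, 2048)   seven InceptionV3 vectors per clip
    LSTM(512, return_sequences)    (7, 512)
    LSTM(512, return_sequences)    (7, 512)
    Flatten                        (3584,)
    Dense(512) + Dropout(0.5)      (512,)
    Dense(512) + Dropout(0.5)      (512,)
    Dense(4, softmax)              (4,)        one probability per shot category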


In [47]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten, GRU
from keras import backend as K

model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(7, 2048)),
    LSTM(512, return_sequences=True, input_shape=(7, 512)),  
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),    
    Dense(4, activation='softmax')
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

print "Model Compiled"


Model Compiled

Match Labels

This routine retrieves the features from disk and pairs them with their one-hot encoded labels. Currently all datasets are loaded into memory, but with enough videos the code should be switched to a Keras generator (a sketch of that approach follows the cell below).


In [5]:
def one_hot(i):
    """converts an integer class label into a one hot encoded vector of length 4"""
    return np.array([int(i==0),int(i==1),int(i==2),int(i==3)])

def get_features(labels):
    """loads the saved InceptionV3 feature vectors for each labeled clip and pairs 
    them with one hot encoded labels"""
    x, y = [], []
    for i in xrange(len(labels)):
        video_id = labels[i][0]
        clip_id = labels[i][1]
        label = labels[i][2]

        # load the seven per-frame feature vectors for this clip
        features = []
        for frame_num in range(1, 8):
            fname = "./data/features/" + video_id + "/" + '%05d' % clip_id + "/" + '%05d' % frame_num + ".txt.gz"
            f = np.loadtxt(fname)
            features.append(f)

        x.append(features)
        y.append(one_hot(label))
    return np.array(x), np.array(y)

print "Getting features"

X_train, Y_train = get_features(train_labels)
X_val, Y_val = get_features(val_labels)

print X_train.shape
print Y_train.shape


Getting features
(555, 7, 2048)
(555, 4)
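
As noted above, once the dataset outgrows memory the in-memory loading can be swapped for a generator. A minimal sketch of that approach, reusing get_features from the cell above (the batch size of 32 is an arbitrary example, and this path was not used for the results in this notebook):

def feature_generator(labels, batch_size=32):
    """yields (features, labels) batches indefinitely so the whole dataset 
    never has to be held in memory"""
    while True:
        for start in xrange(0, len(labels), batch_size):
            yield get_features(labels[start:start + batch_size])

# model.fit_generator(feature_generator(train_labels),
#                     steps_per_epoch=len(train_labels) // 32,
#                     epochs=30,
#                     validation_data=feature_generator(val_labels),
#                     validation_steps=len(val_labels) // 32)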

Training

This routine trains the model and logs progress to the console and TensorBoard. After training is complete, the model is saved with the current timestamp in its filename to distinguish training runs.
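
The TensorBoard logs written to ./logs can be viewed during or after training with the standard TensorBoard command line tool (assuming TensorBoard is installed alongside TensorFlow):

tensorboard --logdir ./logs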


In [49]:
from keras.callbacks import TensorBoard
import time
import numpy as np

tensorboard = TensorBoard(log_dir='./logs', 
                          histogram_freq=0,
                          write_graph=True, 
                          write_images=True)

model.fit(X_train, 
          Y_train, 
          batch_size=100, 
          epochs=30, 
          verbose=2, 
          callbacks=[tensorboard], 
          validation_data=(X_val, Y_val))

file_name = "shot_classifier_" + str(int(time.time())) + ".h5"
model.save(file_name)
print "Model Saved"


Train on 555 samples, validate on 185 samples
Epoch 1/30
9s - loss: 0.8594 - acc: 0.5694 - val_loss: 0.9663 - val_acc: 0.5784
Epoch 2/30
8s - loss: 0.8299 - acc: 0.5964 - val_loss: 1.0400 - val_acc: 0.5622
Epoch 3/30
10s - loss: 0.8315 - acc: 0.6216 - val_loss: 0.8450 - val_acc: 0.6054
Epoch 4/30
9s - loss: 0.7997 - acc: 0.6342 - val_loss: 0.9109 - val_acc: 0.5189
Epoch 5/30
8s - loss: 0.7696 - acc: 0.6126 - val_loss: 0.7708 - val_acc: 0.6865
Epoch 6/30
8s - loss: 0.7296 - acc: 0.6288 - val_loss: 0.7850 - val_acc: 0.6000
Epoch 7/30
9s - loss: 0.6696 - acc: 0.6595 - val_loss: 0.7394 - val_acc: 0.6486
Epoch 8/30
8s - loss: 0.6241 - acc: 0.6937 - val_loss: 0.7530 - val_acc: 0.6649
Epoch 9/30
8s - loss: 0.6476 - acc: 0.6865 - val_loss: 1.0272 - val_acc: 0.6324
Epoch 10/30
8s - loss: 0.6775 - acc: 0.6775 - val_loss: 0.7947 - val_acc: 0.6432
Epoch 11/30
8s - loss: 0.6557 - acc: 0.6739 - val_loss: 0.7841 - val_acc: 0.6541
Epoch 12/30
8s - loss: 0.5822 - acc: 0.7261 - val_loss: 0.8010 - val_acc: 0.6541
Epoch 13/30
9s - loss: 0.5048 - acc: 0.7622 - val_loss: 0.8615 - val_acc: 0.6865
Epoch 14/30
8s - loss: 0.5487 - acc: 0.7550 - val_loss: 0.9534 - val_acc: 0.6216
Epoch 15/30
8s - loss: 0.6183 - acc: 0.7495 - val_loss: 0.9482 - val_acc: 0.5622
Epoch 16/30
8s - loss: 0.5665 - acc: 0.7405 - val_loss: 0.7669 - val_acc: 0.7243
Epoch 17/30
9s - loss: 0.5163 - acc: 0.7676 - val_loss: 0.8521 - val_acc: 0.6919
Epoch 18/30
8s - loss: 0.4823 - acc: 0.7766 - val_loss: 1.2364 - val_acc: 0.6703
Epoch 19/30
8s - loss: 0.5358 - acc: 0.7676 - val_loss: 0.8454 - val_acc: 0.7081
Epoch 20/30
8s - loss: 0.4224 - acc: 0.8234 - val_loss: 0.7991 - val_acc: 0.7405
Epoch 21/30
9s - loss: 0.3806 - acc: 0.8324 - val_loss: 0.9672 - val_acc: 0.6486
Epoch 22/30
9s - loss: 0.4048 - acc: 0.8162 - val_loss: 0.8924 - val_acc: 0.6973
Epoch 23/30
9s - loss: 0.4221 - acc: 0.8198 - val_loss: 0.7659 - val_acc: 0.7297
Epoch 24/30
9s - loss: 0.3954 - acc: 0.8342 - val_loss: 0.8665 - val_acc: 0.7297
Epoch 25/30
8s - loss: 0.3279 - acc: 0.8631 - val_loss: 0.9752 - val_acc: 0.7081
Epoch 26/30
8s - loss: 0.3267 - acc: 0.8631 - val_loss: 1.2729 - val_acc: 0.6865
Epoch 27/30
9s - loss: 0.3667 - acc: 0.8486 - val_loss: 1.4754 - val_acc: 0.5676
Epoch 28/30
8s - loss: 0.4259 - acc: 0.8234 - val_loss: 0.9472 - val_acc: 0.6054
Epoch 29/30
8s - loss: 0.4125 - acc: 0.8072 - val_loss: 1.0666 - val_acc: 0.6973
Epoch 30/30
9s - loss: 0.5015 - acc: 0.7874 - val_loss: 0.8595 - val_acc: 0.7027
Model Saved

Prediction

This routine evaluates the saved model on the held-out test set using the Keras predict method. Overall accuracy and a confusion matrix are displayed to show how well the model performs on unseen data.


In [6]:
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def reverse_one_hot(val):
    """returns the index of the largest value in a one hot / softmax vector"""
    return int(np.argmax(val))

def normalize_labels(Y):
    """converts an array of one hot / softmax vectors into integer class labels"""
    return np.array([reverse_one_hot(v) for v in Y])

X_test, Y_test = get_features(test_labels)
model = load_model("shot_classifier_1501284149.h5")
Y_pred = model.predict(X_test, verbose=2)

Y_test_norm = normalize_labels(Y_test)
Y_pred_norm = normalize_labels(Y_pred)

print "Overall Accuracy: " + str(accuracy_score(Y_test_norm, Y_pred_norm))

con_m = confusion_matrix(Y_test_norm, Y_pred_norm)

titles = ["forehand", "backhand", "volley", "serve"]

for i in range(4):
    for j in range(4):
        actual = titles[i]
        predicted = titles[j]
        print "predicted " + predicted + " when " + actual + " " + str(con_m[i][j])


Overall Accuracy: 0.672043010753
predicted forehand when forehand 52
predicted backhand when forehand 12
predicted volley when forehand 1
predicted serve when forehand 6
predicted forehand when backhand 28
predicted backhand when backhand 15
predicted volley when backhand 0
predicted serve when backhand 4
predicted forehand when volley 5
predicted backhand when volley 0
predicted volley when volley 20
predicted serve when volley 2
predicted forehand when serve 3
predicted backhand when serve 0
predicted volley when serve 0
predicted serve when serve 38
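
The same confusion matrix can also be printed as a labeled table, for example with pandas (an optional sketch, not part of the run above; it reuses the con_m and titles variables from the prediction cell, with rows as actual classes and columns as predictions):

import pandas as pd
print pd.DataFrame(con_m, index=titles, columns=titles)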