The preprocessing routine scans the ./data/videos directory and creates usable datasets for the model-training steps that follow. It splits each video into one-second clips and, from those clips, generates frames, spectrograms, and InceptionV3 feature vectors. Clips and frames aren't consumed directly by the model and spectrograms aren't used at all, but all three are kept for labeling, debugging, and future model upgrades. The feature vectors are the only output used for training. They are computed during preprocessing because doing so drastically reduces training time, which allows for a faster model iteration cycle. This methodology was inspired by this repo and its associated blog post: https://github.com/harvitronix/five-video-classification-methods
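For reference, here is the on-disk layout the pipeline below produces for a single video (a sketch; the id abc123 is a hypothetical placeholder):

./data/videos/abc123.mp4                     source video
./data/clips/abc123/00000.mp4                one clip per second
./data/frames/abc123/00000/00001.jpg         several frames per clip (5 fps)
./data/audio/abc123/00000.png                one spectrogram per clip
./data/features/abc123/00000/00001.txt.gz    one 2048-dim vector per frame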
In [3]:
import glob
import subprocess
import json
import os
import csv
from tqdm import tqdm_notebook
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model, load_model
from keras.layers import Input
import numpy as np
# Adapted from https://github.com/harvitronix/five-video-classification-methods
class Extractor():
    """Extractor builds an InceptionV3 model without the top classification
    layers and extracts a feature array from an image."""

    def __init__(self):
        # Get model with pretrained ImageNet weights.
        base_model = InceptionV3(
            weights='imagenet',
            include_top=True
        )
        # We'll extract features at the final pool layer.
        self.model = Model(
            inputs=base_model.input,
            outputs=base_model.get_layer('avg_pool').output
        )

    def extract(self, image_path):
        # Load and preprocess the image to InceptionV3's expected input.
        img = image.load_img(image_path, target_size=(299, 299))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        # Get the prediction: a (2048,) feature vector.
        features = self.model.predict(x)
        features = features[0]
        return features
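# Usage sketch (the frame path below is a hypothetical placeholder; any RGB
# image works, since load_img resizes to 299x299):
#   ex = Extractor()
#   vec = ex.extract("./data/frames/abc123/00000/00001.jpg")
#   vec.shape  # (2048,)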
def video_length(path):
    """Returns the length of the video in seconds."""
    cmd = "ffprobe -i " + path + " -show_entries format=duration -v quiet -of json"
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout
    output = pipe.read()
    d = json.loads(output)
    s = d["format"]["duration"]
    return int(float(s))
def video_id(path):
    """Returns the id of a video from a path in this format: ./data/videos/:video_id"""
    return path.split("/")[3].split(".")[0]

def clip_dir_path(path):
    """Returns the path to the dir containing all clips for a video: ./data/clips/:video_id"""
    vid_id = video_id(path)
    return "./data/clips/" + vid_id
def create_clips(path):
    """Given a path to a video, create_clips writes one-second video segments to disk
    in the following format: ./data/clips/:video_id/:clip_id.mp4"""
    # create clip dir
    dir_path = clip_dir_path(path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # create one-sec clips from the source video, seeking to second i
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Clips for " + video_id(path)):
        clip_path = dir_path + "/" + '%05d' % i + ".mp4"
        if not os.path.exists(clip_path):
            cmd = "ffmpeg -v error -y -i " + path + " -ss " + str(i) + " -t 1 " + clip_path
            os.system(cmd)
def create_frames(path):
    """Given a path to a video, create_frames writes frames from previously generated
    clips. create_clips must be run before create_frames. Frames are saved in the
    following format: ./data/frames/:video_id/:clip_id/:frame_id.jpg"""
    # create frame dir
    vid_id = video_id(path)
    dir_path = "./data/frames/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # create frames from each clip at 5 frames per second
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Frames for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        frame_dir_path = dir_path + "/" + '%05d' % i
        if not os.path.exists(frame_dir_path):
            os.makedirs(frame_dir_path)
            # zero-padded frame names so the feature paths line up later
            cmd = "ffmpeg -v error -y -i " + clip_path + " -r 5.0 " + frame_dir_path + "/%05d.jpg"
            os.system(cmd)
            # resize frames to fit within 299x299 for InceptionV3
            frame_paths = glob.glob(frame_dir_path + "/*.jpg")
            for frame_path in frame_paths:
                # note: a distinct name is used here; reusing `path` would
                # clobber the function argument for later loop iterations
                # resize first, preserving aspect ratio
                cmd = "convert " + frame_path + " -resize 299x299 " + frame_path
                os.system(cmd)
                # then pad with a black background to exactly 299x299
                cmd = "convert " + frame_path + " -gravity center -background black -extent 299x299 " + frame_path
                os.system(cmd)
def create_spectrograms(path):
    """Given a path to a video, create_spectrograms writes spectrograms from previously
    generated clips. create_clips must be run before create_spectrograms. Spectrograms
    are saved in the following format: ./data/audio/:video_id/:clip_id.png"""
    # create audio dir
    vid_id = video_id(path)
    dir_path = "./data/audio/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # create a 32x32 spectrogram image from each clip's audio track
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Spectrograms for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        spec_path = dir_path + "/" + '%05d' % i + ".png"
        if not os.path.exists(spec_path):
            cmd = "ffmpeg -v error -y -i " + clip_path + " -lavfi showspectrumpic=s=32x32:legend=false " + spec_path
            os.system(cmd)
extractor = Extractor()

def create_features(path):
    """Given a path to a video, create_features writes InceptionV3 feature outputs from
    previously generated clips. create_clips and create_frames must be run before
    create_features. Feature outputs are saved in the following format:
    ./data/features/:video_id/:clip_id/:frame_id.txt.gz"""
    # create feature dir
    vid_id = video_id(path)
    dir_path = "./data/features/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # save a feature array for every frame of every clip
    video_len = video_length(path)
    with tqdm_notebook(total=video_len, desc="Features for " + vid_id) as pbar:
        for root, dirs, files in os.walk('./data/frames/' + vid_id):
            for f in files:
                if f.endswith(".jpg"):
                    frame_path = root + "/" + f
                    feature_path = frame_path.replace("frames", "features").replace("jpg", "txt.gz")
                    feature_dir = root.replace("frames", "features")
                    if not os.path.exists(feature_dir):
                        os.makedirs(feature_dir)
                    if not os.path.exists(feature_path):
                        features = extractor.extract(frame_path)
                        # np.savetxt gzips automatically for .gz extensions
                        np.savetxt(feature_path, features)
            # one progress tick per clip directory walked
            pbar.update(1)
# create assets from the folder of videos. This takes a LONG TIME.
video_paths = glob.glob("./data/videos/*.mp4")
videos_len = len(video_paths)
for i in tqdm_notebook(xrange(videos_len), desc="Preprocessing Videos"):
    path = video_paths[i]
    create_clips(path)
    create_frames(path)
    create_spectrograms(path)
    create_features(path)
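Because get_features below loads exactly seven frame features per clip, a quick post-run check can flag clips that came up short (a minimal sketch, assuming the layout described above):

import glob
for clip_dir in glob.glob("./data/features/*/*"):
    n = len(glob.glob(clip_dir + "/*.txt.gz"))
    if n < 7:
        print clip_dir + " has only " + str(n) + " feature files"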
In [2]:
import pandas as pd
import glob
import numpy as np
# read in and shuffle the labels
labels = pd.read_csv("./labelmaker/labels.csv").as_matrix()
print "Labels Shape: {}".format(labels.shape)
np.random.seed(0)
np.random.shuffle(labels)

# split labels into train (3/5), validation (1/5), and test (1/5) sets
div = len(labels) // 5
train_labels = labels[0:div*3,:]
val_labels = labels[div*3:div*4,:]
test_labels = labels[div*4:,:]
print "Training Labels Shape: {}".format(train_labels.shape)
print "Validation Labels Shape: {}".format(val_labels.shape)
print "Test Labels Shape: {}".format(test_labels.shape)
The Keras model is a Sequential stack: two LSTM layers that model the temporal ordering of the frames, followed by two fully connected (Dense) layers and a softmax output layer. The input shape of (7, 2048) represents seven frames per clip, each encoded as a 2048-dimensional InceptionV3 feature vector. The final 4-way softmax output gives the predicted probability of each shot category (the resulting tensor shapes are traced after the cell below).
In [47]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten

model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(7, 2048)),
    # input_shape is only needed on the first layer; Keras infers (7, 512) here
    LSTM(512, return_sequences=True),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
print "Model Compiled"
In [5]:
def one_hot(i):
    """Converts a class index 0-3 into a one-hot vector."""
    return np.array([int(i==0), int(i==1), int(i==2), int(i==3)])

def get_features(labels):
    """Loads the seven per-frame feature vectors for each labeled clip."""
    x, y = [], []
    for i in xrange(len(labels)):
        video_id = labels[i][0]
        clip_id = labels[i][1]
        label = labels[i][2]
        features = []
        # use a separate index here; reusing i would shadow the outer loop variable
        for j in range(7):
            fname = "./data/features/" + video_id + "/" + '%05d' % clip_id + "/" + '%05d' % (j+1) + ".txt.gz"
            f = np.loadtxt(fname)
            features.append(f)
        x.append(features)
        y.append(one_hot(label))
    x = np.array(x)
    return x, np.array(y)

print "Getting features"
X_train, Y_train = get_features(train_labels)
X_val, Y_val = get_features(val_labels)
print X_train.shape
print Y_train.shape
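As an aside, one_hot above is just a row of the 4x4 identity matrix, so an equivalent that generalizes to more classes is (a sketch, not used elsewhere):

np.eye(4, dtype=int)[label]  # e.g. label=2 -> array([0, 0, 1, 0])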
In [49]:
from keras.callbacks import TensorBoard
import time
import numpy as np
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)
model.fit(X_train,
          Y_train,
          batch_size=100,
          epochs=30,
          verbose=2,
          callbacks=[tensorboard],
          validation_data=(X_val, Y_val))

# save the trained model with a timestamped filename
file_name = "shot_classifier_" + str(int(time.time())) + ".h5"
model.save(file_name)
print "Model Saved"
In [6]:
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
def reverse_one_hot(val):
    """Returns the index of the largest entry (equivalent to np.argmax)."""
    hi_idx = -1
    hi = -1
    for i in range(len(val)):
        v = val[i]
        if hi == -1 or v > hi:
            hi = v
            hi_idx = i
    return hi_idx

def normalize_labels(Y):
    """Collapses one-hot / probability vectors back into class indices."""
    norm = []
    for v in Y:
        norm.append(reverse_one_hot(v))
    return np.array(norm)

X_test, Y_test = get_features(test_labels)
model = load_model("shot_classifier_1501284149.h5")
Y_pred = model.predict(X_test, verbose=2)
Y_test_norm = normalize_labels(Y_test)
Y_pred_norm = normalize_labels(Y_pred)
print "Overall Accuracy: " + str(accuracy_score(Y_test_norm, Y_pred_norm))

con_m = confusion_matrix(Y_test_norm, Y_pred_norm)
titles = ["forehand", "backhand", "volley", "serve"]
for i in range(4):
    for j in range(4):
        actual = titles[i]
        predicted = titles[j]
        print "predicted " + predicted + " when " + actual + ": " + str(con_m[i][j])