Face detection, tracking and matching:

Import dependencies:

import face_recognition
import cv2

import os
from os.path import basename
import glob
import sys
import types
import subprocess
from random import randint
import json
import gc

import skvideo.io
import numpy as np
import scipy.misc
from skimage.transform import rescale, resize, downscale_local_mean

import PIL

import keras
from keras.preprocessing import image
from keras.models import model_from_json
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.applications.inception_v3 import preprocess_input

/usr/local/lib/python3.6/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

OpenCV version check:

# Warn when the installed OpenCV is older than 3.x, which this notebook's
# cv2.CAP_PROP_* constants require.
# The version string is padded/truncated to three parts before unpacking: a
# strict three-way unpack raises ValueError for strings with more or fewer
# dot-separated components (e.g. "2.4.9.1"), which would crash the check
# instead of printing the warning.
_version_parts = cv2.__version__.split('.')
(major_ver, minor_ver, subminor_ver) = (_version_parts + ['0', '0'])[:3]

if int(major_ver) < 3:
    print ("Update OpenCV ...")

Load video files:

source = '../video/One_Direction-Drag_Me_Down.mp4'

# Open the clip with OpenCV first and fall back to scikit-video on failure.
# cv2.VideoCapture reports an unreadable source through isOpened() rather than
# by raising, so the failure is checked explicitly.
try:
    video_capture = cv2.VideoCapture(source)
    if not video_capture.isOpened():
        raise IOError("OpenCV could not open: " + source)
    print ("Imported video using OpenCV ...")
except (IOError, cv2.error):
    # NOTE(review): skvideo.io.vread returns the decoded frames rather than a
    # capture object, while the rest of this notebook calls .get()/.read() on
    # video_capture -- confirm this fallback is still usable downstream.
    video_capture =  skvideo.io.vread(source)
    print ("Imported video using sci-kit video ...")

Imported video using OpenCV ...

Initialize variables for video processing:

# Candidate optimizers. Only the object bound to ``optimizer`` is handed to
# model.compile(); the other two are kept as ready-made alternatives.
sgd = SGD(
    lr=1e-7,
    decay=0.5,
    momentum=1,
    nesterov=True,
)
rms = RMSprop(
    lr=1e-7,
    rho=0.9,
    epsilon=1e-08,
    decay=0.0,
)
ada = Adagrad(
    lr=1e-7,
    epsilon=1e-08,
    decay=0.0,
)
optimizer = sgd

# Frame count OpenCV reports for the opened video.
length = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))

# Output locations: silent processed video, extracted audio track, the muxed
# result, and the directory that receives cropped face images.
save_path = "../proc_vid.mp4"
save_audio = "../audio.wav"
save_path_w_audio = "../proc_vid_audio.mp4"
output_dir = '../output/'

# Per-frame detection state and running counters used by the processing loop.
face_locations = []
face_encodings = []
face_names = []
frame_number = 0
face_count = 0

# Frame geometry of the source video (property ids 3 and 4).
w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
print ("Source image width: "+ str(w))
print ("Source image height: "+ str(h))

# The frame rate is reused for the output writer so playback speed matches.
fps = video_capture.get(cv2.CAP_PROP_FPS)
print ("Frames per second using video.get(cv2.CAP_PROP_FPS) : {0}".format(fps))

Source image width: 1280
Source image height: 720
Frames per second using video.get(cv2.CAP_PROP_FPS) : 23.976025018098067

# Colour video writer for the annotated frames, using the source frame size
# and frame rate and the XVID four-character code.
fourcc = cv2.VideoWriter_fourcc('X', 'V', 'I', 'D')
video_writer = cv2.VideoWriter(save_path, fourcc, fps, (w, h), True)

# Reference portraits: one .jpg per known person; the file name (without the
# extension) is used as that person's display name.
reference_image_path = "../ref_img/"
file_list = glob.glob(reference_image_path + '/*.jpg')

# --- Processing switches ---------------------------------------------------
n_proc_frames = length        # stop after this many frames
inverse_scale_factor = 1      # detection runs on a frame scaled by 1/this value
resize_img = False            # enable the down-scaling above
interleaved = False           # toggle detection on/off on alternate frames
process_this_frame = True     # detection state for the current frame
use_deep_learning = True      # label faces with the Keras model
annotate = True               # draw the name label under each face
gen_train_img = True          # save each face crop into output_dir
verbose = True                # print progress messages

def compile_model(model):
    """Compile *model* in place using the module-level ``optimizer``.

    The model is configured with categorical cross-entropy loss and reports
    accuracy as its only metric. Nothing is returned.
    """
    model.compile(
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        optimizer=optimizer,
    )

def load_prediction_model(args):
    """Load the trained Keras model, its weights and its class labels.

    Args:
        args: object with ``config_file``, ``weights_file`` and
            ``labels_file`` attributes; element 0 of each is the path to the
            model architecture JSON, the weights file and the labels JSON.

    Returns:
        A ``(model, labels)`` tuple: the model rebuilt from the JSON
        architecture with the weights loaded, and the decoded labels JSON.

    Raises:
        Any exception raised while reading or parsing one of the three files
        is re-raised after a short diagnostic is printed, so a failed load
        can never fall through to an unbound ``model`` or ``labels``.
    """
    # 1. Architecture: rebuild the model from its JSON description.
    try:
        with open(args.config_file[0]) as json_file:
            model_json = json_file.read()
        model = model_from_json(model_json)
    except Exception:
        print ("Please specify a model configuration file ...")
        raise

    # 2. Weights: without this step the rebuilt model is untrained.
    try:
        model.load_weights(args.weights_file[0])
    except Exception:
        print ("Error loading model weights ...")
        raise
    print ("Loaded model weights from: " + str(args.weights_file[0]))

    # 3. Labels: later indexed with the arg-max of the prediction vector.
    try:
        with open(args.labels_file[0]) as json_file:
            labels = json.load(json_file)
    except Exception:
        print ("No labels loaded ...")
        raise
    print ("Loaded labels from: " + str(args.labels_file[0]))

    return model, labels

def gen_predict(model):
    """Compile *model* and run one dummy prediction on an all-zero image.

    The throw-away prediction makes the backend build its prediction function
    now, to avoid that delay during video capture.

    Args:
        model: Keras model to compile and warm up.
    """
    try:
        compile_model(model)
    except Exception:
        # Best effort: report the failure and still attempt the warm-up.
        print ("Model failed to compile ...")
    else:
        print ("Model successfully compiled ...")

    print ("Compiling predictor function ...")
    # NOTE(review): ``n`` is not defined in this function; it is presumably a
    # module-level input edge length (model input n x n x 3) -- confirm it is
    # set before this function is called.
    _ = model.predict(np.zeros((1, n, n, 3), dtype=np.float32), batch_size=1)
    print ("Compilation completed ...")

# Paths of the trained model artefacts. Each value is a one-element list
# because load_prediction_model() reads element 0 of every attribute.
args = types.SimpleNamespace(
    config_file=['../model/trained_config.json'],
    weights_file=['../model/trained_weights.model'],
    labels_file=['../model/trained_labels.json'],
    output_dir=['../output/'],
)

# Rebuild the classifier and load its label table.
model, labels = load_prediction_model(args)

Loaded model weights from: ../model/trained_weights.model
Loaded labels from: ../model/trained_labels.json

In OpenCV, passing `interpolation=cv2.INTER_CUBIC` to `cv2.resize` performs bicubic interpolation over a 4x4 pixel neighborhood.

# Main processing loop: detect faces in each frame, name them, optionally save
# the face crops as training images, draw the annotations and write the frame
# to the output video.

# cv2.imwrite reports failure only through its return value, so make sure the
# crop directory exists before any crop is written.
os.makedirs(output_dir, exist_ok=True)

# Encode the reference faces once, up front, instead of re-reading and
# re-encoding every reference image for each detected face in each frame.
known_names = []
known_encodings = []
if annotate:
    for file_path in file_list:
        reference_image = face_recognition.load_image_file(file_path)
        try:
            reference_face_encoding = face_recognition.face_encodings(reference_image)[0]
        except IndexError:
            # No face was found in this reference image; skip it.
            if verbose:
                print("Failed processing face encodings ...")
            continue
        if verbose:
            print ("Processed face encodings ...")
        name_ID = os.path.splitext(basename(file_path))[0]
        known_names.append(name_ID.replace("_", " "))
        known_encodings.append(reference_face_encoding)

try:
    while video_capture.isOpened():
        ret, frame = video_capture.read() # Grab a single frame of video
        frame_number += 1

        # Stop once the requested number of frames has been handled.
        if frame_number > n_proc_frames:
            if verbose:
                print ("Processed "+ str(n_proc_frames) + " frames")
            print ("Detected " + str(face_count) + " faces" )
            break

        # Check the read result before touching the frame: it is unusable
        # when the read failed.
        if not ret:
            if verbose:
                print("No frame to process ...")
            continue

        if resize_img:
            # Resize frame of video to 1/inverse_scale_factor size for faster processing
            isf = inverse_scale_factor
            small_frame = cv2.resize(frame, (0, 0), fx=(1/isf), fy=(1/isf))
        else:
            isf = 1
            small_frame = frame

        if process_this_frame:
            # OpenCV delivers BGR frames; face_recognition works on RGB images
            # (which is also what load_image_file returns for the references).
            rgb_small_frame = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB)
            # Find all the faces and face encodings in the current frame of video
            face_locations = face_recognition.face_locations(rgb_small_frame)
            face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)
            face_names = []
            if annotate or gen_train_img:
                for face_encoding in face_encodings:
                    name = "Unknown"
                    if annotate and known_encodings:
                        # The first matching reference wins; a later
                        # non-matching reference must not reset the name.
                        matches = face_recognition.compare_faces(known_encodings, face_encoding)
                        for known_name, matched in zip(known_names, matches):
                            if matched:
                                name = known_name
                                break
                    face_names.append(name)
            elif verbose:
                print ("Skipping face recognition mode ...")
        elif verbose:
            print ("Skipping frame ...")

        if interleaved:
            # Only process every other frame of video to save time
            process_this_frame = not process_this_frame

        # Display the results
        for (top, right, bottom, left), name in zip(face_locations, face_names):
            # Scale the face location back up to full-frame coordinates.
            # Multiply first, truncate last, so a fractional scale factor is
            # not rounded down before it is applied.
            top = int(top * isf)
            right = int(right * isf)
            bottom = int(bottom * isf)
            left = int(left * isf)

            # Geometry of the ellipse around the face: centre (p1, p2) and
            # half-axes (h1, h2).
            ex = left
            ey = top
            ew = int(abs(right - ex))
            eh = int(abs(bottom - ey))
            p1 = int(ew/2 + ex)
            p2 = int(eh/2 + ey)
            h1 = int(ew/2)
            h2 = int(eh/2)

            # Crop twice the detected box around the face, clamped at the
            # top/left frame border.
            square = frame[max(ey - eh//2, 0):ey + 3*eh//2,
                           max(ex - ew//2, 0):ex + 3*ew//2]

            if use_deep_learning and annotate:
                # NOTE(review): IMG_WIDTH / IMG_HEIGHT are not defined in the
                # visible part of this file; they must hold the model's input
                # size before this cell runs.
                # NOTE(review): the crop comes from the BGR frame -- confirm
                # the channel order matches what the model was trained on.
                preds_square = cv2.resize(square.astype(np.float32),
                                          dsize=(IMG_WIDTH, IMG_HEIGHT),
                                          interpolation=cv2.INTER_CUBIC)
                try:
                    _X_ = image.img_to_array(preds_square)
                    _X_ = np.expand_dims(_X_, axis=0)
                    _X_ = preprocess_input(_X_)
                    probabilities = model.predict(_X_, batch_size=1).flatten()
                    prediction = labels[np.argmax(probabilities)]
                    name = (str(prediction)).replace("_", " ")
                    print ("Face recognition using deep-learning ...")
                    print (str(prediction) + "\t" + "\t".join("%.2f" % p for p in probabilities))
                    print (str(prediction))
                except Exception:
                    # Best effort: keep the name from the reference matching.
                    print ("Failed to create a prediction ...")

            if gen_train_img:
                random_number = str(randint(10000000, 99999999))
                # Spaces in the name become underscores so the file name has
                # no blanks. The double underscore before "frame" is kept.
                crop_name = (name.replace(" ", "_") + "_" +
                             random_number +
                             "_loc_" + str(p1) + "_" +
                             str(p2) + "_" +
                             str(h1) + "_" +
                             str(h2) + "_" +
                             "_frame_%d.jpg" % face_count)
                crop_path = os.path.join(output_dir, crop_name)
                if not cv2.imwrite(crop_path, square):
                    print ("Failed to save face image: " + crop_path)
                elif verbose:
                    print ("Saved frame: "+ str(face_count)+" with face detected ..." )
                    if name != "Unknown":
                        print ("Possible match for detected face: " + str(name))
                # Draw only after saving: `square` is a view into `frame`, so
                # drawing first would paint the ellipse into the saved crop.
                # NOTE(review): the ellipse is only drawn when crops are being
                # saved, as in the original -- confirm that is intended.
                cv2.ellipse(frame, (p1, p2), (h1,h2), 0,0,360, (0,255,0), 2)

            # Count every detected face, not only those saved as crops, so the
            # final "Detected N faces" report is correct in every mode.
            face_count += 1

            if annotate:
                font = cv2.FONT_HERSHEY_DUPLEX
                cv2.rectangle(frame, (p1 - 100, bottom - 2), (p1 + 100, bottom + 33), (0, 0, 255), cv2.FILLED)
                # Draw a label with a name below the face
                cv2.putText(frame, name, (p1  - 94, bottom + 23 ), font, 0.75, (255, 255, 255), 1)
            elif verbose:
                print ("No identifiers to annotate. Try setting annotate flag to True ...")

        # Append the (possibly annotated) frame to the output video.
        try:
            video_writer.write(frame)
        except Exception:
            if verbose:
                print("Failed writing frame {} / {}".format(frame_number, length))
        else:
            if verbose:
                print("Processed frame {} / {}".format(frame_number, length))
finally:
    # Release both handles even if processing fails; the output file is only
    # finalized once the writer is released.
    video_capture.release()
    video_writer.release()

Processed face encodings ...
Face recognition using deep-learning ...
Niall_Horan	0.07	0.28	0.08	0.45	0.11
Saved frame: 0 with face detected ...
Possible match for detected face: Niall Horan
Processed face encodings ...
Processed face encodings ...
Face recognition using deep-learning ...
Niall_Horan	0.04	0.01	0.01	0.93	0.01
Saved frame: 4 with face detected ...
Possible match for detected face: Niall Horan
Face recognition using deep-learning ...
Niall_Horan	0.00	0.09	0.03	0.79	0.09
Saved frame: 5 with face detected ...
Possible match for detected face: Niall Horan
Processed face encodings ...
Processed face encodings ...
Processed face encodings ...
Processed face encodings ...
Face recognition using deep-learning ...
Niall_Horan	0.00	0.00	0.00	1.00	0.00
Saved frame: 57 with face detected ...
Possible match for detected face: Niall Horan
Face recognition using deep-learning ...
Liam_Payne	0.00	0.88	0.07	0.03	0.01
Saved frame: 58 with face detected ...
Possible match for detected face: Liam Payne
Face recognition using deep-learning ...
Louis_Tomlinson	0.01	0.25	0.39	0.21	0.14
Saved frame: 59 with face detected ...
Possible match for detected face: Louis Tomlinson
Face recognition using deep-learning ...
Harry_Styles	0.89	0.02	0.04	0.03	0.02
Saved frame: 60 with face detected ...
Possible match for detected face: Harry Styles
Processed frame 50 / 50
Processed 50 frames
Detected 61 faces

Release the handles used to read the video file or webcam:

Extract audio from a video file:

# Extract the audio track of the source video with ffmpeg.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters reach ffmpeg intact.
# -y overwrites an existing output file, matching the mux command; without it
# a stale audio file from an earlier run would be left in place.
cmd = ['ffmpeg', '-y', '-i', source,
       '-ab', '320000', '-ac', '2', '-ar', '44100', '-vn', save_audio]
print (' '.join(cmd))
if subprocess.call(cmd) != 0:
    print ('Audio extraction failed ...')

ffmpeg -i ../video/One_Direction-Drag_Me_Down.mp4 -ab 320000 -ac 2 -ar 44100 -vn ../audio.wav

Copy audio track from one video to another:

# Mux the processed (silent) video with the extracted audio track.
# The video stream is copied as-is, the audio is encoded to AAC, and
# -shortest ends the output with the shorter of the two inputs.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters reach ffmpeg intact.
cmd = ['ffmpeg', '-y', '-i', save_path, '-i', save_audio,
       '-shortest', '-c:v', 'copy', '-c:a', 'aac', '-b:a', '256k',
       save_path_w_audio]
print (' '.join(cmd))
# Report success only when ffmpeg actually exited cleanly.
if subprocess.call(cmd) == 0:
    print('Muxing completed ...')
    print('Saved output file to: %s' % (save_path_w_audio))
else:
    print('Muxing failed ...')

ffmpeg -y -i ../proc_vid.mp4 -i ../audio.wav -shortest -c:v copy -c:a aac -b:a 256k  ../proc_vid_audio.mp4
Muxing completed ...
Saved output file to: ../proc_vid_audio.mp4

Visualize deep-learning model architecture:

from keras.utils import plot_model 
import pydot 
import graphviz # apt-get install -y graphviz libgraphviz-dev 
from IPython.display import SVG 
from keras.utils.vis_utils import model_to_dot

# Save a diagram of the model architecture into the output directory.
# The directory and file name are passed to os.path.join separately so the
# path is assembled without a doubled separator.
plot_model(model, to_file=os.path.join(args.output_dir[0], 'model_face_detection.png'))
# Kept as the last expression of the cell so the notebook displays the SVG.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

