In [1]:
# All the audio processing tools are wrapped in a Python module called AudioPipe
import AudioPipe.speaker.recognition as SR # Speaker Recognition Module
import AudioPipe.fingerprint.panako as FP # Acoustic Fingerprinting Module
from AudioPipe.speaker.silence import remove_silence # tool for removing silence from the audio; not needed here
import numpy as np
from AudioPipe.features import mfcc # Feature Extraction Module, part of the shared preprocessing
import scipy.io.wavfile as wav 
from AudioPipe.speaker.rec import dia2spk, getspk # Speaker Recognition using diarization results
from AudioPipe.utils.utils import video2audio # Format converting module, part of the shared preprocessing
import commands, os
from AudioPipe.diarization.diarization import Diarization # Speaker Diarization Module
import AudioPipe.data.manage as DM # Data Management Module
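# The remaining imports (numpy, mfcc, wav) support manual feature extraction;
# a minimal sketch, assuming AudioPipe's mfcc follows the common
# mfcc(signal, samplerate) signature (the file path is hypothetical):
(rate, sig) = wav.read("Data/Audio/example.wav")
feat = mfcc(sig, rate) # feature matrix, one row of cepstral coefficients per frame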

In [2]:
# Select the video file to be processed
Video_node = DM.Node("Data/Video/",".mp4")
name = "2015-08-07_0050_US_FOX-News_US_Presidential_Politics"


(0, '')

In [3]:
# Select the file containing the meta information
Meta_node = DM.Node("Data/RedHen/",".seg")
meta = Meta_node.Pick(name)


(0, '')

In [4]:
# Store the fingerprint of the video
FP_node = DM.Node("Data/Fingerprint/")
output, err, exitcode = Video_node.Flow(FP.Store, name, FP_node, [])


(0, '')

In [5]:
# Convert the video to audio
Audio_node = DM.Node("Data/Audio/", ".wav")
audio = Video_node.Flow(video2audio, name, Audio_node, [Audio_node.ext])


(0, '')
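# For reference, a rough shell equivalent of the step above, assuming
# video2audio wraps ffmpeg (the In [6] log shows the resulting wav is
# 44.1 kHz pcm_s16le; the exact flags the module passes are not shown here):
import subprocess
subprocess.call(["ffmpeg", "-i", "Data/Video/" + name + ".mp4",
                 "-vn", "-acodec", "pcm_s16le", "-ar", "44100",
                 "Data/Audio/" + name + ".wav"])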

In [6]:
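# Speaker diarization: segment the audio and cluster it into speaker turns,
# written out as an .rttm file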
Dia_node = DM.Node("Data/Diarization/", ".rttm")
args = dict(init_cluster=20, dest_mfcc='Data/MFCC', dest_cfg="Data/Model/DiaCfg")
dia = Audio_node.Flow(Diarization, name, Dia_node, args)


(0, '')
(0, '')
(0, '')
(0, '')
(0, '')
(0, '')
Now start extracting mfcc features
(0, "ffmpeg version 2.8.2 Copyright (c) 2000-2015 the FFmpeg developers\n  built with gcc 4.9.3 (GCC)\n  configuration: --prefix=/usr/local/ffmpeg/2.8.2 --enable-shared\n  libavutil      54. 31.100 / 54. 31.100\n  libavcodec     56. 60.100 / 56. 60.100\n  libavformat    56. 40.101 / 56. 40.101\n  libavdevice    56.  4.100 / 56.  4.100\n  libavfilter     5. 40.101 /  5. 40.101\n  libswscale      3.  1.101 /  3.  1.101\n  libswresample   1.  2.101 /  1.  2.101\nGuessed Channel Layout for  Input Stream #0.0 : stereo\nInput #0, wav, from 'Data/Audio/2015-08-07_0050_US_FOX-News_US_Presidential_Politics.wav':\n  Metadata:\n    encoder         : Lavf56.40.101\n  Duration: 02:09:53.41, bitrate: 1411 kb/s\n    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 44100 Hz, 2 channels, s16, 1411 kb/s\nOutput #0, wav, to '/home/hxx124/Pipeline/Audio/Pipeline/AudioPipe/diarization/temp_audio/2015-08-07_0050_US_FOX-News_US_Presidential_Politics.wav':\n  Metadata:\n    ISFT            : Lavf56.40.101\n    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, stereo, s16, 512 kb/s\n    Metadata:\n      encoder         : Lavc56.60.100 pcm_s16le\nStream mapping:\n  Stream #0:0 -> #0:0 (pcm_s16le (native) -> pcm_s16le (native))\nPress [q] to stop, [?] for help\nsize=   14714kB time=00:03:55.42 bitrate= 512.0kbits/s    \rsize=   30684kB time=00:08:10.93 bitrate= 512.0kbits/s    \rsize=   46662kB time=00:12:26.59 bitrate= 512.0kbits/s    \rsize=   62650kB time=00:16:42.40 bitrate= 512.0kbits/s    \rsize=   74731kB time=00:19:55.68 bitrate= 512.0kbits/s    \rsize=   90765kB time=00:24:12.24 bitrate= 512.0kbits/s    \rsize=  106783kB time=00:28:28.52 bitrate= 512.0kbits/s    \rsize=  122835kB time=00:32:45.35 bitrate= 512.0kbits/s    \rsize=  135040kB time=00:36:00.63 bitrate= 512.0kbits/s    \rsize=  151059kB time=00:40:16.94 bitrate= 512.0kbits/s    \rsize=  167118kB time=00:44:33.89 bitrate= 512.0kbits/s    \rsize=  183165kB time=00:48:50.63 bitrate= 512.0kbits/s    \rsize=  196599kB time=00:52:25.58 bitrate= 512.0kbits/s    \rsize=  212644kB time=00:56:42.30 bitrate= 512.0kbits/s    \rsize=  228718kB time=01:00:59.48 bitrate= 512.0kbits/s    \rsize=  244823kB time=01:05:17.15 bitrate= 512.0kbits/s    \rsize=  260969kB time=01:09:35.50 bitrate= 512.0kbits/s    \rsize=  272834kB time=01:12:45.35 bitrate= 512.0kbits/s    \rsize=  288877kB time=01:17:02.02 bitrate= 512.0kbits/s    \rsize=  304946kB time=01:21:19.13 bitrate= 512.0kbits/s    \rsize=  321036kB time=01:25:36.57 bitrate= 512.0kbits/s    \rsize=  333275kB time=01:28:52.39 bitrate= 512.0kbits/s    \rsize=  349378kB time=01:33:10.04 bitrate= 512.0kbits/s    \rsize=  365469kB time=01:37:27.50 bitrate= 512.0kbits/s    \rsize=  381584kB time=01:41:45.33 bitrate= 512.0kbits/s    \rsize=  394391kB time=01:45:10.25 bitrate= 512.0kbits/s    \rsize=  410462kB time=01:49:27.39 bitrate= 512.0kbits/s    \rsize=  426536kB time=01:53:44.57 bitrate= 512.0kbits/s    \rsize=  442617kB time=01:58:01.87 bitrate= 512.0kbits/s    \rsize=  458722kB time=02:02:19.54 bitrate= 512.0kbits/s    \rsize=  471477kB time=02:05:43.62 bitrate= 512.0kbits/s    \rsize=  487088kB time=02:09:53.40 bitrate= 512.0kbits/s    \nvideo:0kB audio:487088kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000016%")
(0, '')
Now start preparing configure file
Now start clustering
(0, "/usr/bin/ld: skipping incompatible /usr/local/cuda-7.0/lib/libcudart.so when searching for -lcudart\n/usr/bin/ld: skipping incompatible /usr/local/cuda-7.0/lib/libcudart.so when searching for -lcudart\ng++ -pthread -fno-strict-aliasing -g -O2 -g -fwrapv -O3 -Wall -fPIC -pthread -shared -Xlinker -export-dynamic -DNDEBUG -I/usr/local/python/2.7.8/include/python2.7 -I/usr/local/cuda-7.0/include /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/46c837d5cb878372cdb2886923bd37a6/module.o /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/7ceed7a1666bbc7019867d851e0660de/gpu.o -L/usr/local/python/2.7.8/lib -L/usr/local/cuda-7.0/lib -L/usr/local/cuda-7.0/lib64 -lcuda -lcudart -lboost_python -lpython2.7 -lpthread -ldl -lutil -o /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/codepy.temp.46c837d5cb878372cdb2886923bd37a6.7ceed7a1666bbc7019867d851e0660de.module.so\nUsing the config file specified: ' Data/Model/DiaCfg/2015-08-07_0050_US_FOX-News_US_Presidential_Politics.cfg '\nSpeech file spnsp_file not specified, continuing without it...\n...Reading in HTK feature file...\nINFO: total number of frames read:  779339\n...Reading in speech/nonspeech file...\n ====================== CLUSTERING ON SUBSET ====================== \nGMM SPECIALIZER: USING CUDA\ng++ -pthread -fno-strict-aliasing -g -O2 -g -fwrapv -O3 -Wall -fPIC -pthread -shared -Xlinker -export-dynamic -DNDEBUG -I/usr/local/python/2.7.8/include/python2.7 -I/home/hxx124/myPython/virtualenv-1.9/ENV/lib/python2.7/site-packages/numpy/core/include -I/usr/local/cuda-7.0/include /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/38d44195c321f1de7e40420e316a4733/module.o /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/11dee452985a0f3b9b6a40453d090df4/gpu.o -L/usr/local/python/2.7.8/lib -L/usr/local/cuda-7.0/lib -L/usr/local/cuda-7.0/lib64 -lcuda -lcudart -lboost_python -lpython2.7 -lpthread -ldl -lutil -o /tmp/pbsjobs.460477.hpc/asp_cache_hxx124/codepy.temp.38d44195c321f1de7e40420e316a4733.11dee452985a0f3b9b6a40453d090df4.module.so\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 15]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 15, 10]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 15, 10, 10]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 15, 10, 15]\n size of each cluster: [5, 5, 5, 5, 5, 5, 5, 10, 10, 15, 10, 20]\n size of each cluster: [5, 5, 5, 5, 5, 5, 10, 15, 10, 20, 15]\n size of each cluster: [5, 5, 5, 5, 5, 5, 10, 15, 10, 35]\n size of each cluster: [5, 5, 5, 5, 5, 15, 10, 35, 15]\n size of each cluster: [5, 5, 5, 5, 10, 35, 15, 20]\n size of each cluster: [5, 5, 5, 35, 15, 20, 15]\n size of each cluster: [5, 5, 35, 15, 20, 20]\n size of each cluster: [5, 5, 35, 20, 35]\n size of each cluster: [5, 5, 35, 55]\n size of each cluster: [5, 55, 40]\n size of each cluster: [40, 60]\n size of each cluster: [100]\n=== Total clustering time:  935.556594133\n=== Final size of each cluster: [100]\n...Writing out RTTM file...\nDONE writing RTTM file")
(0, '')
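# The .rttm output is plain text; a minimal sketch of reading the speaker
# turns it contains (standard RTTM SPEAKER fields are:
# type file chnl tbeg tdur ortho stype name conf):
rttm_fn = "Data/Diarization/" + name + ".rttm"
for line in open(rttm_fn):
    fields = line.split()
    if fields and fields[0] == "SPEAKER":
        start, dur, spk = float(fields[3]), float(fields[4]), fields[7]
        print spk, start, start + dur # one speaker turn per SPEAKER line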

In [7]:
# Below is an example of training a fresh gender model
# However, make sure the sample rate of your training data is the same as that of your test data
model_gender_dir = "Data/Model/Gender/"
model_gender_nm = 'gender.model'
model_gender = model_gender_dir+model_gender_nm
if not os.path.isfile(model_gender):
    Gender = SR.GMMRec() # Create a new recognizer
    female_fn = model_gender_dir+'female.wav' # choose the training file for female
    male_fn = model_gender_dir+'male.wav' # choose the training file for male
    Gender.enroll_file('Female', female_fn) # enroll the female audio 
    Gender.enroll_file('Male', male_fn) # enroll the male audio 
    Gender.train() # train the GMMs after you enroll all the training data 
    Gender.dump(model_gender) # save the trained model into a file named "gender.model" for future use
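# To reuse the model later without retraining, reload the dumped file; a
# sketch assuming dump() serializes via pickle (the loading API of GMMRec
# is not shown in this notebook):
import pickle
with open(model_gender, 'rb') as f:
    Gender = pickle.load(f)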

In [8]:
# Gender Identification based on Speaker Diarization
Gen_node = DM.Node("Data/Gender/",".gen")
gen = Audio_node.Flow(dia2spk, name, Gen_node, [model_gender, dia, meta, Gen_node.ext])


(0, '')
(0, '')

In [9]:
# Gender Identification without Speaker Diarization
Genr_node = DM.Node("Data/Gender/",".genr")
gen = Audio_node.Flow(getspk, name, Genr_node, [model_gender, meta, Genr_node.ext])


(0, '')
(0, '')

In [10]:
# You can train a fresh speaker model as follows:
model_speaker_dir = "Data/Model/Speaker/" # The training data should already be here!
model_speaker_nm = 'speaker.model'
model_speaker = model_speaker_dir+model_speaker_nm
if not os.path.isfile(model_speaker):
    Speaker = SR.GMMRec() # Create a new recognizer
    other_fn = model_speaker_dir+'Imposter.wav' # choose the training file for the imposter (background) class
    trump_fn = model_speaker_dir+'Trump.wav' # choose the training file for Trump
    Speaker.enroll_file('Other', other_fn) # enroll the imposter audio
    Speaker.enroll_file('Trump', trump_fn) # enroll the Trump audio
    # You can add more speakers here following the syntax above
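    # e.g., a third speaker could be enrolled like this (hypothetical file):
    # Speaker.enroll_file('Clinton', model_speaker_dir + 'Clinton.wav')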
    Speaker.train() # train the GMMs after you enroll all the training data 
    Speaker.dump(model_speaker) # save the trained model into a file named "speaker.model" for future use

In [11]:
# Speaker Recognition based on Speaker Diarization
Spk_node = DM.Node("Data/Speaker/",".spk")
spk = Audio_node.Flow(dia2spk, name, Spk_node, [model_speaker, dia, meta, Spk_node.ext])


(0, '')
(0, '')

In [12]:
# Speaker Recognition without Speaker Diarization
Spkr_node = DM.Node("Data/Speaker/",".spkr")
spkr = Audio_node.Flow(getspk, name, Spkr_node, [model_speaker, meta, Spkr_node.ext])


(0, '')
(0, '')