In [1]:
# import Speaker Recognition Module
import speaker.recognition as SR
Gender = SR.GMMRec()
In [2]:
import scipy.io.wavfile as wav
from speaker.silence import remove_silence
from features import mfcc
def get_mfcc(audio_path):
    (sr, sig) = wav.read(audio_path)
    if len(sig.shape) > 1:
        sig = sig[:, 0]  # keep only the first channel of multi-channel audio
    cleansig = remove_silence(sr, sig)
    mfcc_vecs = mfcc(cleansig, sr, numcep=15)
    return mfcc_vecs
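Each call returns one feature matrix per recording. As a quick sanity check (illustrative, not from the original run), the shape can be inspected; the number of rows depends on the recording length, while the 15 columns come from numcep=15:
feats = get_mfcc('./Audio/female.wav')  # same enrollment file used below
print(feats.shape)  # (number of frames, 15)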
In [3]:
import numpy as np
# Here we use MFCCs as the audio features; in theory other features such as LPC should work as well (a sketch follows this cell)
female_mfcc = np.array(get_mfcc('./Audio/female.wav')) # female_mfcc.shape = (N1, D): N1 vectors of dimension D
male_mfcc = np.array(get_mfcc('./Audio/male.wav'))     # male_mfcc.shape = (N2, D)
Gender.enroll('Female', female_mfcc) # enroll the female audio features
Gender.enroll('Male', male_mfcc) # enroll the male audio features
Gender.train() # train the GMMs with PyCASP
Gender.dump('gender.model') # save the trained model into a file named "gender.model" for future use
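The enrollment pipeline is feature-agnostic: anything that produces an (N, D) matrix per speaker can be enrolled. Here is a minimal sketch of the LPC alternative mentioned above, assuming librosa is available; get_lpc and its frame/hop/order values are hypothetical choices, not part of this notebook:
import librosa

def get_lpc(audio_path, order=12, frame_len=400, hop=160):
    # hypothetical LPC-based extractor; frame size, hop, and order are illustrative
    (sr, sig) = wav.read(audio_path)
    if len(sig.shape) > 1:
        sig = sig[:, 0]
    cleansig = remove_silence(sr, sig).astype(float)
    # slice the signal into overlapping frames; one LPC vector per frame
    frames = librosa.util.frame(cleansig, frame_length=frame_len, hop_length=hop)
    # librosa.lpc returns [1.0, a_1, ..., a_order]; drop the leading 1.0
    return np.array([librosa.lpc(np.ascontiguousarray(frames[:, i]), order=order)[1:]
                     for i in range(frames.shape[1])])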
In [4]:
Gender = SR.GMMRec.load('gender.model') # this is not necessary if you just trained the model
test_mfcc = np.array(get_mfcc('/Users/xuhe/Downloads/SpectrogramInversion1.02b/tapestr_rec.wav')) # test_mfcc.shape = (N3, D)
Gender.predict(test_mfcc) # predict the speaker: result is the most probable speaker label, and log_lkld is the log likelihood that test_mfcc comes from the recognized speaker
Out[4]:
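The notebook does not show the return type of predict, but assuming it is the (result, log_lkld) pair described in the comment above, it can be unpacked for downstream use:
result, log_lkld = Gender.predict(test_mfcc)  # assumed (label, log-likelihood) pair
print('Recognized speaker: %s (log likelihood %.2f)' % (result, log_lkld))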
In [5]:
def totime(secs):
    m, s = divmod(secs, 60)
    h, m = divmod(m, 60)
    return h, m, s

def showresult(recognizer, sig, sr, head):
    cleansig = remove_silence(sr, sig)
    mfcc_vecs = mfcc(cleansig, sr, numcep=15)
    print("%d:%02d:%02d" % totime(head), recognizer.predict(mfcc_vecs))
def recognize(recognizer, audio_path, step=1, duration=1.5):
    (fs, signal) = wav.read(audio_path)
    if len(signal.shape) > 1:
        signal = signal[:, 0]
    head = 0
    totallen = np.round(signal.shape[0] / fs).astype(int)
    print('Recognition results:')
    while head < totallen:
        tail = min(head + duration, totallen)
        # slice indices must be integers, so cast the sample offsets
        signali = signal[int(fs * head) : int(fs * tail)]
        showresult(recognizer, signali, fs, head)
        head += step
In [6]:
recognize(Gender, './Audio/female-male.wav', step=5, duration=5) # scan the mixed recording in non-overlapping 5-second windows