Human voice is constructed from two factors:
Formant is peak of power spectrum of the vocal tract filter. It is important information applied to speech recognition, and voice synthesis. Especially, low-frequency formant frequencies describe features of vowel sounds.
In this example, we show estimation of formants of five kinds of voewl sound /a/, /i/, /ɯ/, /ɘ/, and /ɔ/ by autoregressive (AR) model.
In [1]:
#require "core" ;;
#require "fftw3" ;;
#use "archimedes_iocaml.ml" ;;
#use "wav.ml" ;;
#print_length 50 ;; (* Omit printing too long array *)
In [2]:
open Core ;;
open Core.Caml.Format ;;
let pi = 3.14159265358979 ;;
Out[2]:
We use an audio file of vowel sound /a/ downloaded from Wikipedia. The original data format is OGG, but it is not easy to read and write due to compression. We convert OGG files into (non-compressed) linear-PCM WAV files by FFmpeg.
FFmpeg options:
-ac 1
: output monoral data-ar N
: set $N$ to sample rate-acodec pcm_s16le
: output linear PCM of litte-endian 16-bit signed integers-loglevel warning
: supress noisy logging-y
: always YES (overwrite a file of the same name)
In [3]:
let read_sound_file fname url =
assert(Unix.system (sprintf "curl -o %s -sL %s" fname url) = Ok ()) ;
assert(Unix.system (sprintf "ffmpeg -loglevel warning -y -i %s -ac 1 -acodec pcm_s16le %s.wav" fname fname) = Ok ()) ;
match Wav.from_file (fname ^ ".wav") with
| (_, `Stereo _) -> assert false (* We assume all sound files are monoral! *)
| (sample_rate, `Monoral xs) ->
let txs = Array.mapi xs ~f:(fun i x -> (float i /. float sample_rate, x)) in (* pairs of time and x *)
(sample_rate, txs)
;;
let (sample_rate, raw_signal) =
read_sound_file
"datasets/a.ogg"
"https://upload.wikimedia.org/wikipedia/commons/6/65/Open_front_unrounded_vowel.ogg" ;;
Out[3]:
Out[3]:
In [4]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (time domain)" ;
A.Viewport.xlabel vp "time (sec)" ;
A.Viewport.ylabel vp "amplitude" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines raw_signal ;
A.close vp
Out[4]:
In [5]:
let hamming_window x =
let n = 5000 in
let hamming i = 0.54 -. 0.46 *. Float.cos (2.0 *. pi *. float i /. float (n - 1)) in
Array.sub ~pos:15000 ~len:n x
|> Array.mapi ~f:(fun i (t, x) -> (t, x *. hamming i))
;;
let win_signal = hamming_window raw_signal ;;
Out[5]:
Out[5]:
In [6]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (time domain)" ;
A.Viewport.xlabel vp "time (sec)" ;
A.Viewport.ylabel vp "amplitude" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines win_signal ;
A.close vp
Out[6]:
In [7]:
let fft_spectrum ~sample_rate xs =
let module FFT = Fftw3.D in
let open Bigarray in
let n = Array.length xs in
let c_xs = Array1.create FFT.complex C_layout n in (* source memory for time-domain data *)
let c_zs = Array1.create FFT.complex C_layout n in (* destination memory for frequency-domain data *)
let plan = FFT.Array1.dft FFT.Forward c_xs c_zs in
for i = 0 to n - 1 do
let x = xs.(i) in
c_xs.{i} <- Complex.({ re = x; im = 0.0; })
done ;
FFT.exec plan ; (* Execute FFT *)
Array.init (n / 2)
~f:(fun i ->
let f = float i /. float n *. float sample_rate in (* frequency *)
let db = 10.0 *. Float.log10 (Complex.norm2 c_zs.{i}) in (* Convert into power spectrum (decibels) *)
(f, db))
;;
let spectrum = fft_spectrum ~sample_rate @@ Array.map ~f:snd win_signal ;;
Out[7]:
Out[7]:
In [8]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (frequency domain)" ;
A.Viewport.xlabel vp "frequency (Hz)" ;
A.Viewport.ylabel vp "magnitude (dB)" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines spectrum ;
A.close vp
Out[8]:
Let $x_t \in \mathbb{R}$ be a signal of vowel sound, and $t = 1,\dots,N$ be a time step. Autoregressive (AR) model assumes that $x_t$ is represented by
$$x_t = - \sum^{k}_{i=1} a_i x_{t-i} + \varepsilon_t$$where $k \in \mathbb{N}$ is the AR order, and $\varepsilon_t \sim \mathcal{N}(0, \sigma^2)$ is an error at time $t$. AR model is IIR filter generating $x_t$ from Gaussian noise $\varepsilon_t$, i.e.,
Levinson-Durbin recursion is approach for estimation of AR coefficients $a_i$ by the minimization of $\sigma^2$. The algorithm computes AR($m+1$) coefficients $a^{(m+1)}_i$ and $\sigma^2_{(m+1)}$ from AR($m$) parameters $a^{(m)}_i$ and $\sigma^2_{(m)}$, recusively. The detailed derivation is explained in this tutorial.
First, we obtain $\sigma^2_{(0)}$ under the assumption that $x_t$ is Gaussian noise $\varepsilon_t$:
$$\sigma^2_{(0)} = R(0).$$Second, we recursively calculate $a^{(m+1)}_i$ and $\sigma^2_{(m+1)}$ by
\begin{align} \lambda_{(m+1)} & = - \left( R(m) + \sum^m_{i=1} a_i^{(m)} R(m+1-i) \right) \sigma^{-2}_{(m)}, \\ a^{(m+1)}_i & = a^{(m)}_i + \lambda_{(m+1)} a^{(m)}_{m-i}, \qquad (i = 1,\dots,m) \\ a^{(m+1)}_{m+1} & = \lambda_{(m+1)}, \\ \sigma^2_{(m+1)} & = \left( 1 - \lambda_{(m+1)}^2 \right) \sigma^2_{(m)}. \end{align}
In [9]:
let levinson r =
let n = Array.length r in
if n = 0 then failwith "levinson: empty autocorrelation" ;
let rec aux m ar sigma2 =
if m + 1 = n then (ar, sigma2)
else begin
let lambda = ~-. (Array.foldi ~init:r.(m + 1) ~f:(fun i acc ai -> acc +. ai *. r.(m - i)) ar) /. sigma2 in
let ar' = Array.init ~f:(fun i -> if i = m then lambda else ar.(i) +. lambda *. ar.(m - i - 1)) (m + 1) in
let sigma2' = sigma2 *. (1.0 -. lambda *. lambda) in
aux (m + 1) ar' sigma2'
end
in
aux 0 [||] r.(0)
Out[9]:
where $R(\tau)$ is autocorrelation of $x_t$ given by
$$R(\tau) = \sum^{N-\tau}_{t=1} x_t x_{t+\tau} \quad (\tau \ge 0).$$
In [10]:
let autocorr ~max_lag x =
let n = Array.length x in
Array.init (max_lag + 1)
~f:(fun lag ->
let sum = ref 0.0 in
for i = 0 to n - lag - 1 do sum := !sum +. x.(i) *. x.(i + lag) done ;
!sum)
Out[10]:
In [11]:
let (ar, sigma2) = levinson @@ autocorr ~max_lag:60 @@ Array.map ~f:snd win_signal
Out[11]:
AR model is an IIR filter generating $x_t$ from Gaussian noise $\varepsilon_t$:
$$\varepsilon_t = x_t + \sum^k_{i=1} a_i x_{t-i} = \sum^k_{i=0} a_i x_{t-i}$$where $a_0 = 1$. Let
then
$$E(\omega) = A(\omega) X(\omega)$$thus
$$X(\omega) = \frac{E(\omega)}{A(\omega)}.$$The decibels of power spectrum of $x_t$ is
$$10 \log_{10} |X(\omega)|^2 = 10 \log_{10} \left| \frac{E(\omega)}{A(\omega)} \right|^2 = 10 \log_{10} \frac{\sigma^2}{|A(\omega)|^2} = 10 \log_{10} \sigma^2 - 10 \log_{10} |A(\omega)|^2.$$$|A(\omega)|^2$ is called spectrum envelope of $|X(\omega)|^2$.
In [12]:
let spectrum_envelope ~sample_rate ~n ar sigma2 =
Array.init n ~f:(fun i -> if i = 0 then 1.0 else if i <= Array.length ar then ar.(i - 1) else 0.0)
|> fft_spectrum ~sample_rate
|> Array.map ~f:(fun (f, z) -> (f, 10.0 *. Float.log10 sigma2 -. z))
;;
let spec_env = spectrum_envelope ~sample_rate ~n:(Array.length win_signal) ar sigma2 ;;
Out[12]:
Out[12]:
In [13]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (frequency domain)" ;
A.Viewport.xrange vp 0.0 10000.0 ;
A.Viewport.xlabel vp "frequency (Hz)" ;
A.Viewport.ylabel vp "magnitude (dB)" ;
(* Spectrum: log |X(w)|^2 *)
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines spectrum ;
(* Spectrum envelope: log |A(w)|^2 *)
A.set_color vp A.Color.blue ;
A.Array.xy_pairs vp ~style:`Lines spec_env ;
A.close vp
Out[13]:
In [14]:
(** [extract_peaks z] returns an array of formant frequencies from a given spectrum envlope [z]. *)
let extract_peaks z =
let peaks = ref [] in
for i = 1 to Array.length z - 2 do
if snd z.(i-1) < snd z.(i) && snd z.(i) > snd z.(i+1) then begin
peaks := (fst z.(i)) :: !peaks
end
done ;
List.rev !peaks
;;
let formant = extract_peaks spec_env ;;
Out[14]:
Out[14]:
Plot a vowel chart of five kinds of voewl sound /a/, /i/, /ɯ/, /ɘ/, and /ɔ/.
In [15]:
let formants =
List.filter_map [
"a", "https://upload.wikimedia.org/wikipedia/commons/6/65/Open_front_unrounded_vowel.ogg";
"i", "https://upload.wikimedia.org/wikipedia/commons/9/91/Close_front_unrounded_vowel.ogg";
"w", "https://upload.wikimedia.org/wikipedia/commons/e/e8/Close_back_unrounded_vowel.ogg";
"e", "https://upload.wikimedia.org/wikipedia/commons/e/e0/Mid_front_unrounded_vowel.ogg";
"o", "https://upload.wikimedia.org/wikipedia/commons/0/02/Open-mid_back_rounded_vowel.ogg";
]
~f:(fun (name, url) ->
let (sample_rate, x) = read_sound_file (sprintf "datasets/%s.ogg" name) url in
let x' = hamming_window x in
let (ar, sigma2) = levinson @@ autocorr ~max_lag:60 @@ Array.map ~f:snd x' in
let spec_env = spectrum_envelope ~sample_rate ~n:(Array.length x') ar sigma2 in
match extract_peaks spec_env with
| f1 :: f2 :: _ -> Some (name, f1, f2)
| _ -> None)
Out[15]:
In [16]:
let vp = A.init ["iocaml"] in
A.Axes.box vp ;
A.Viewport.xrange vp 0. 1000. ;
A.Viewport.yrange vp 0. 3000. ;
A.Viewport.xlabel vp "F1 frequency (Hz)" ;
A.Viewport.ylabel vp "F2 frequency (Hz)" ;
A.set_color vp A.Color.red ;
A.List.xy_pairs vp ~style:(`Markers "x") (List.map formants ~f:(fun (_, f1, f2) -> (f1, f2))) ;
A.set_color vp A.Color.black ;
List.iter formants ~f:(fun (name, f1, f2) -> A.Viewport.text vp (f1 +. 20.0) f2 name) ;
A.close vp
Out[16]:
In [ ]: