Formant estimation by AR model

Human voice is constructed from two factors:

  • glottal source, buzz-like sound generated by vocal cords, and
  • vocal tract filtering by lips, a throat, a nasal cavity, etc.

Formant is peak of power spectrum of the vocal tract filter. It is important information applied to speech recognition, and voice synthesis. Especially, low-frequency formant frequencies describe features of vowel sounds.

In this example, we show estimation of formants of five kinds of voewl sound /a/, /i/, /ɯ/, /ɘ/, and /ɔ/ by autoregressive (AR) model.


In [1]:
#require "core" ;;
#require "fftw3" ;;
#use "archimedes_iocaml.ml" ;;
#use "wav.ml" ;;
#print_length 50 ;; (* Omit printing too long array *)


/home/opam/.opam/4.04.1/lib/base/caml: added to search path
/home/opam/.opam/4.04.1/lib/base/caml/caml.cma: loaded
/home/opam/.opam/4.04.1/lib/base/shadow_stdlib: added to search path
/home/opam/.opam/4.04.1/lib/base/shadow_stdlib/shadow_stdlib.cma: loaded
/home/opam/.opam/4.04.1/lib/sexplib/0: added to search path
/home/opam/.opam/4.04.1/lib/sexplib/0/sexplib0.cma: loaded
/home/opam/.opam/4.04.1/lib/base: added to search path
/home/opam/.opam/4.04.1/lib/base/base.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/unix.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/bigarray.cma: loaded
/home/opam/.opam/4.04.1/lib/fieldslib: added to search path
/home/opam/.opam/4.04.1/lib/fieldslib/fieldslib.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_compare/runtime-lib: added to search path
/home/opam/.opam/4.04.1/lib/ppx_compare/runtime-lib/ppx_compare_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/sexplib: added to search path
/home/opam/.opam/4.04.1/lib/sexplib/sexplib.cma: loaded
/home/opam/.opam/4.04.1/lib/variantslib: added to search path
/home/opam/.opam/4.04.1/lib/variantslib/variantslib.cma: loaded
/home/opam/.opam/4.04.1/lib/bin_prot/shape: added to search path
/home/opam/.opam/4.04.1/lib/bin_prot/shape/bin_shape_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/bin_prot: added to search path
/home/opam/.opam/4.04.1/lib/bin_prot/bin_prot.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_hash/runtime-lib: added to search path
/home/opam/.opam/4.04.1/lib/ppx_hash/runtime-lib/ppx_hash_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_inline_test/config: added to search path
/home/opam/.opam/4.04.1/lib/ppx_inline_test/config/inline_test_config.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_inline_test/runtime-lib: added to search path
/home/opam/.opam/4.04.1/lib/ppx_inline_test/runtime-lib/ppx_inline_test_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/core_kernel/base_for_tests: added to search path
/home/opam/.opam/4.04.1/lib/core_kernel/base_for_tests/base_for_tests.cma: loaded
/home/opam/.opam/4.04.1/lib/jane-street-headers: added to search path
/home/opam/.opam/4.04.1/lib/jane-street-headers/jane_street_headers.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/nums.cma: loaded
/home/opam/.opam/4.04.1/lib/num-top: added to search path
/home/opam/.opam/4.04.1/lib/num-top/num_top.cma: loaded
/home/opam/.opam/4.04.1/lib/num: added to search path
/home/opam/.opam/4.04.1/lib/ppx_assert/runtime-lib: added to search path
/home/opam/.opam/4.04.1/lib/ppx_assert/runtime-lib/ppx_assert_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_bench/runtime-lib: added to search path
/home/opam/.opam/4.04.1/lib/ppx_bench/runtime-lib/ppx_bench_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_expect/common: added to search path
/home/opam/.opam/4.04.1/lib/ppx_expect/common/expect_test_common.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_expect/config: added to search path
/home/opam/.opam/4.04.1/lib/ppx_expect/config/expect_test_config.cma: loaded
/home/opam/.opam/4.04.1/lib/ppx_expect/collector: added to search path
/home/opam/.opam/4.04.1/lib/ppx_expect/collector/expect_test_collector.cma: loaded
/home/opam/.opam/4.04.1/lib/stdio: added to search path
/home/opam/.opam/4.04.1/lib/stdio/stdio.cma: loaded
/home/opam/.opam/4.04.1/lib/typerep: added to search path
/home/opam/.opam/4.04.1/lib/typerep/typerep_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/core_kernel: added to search path
/home/opam/.opam/4.04.1/lib/core_kernel/core_kernel.cma: loaded
/home/opam/.opam/4.04.1/lib/sexplib/unix: added to search path
/home/opam/.opam/4.04.1/lib/sexplib/unix/sexplib_unix.cma: loaded
/home/opam/.opam/4.04.1/lib/spawn: added to search path
/home/opam/.opam/4.04.1/lib/spawn/spawn.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/threads: added to search path
/home/opam/.opam/4.04.1/lib/core: added to search path
/home/opam/.opam/4.04.1/lib/core/core.cma: loaded
/home/opam/.opam/4.04.1/lib/fftw3: added to search path
/home/opam/.opam/4.04.1/lib/fftw3/fftw3.cma: loaded
/home/opam/.opam/4.04.1/lib/easy-format: added to search path
/home/opam/.opam/4.04.1/lib/easy-format/easy_format.cmo: loaded
/home/opam/.opam/4.04.1/lib/biniou: added to search path
/home/opam/.opam/4.04.1/lib/biniou/biniou.cma: loaded
/home/opam/.opam/4.04.1/lib/yojson: added to search path
/home/opam/.opam/4.04.1/lib/yojson/yojson.cmo: loaded
/home/opam/.opam/4.04.1/lib/ocaml/str.cma: loaded
/home/opam/.opam/4.04.1/lib/atd: added to search path
/home/opam/.opam/4.04.1/lib/atd/atd.cma: loaded
/home/opam/.opam/4.04.1/lib/atdgen: added to search path
/home/opam/.opam/4.04.1/lib/atdgen/atdgen.cma: loaded
/home/opam/.opam/4.04.1/lib/bytes: added to search path
/home/opam/.opam/4.04.1/lib/result: added to search path
/home/opam/.opam/4.04.1/lib/result/result.cma: loaded
/home/opam/.opam/4.04.1/lib/lwt: added to search path
/home/opam/.opam/4.04.1/lib/lwt/lwt.cma: loaded
/home/opam/.opam/4.04.1/lib/lwt/lwt-log.cma: loaded
/home/opam/.opam/4.04.1/lib/lwt/lwt-unix.cma: loaded
/home/opam/.opam/4.04.1/lib/ctypes: added to search path
/home/opam/.opam/4.04.1/lib/ctypes/ctypes.cma: loaded
/home/opam/.opam/4.04.1/lib/ctypes/ctypes-top.cma: loaded
/home/opam/.opam/4.04.1/lib/ctypes/ctypes-foreign-base.cma: loaded
/home/opam/.opam/4.04.1/lib/ctypes/ctypes-foreign-unthreaded.cma: loaded
/home/opam/.opam/4.04.1/lib/iocaml-kernel: added to search path
/home/opam/.opam/4.04.1/lib/iocaml-kernel/iocaml_lib.cma: loaded
/home/opam/.opam/4.04.1/lib/cairo2: added to search path
/home/opam/.opam/4.04.1/lib/cairo2/cairo2.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/dynlink.cma: loaded
/home/opam/.opam/4.04.1/lib/ocaml/camlp4: added to search path
/home/opam/.opam/4.04.1/lib/archimedes: added to search path
/home/opam/.opam/4.04.1/lib/archimedes/archimedes_internals.cma: loaded
/home/opam/.opam/4.04.1/lib/archimedes/archimedes_toploop.cma: loaded
Module Archimedes loaded and aliased as A.
/home/opam/.opam/4.04.1/lib/archimedes/archimedes_cairo.cma: loaded
module Archimedes_iocaml : sig  end
module Wav :
  sig
    type t = [ `Monoral of float array | `Stereo of (float * float) array ]
    val from_string : string -> int * t
    val from_channel : in_channel -> int * t
    val from_file : string -> int * t
    val to_file : ?sample_bits:int -> sample_rate:int -> string -> t -> unit
  end

In [2]:
open Core ;;
open Core.Caml.Format ;;

let pi = 3.14159265358979 ;;


Out[2]:
val pi : float = 3.14159265358979

Preliminaries

We use an audio file of vowel sound /a/ downloaded from Wikipedia. The original data format is OGG, but it is not easy to read and write due to compression. We convert OGG files into (non-compressed) linear-PCM WAV files by FFmpeg.

FFmpeg options:

  • -ac 1: output monoral data
  • -ar N: set $N$ to sample rate
  • -acodec pcm_s16le: output linear PCM of litte-endian 16-bit signed integers
  • -loglevel warning: supress noisy logging
  • -y: always YES (overwrite a file of the same name)

In [3]:
let read_sound_file fname url =
  assert(Unix.system (sprintf "curl -o %s -sL %s" fname url) = Ok ()) ;
  assert(Unix.system (sprintf "ffmpeg -loglevel warning -y -i %s -ac 1 -acodec pcm_s16le %s.wav" fname fname) = Ok ()) ;
  match Wav.from_file (fname ^ ".wav") with
  | (_, `Stereo _) -> assert false (* We assume all sound files are monoral! *)
  | (sample_rate, `Monoral xs) ->
    let txs = Array.mapi xs ~f:(fun i x -> (float i /. float sample_rate, x)) in (* pairs of time and x *)
    (sample_rate, txs)
;;

let (sample_rate, raw_signal) =
  read_sound_file
    "datasets/a.ogg"
    "https://upload.wikimedia.org/wikipedia/commons/6/65/Open_front_unrounded_vowel.ogg" ;;


Out[3]:
val read_sound_file : string -> string -> int * (float * float) Core.Array.t =
  <fun>
Out[3]:
val sample_rate : int = 44100
val raw_signal : (float * float) Core.Array.t =
  [|(0., 0.); (2.26757369614512476e-05, 0.001129150390625);
    (4.53514739229024953e-05, 0.002197265625);
    (6.80272108843537395e-05, 0.002960205078125);
    (9.07029478458049905e-05, 0.002655029296875);
    (0.000113378684807256242, 0.001312255859375);
    (0.000136054421768707479, 0.0008544921875);
    (0.00015873015873015873, 0.001007080078125);
    (0.000181405895691609981, 0.000274658203125);
    (0.000204081632653061232, -0.0010986328125);
    (0.000226757369614512483, -0.0018310546875);
    (0.000249433106575963734, -0.00164794921875);
    (0.000272108843537414958, -0.0013427734375);
    (0.000294784580498866236, -0.001861572265625);
    (0.00031746031746031746, -0.002838134765625);
    (0.000340136054421768684, ...); ...|]

In [4]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (time domain)" ;
A.Viewport.xlabel vp "time (sec)" ;
A.Viewport.ylabel vp "amplitude" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines raw_signal ;
A.close vp


Out[4]:
- : unit = ()

Preprocessing

In this example, we extract a sub-array of index 15000 to 20000, and apply the hamming window:

$$w(i) = 0.54 - 0.46 \cos\left( \frac{2 \pi i}{N-1} \right).$$

In [5]:
let hamming_window x =
  let n = 5000 in
  let hamming i = 0.54 -. 0.46 *. Float.cos (2.0 *. pi *. float i /. float (n - 1)) in
  Array.sub ~pos:15000 ~len:n x
  |> Array.mapi ~f:(fun i (t, x) -> (t, x *. hamming i))
;;

let win_signal = hamming_window raw_signal ;;


Out[5]:
val hamming_window : ('a * float) Core.Array.t -> ('a * float) Core.Array.t =
  <fun>
Out[5]:
val win_signal : (float * float) Core.Array.t =
  [|(0.340136054421768697, 0.00372802734375000076);
    (0.34015873015873016, 0.00268800048964651);
    (0.340181405895691624, 0.0014087169984098708);
    (0.340204081632653088, 0.000361342894815645202);
    (0.340226757369614496, -0.0002954316234697735);
    (0.34024943310657596, -0.000712971570515843079);
    (0.340272108843537424, -0.00116474119143974964);
    (0.340294784580498888, -0.00167029351353935166);
    (0.340317460317460296, -0.00214661988108693367);
    (0.34034013605442176, -0.00267432332451698548);
    (0.340362811791383224, -0.00321190733248099495);
    (0.340385487528344688, -0.00355908373792880291);
    (0.340408163265306096, -0.00375000954121899443);
    (0.34043083900226756, -0.00398254665853326785);
    (0.340453514739229, -0.0044522019086400029); (0.340476190476190488, ...);
    ...|]

In [6]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (time domain)" ;
A.Viewport.xlabel vp "time (sec)" ;
A.Viewport.ylabel vp "amplitude" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines win_signal ;
A.close vp


Out[6]:
- : unit = ()

Power spectrum


In [7]:
let fft_spectrum ~sample_rate xs =
  let module FFT = Fftw3.D in
  let open Bigarray in
  let n = Array.length xs in
  let c_xs = Array1.create FFT.complex C_layout n in (* source memory for time-domain data *)
  let c_zs = Array1.create FFT.complex C_layout n in (* destination memory for frequency-domain data *)
  let plan = FFT.Array1.dft FFT.Forward c_xs c_zs in
  for i = 0 to n - 1 do
    let x = xs.(i) in
    c_xs.{i} <- Complex.({ re = x; im = 0.0; })
  done ;
  FFT.exec plan ; (* Execute FFT *)
  Array.init (n / 2)
    ~f:(fun i ->
        let f = float i /. float n *. float sample_rate in (* frequency *)
        let db = 10.0 *. Float.log10 (Complex.norm2 c_zs.{i}) in (* Convert into power spectrum (decibels) *)
        (f, db))
;;
let spectrum = fft_spectrum ~sample_rate @@ Array.map ~f:snd win_signal ;;


Out[7]:
val fft_spectrum :
  sample_rate:int -> float Core.Array.t -> (float * float) Core.Array.t =
  <fun>
Out[7]:
val spectrum : (float * float) Core.Array.t =
  [|(0., -16.3981659621761686); (8.82, 10.54702919071614);
    (17.64, 12.9381493850392832); (26.4599999999999973, 3.10820173803326849);
    (35.28, 10.1225596086416267); (44.1, 15.3572463641570671);
    (52.9199999999999946, 6.31669095414016724); (61.74, 12.5306958973086893);
    (70.56, 13.579175394292422); (79.38, 13.6013498073354118);
    (88.2, 3.01461225733947247); (97.0200000000000102, 3.78326595980915581);
    (105.839999999999989, 5.12572133070504599);
    (114.66, -16.5298978038182725); (123.48, 5.31665820364177222);
    (132.3, ...); ...|]

In [8]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (frequency domain)" ;
A.Viewport.xlabel vp "frequency (Hz)" ;
A.Viewport.ylabel vp "magnitude (dB)" ;
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines spectrum ;
A.close vp


Out[8]:
- : unit = ()

Auroregressive model

Let $x_t \in \mathbb{R}$ be a signal of vowel sound, and $t = 1,\dots,N$ be a time step. Autoregressive (AR) model assumes that $x_t$ is represented by

$$x_t = - \sum^{k}_{i=1} a_i x_{t-i} + \varepsilon_t$$

where $k \in \mathbb{N}$ is the AR order, and $\varepsilon_t \sim \mathcal{N}(0, \sigma^2)$ is an error at time $t$. AR model is IIR filter generating $x_t$ from Gaussian noise $\varepsilon_t$, i.e.,

  • $a_i$ is voice tract filter coefficients,
  • $\varepsilon_t$ is a glottal source, and
  • $x_t$ is observed voice sound.

Levinson-Durvin recursion

Levinson-Durbin recursion is approach for estimation of AR coefficients $a_i$ by the minimization of $\sigma^2$. The algorithm computes AR($m+1$) coefficients $a^{(m+1)}_i$ and $\sigma^2_{(m+1)}$ from AR($m$) parameters $a^{(m)}_i$ and $\sigma^2_{(m)}$, recusively. The detailed derivation is explained in this tutorial.

First, we obtain $\sigma^2_{(0)}$ under the assumption that $x_t$ is Gaussian noise $\varepsilon_t$:

$$\sigma^2_{(0)} = R(0).$$

Second, we recursively calculate $a^{(m+1)}_i$ and $\sigma^2_{(m+1)}$ by

\begin{align} \lambda_{(m+1)} & = - \left( R(m) + \sum^m_{i=1} a_i^{(m)} R(m+1-i) \right) \sigma^{-2}_{(m)}, \\ a^{(m+1)}_i & = a^{(m)}_i + \lambda_{(m+1)} a^{(m)}_{m-i}, \qquad (i = 1,\dots,m) \\ a^{(m+1)}_{m+1} & = \lambda_{(m+1)}, \\ \sigma^2_{(m+1)} & = \left( 1 - \lambda_{(m+1)}^2 \right) \sigma^2_{(m)}. \end{align}

In [9]:
let levinson r =
  let n = Array.length r in
  if n = 0 then failwith "levinson: empty autocorrelation" ;
  let rec aux m ar sigma2 =
    if m + 1 = n then (ar, sigma2)
    else begin
      let lambda = ~-. (Array.foldi ~init:r.(m + 1) ~f:(fun i acc ai -> acc +. ai *. r.(m - i)) ar) /. sigma2 in
      let ar' = Array.init ~f:(fun i -> if i = m then lambda else ar.(i) +. lambda *. ar.(m - i - 1)) (m + 1) in
      let sigma2' = sigma2 *. (1.0 -. lambda *. lambda) in
      aux (m + 1) ar' sigma2'
    end
  in
  aux 0 [||] r.(0)


Out[9]:
val levinson : float Core.Array.t -> float Core.Array.t * float = <fun>

where $R(\tau)$ is autocorrelation of $x_t$ given by

$$R(\tau) = \sum^{N-\tau}_{t=1} x_t x_{t+\tau} \quad (\tau \ge 0).$$

In [10]:
let autocorr ~max_lag x =
  let n = Array.length x in
  Array.init (max_lag + 1)
    ~f:(fun lag ->
        let sum = ref 0.0 in
        for i = 0 to n - lag - 1 do sum := !sum +. x.(i) *. x.(i + lag) done ;
        !sum)


Out[10]:
val autocorr : max_lag:int -> float Core.Array.t -> float Core.Array.t =
  <fun>

In [11]:
let (ar, sigma2) = levinson @@ autocorr ~max_lag:60 @@ Array.map ~f:snd win_signal


Out[11]:
val ar : float Core.Array.t =
  [|-3.15488819100091433; 4.39693011682255186; -4.03894703717712478;
    3.51162541376807491; -3.42527195508538274; 3.19806748888310555;
    -2.84333748436180533; 2.27983966767366919; -1.13920665663205334;
    0.300301176361091771; 0.0431275062420504174; -0.771292371371321894;
    1.399309995860061; -1.16832669639398556; 0.678766574647057;
    -0.558863651769131442; 0.43623797325028113; -0.0389132567919332206;
    -0.290126910165048379; 0.640423369276837; -0.884529470231625758;
    0.721720695613261132; -0.406037068450769922; 0.0696315933770184775;
    0.176825152668555252; -0.267361992363075729; 0.333013962989415824;
    -0.429283357554527956; 0.467182831773884144; -0.391132621636673505;
    0.199143209026254153; 0.167622288490863469; -0.441859482161495509;
    0.331907241425724919; -0.262734093432248628; 0.329044088095172216;
    -0.185777536036301399; 0.0133290776647882955; 0.116232212559062159;
    -0.265295768004786159; 0.347166881308671293; -0.266808004893802631;
    0.188329635104868554; -0.188419404977809291; 0.0418982708136971;
    0.1382993053268185; -0.284082830150884313; ...|]
val sigma2 : float = 0.00419180662910843826

Spectrum envelope

AR model is an IIR filter generating $x_t$ from Gaussian noise $\varepsilon_t$:

$$\varepsilon_t = x_t + \sum^k_{i=1} a_i x_{t-i} = \sum^k_{i=0} a_i x_{t-i}$$

where $a_0 = 1$. Let

  • $A(\omega)$ be a voice tract filter characteristics, i.e., a spectrum of $a_i$,
  • $E(\omega)$ be a spectrum of glottal source $\varepsilon_t$, and
  • $X(\omega)$ is a spectrum of observed voice sound $x_t$,

then

$$E(\omega) = A(\omega) X(\omega)$$

thus

$$X(\omega) = \frac{E(\omega)}{A(\omega)}.$$

The decibels of power spectrum of $x_t$ is

$$10 \log_{10} |X(\omega)|^2 = 10 \log_{10} \left| \frac{E(\omega)}{A(\omega)} \right|^2 = 10 \log_{10} \frac{\sigma^2}{|A(\omega)|^2} = 10 \log_{10} \sigma^2 - 10 \log_{10} |A(\omega)|^2.$$

$|A(\omega)|^2$ is called spectrum envelope of $|X(\omega)|^2$.


In [12]:
let spectrum_envelope ~sample_rate ~n ar sigma2 =
  Array.init n ~f:(fun i -> if i = 0 then 1.0 else if i <= Array.length ar then ar.(i - 1) else 0.0)
  |> fft_spectrum ~sample_rate
  |> Array.map ~f:(fun (f, z) -> (f, 10.0 *. Float.log10 sigma2 -. z))
;;
let spec_env = spectrum_envelope ~sample_rate ~n:(Array.length win_signal) ar sigma2 ;;


Out[12]:
val spectrum_envelope :
  sample_rate:int ->
  n:int -> float Core.Array.t -> Core.Float.t -> (float * float) Core.Array.t =
  <fun>
Out[12]:
val spec_env : (float * float) Core.Array.t =
  [|(0., 15.5233067810192189); (8.82, 15.5224565835321222);
    (17.64, 15.5199169330780045); (26.4599999999999973, 15.5157205136918925);
    (35.28, 15.5099213292569296); (44.1, 15.5025940123619925);
    (52.9199999999999946, 15.4938328823594524); (61.74, 15.4837507768561231);
    (70.56, 15.4724776859202642); (79.38, 15.4601592220876221);
    (88.2, 15.446954961746723); (97.0200000000000102, 15.4330366944853488);
    (105.839999999999989, 15.4185866168986827);
    (114.66, 15.4037955059087501); (123.48, 15.3888609042499205);
    (132.3, ...); ...|]

In [13]:
let vp = A.init ~w:800. ~h:250. ["iocaml"] in
A.Axes.box vp ;
A.Viewport.title vp "/a/ (frequency domain)" ;
A.Viewport.xrange vp 0.0 10000.0 ;
A.Viewport.xlabel vp "frequency (Hz)" ;
A.Viewport.ylabel vp "magnitude (dB)" ;

(* Spectrum: log |X(w)|^2 *)
A.set_color vp A.Color.red ;
A.Array.xy_pairs vp ~style:`Lines spectrum ;

(* Spectrum envelope: log |A(w)|^2 *)
A.set_color vp A.Color.blue ;
A.Array.xy_pairs vp ~style:`Lines spec_env ;

A.close vp


Out[13]:
- : unit = ()

Formant frequency

Formant frequency is a peak of a spectrum envelope.


In [14]:
(** [extract_peaks z] returns an array of formant frequencies from a given spectrum envlope [z]. *)
let extract_peaks z =
  let peaks = ref [] in
  for i = 1 to Array.length z - 2 do
    if snd z.(i-1) < snd z.(i) && snd z.(i) > snd z.(i+1) then begin
      peaks := (fst z.(i)) :: !peaks
    end
  done ;
  List.rev !peaks
;;

let formant = extract_peaks spec_env ;;


Out[14]:
val extract_peaks : ('a * 'b) Core.Array.t -> 'a Core.List.t = <fun>
Out[14]:
val formant : float Core.List.t =
  [943.739999999999895; 1323.; 2672.46; 3607.38; 4921.56; 5486.04;
   5856.48000000000047; 6473.88000000000102; 7205.94; 8370.18; 10125.36;
   10725.12; 12753.7200000000012; 14050.26; 15796.62; 16511.04; 17066.7;
   17745.84; 18363.239999999998; 19580.4; 20427.12; 21229.739999999998]

Vowel chart

Plot a vowel chart of five kinds of voewl sound /a/, /i/, /ɯ/, /ɘ/, and /ɔ/.


In [15]:
let formants = 
  List.filter_map [
      "a", "https://upload.wikimedia.org/wikipedia/commons/6/65/Open_front_unrounded_vowel.ogg";
      "i", "https://upload.wikimedia.org/wikipedia/commons/9/91/Close_front_unrounded_vowel.ogg";
      "w", "https://upload.wikimedia.org/wikipedia/commons/e/e8/Close_back_unrounded_vowel.ogg";
      "e", "https://upload.wikimedia.org/wikipedia/commons/e/e0/Mid_front_unrounded_vowel.ogg";
      "o", "https://upload.wikimedia.org/wikipedia/commons/0/02/Open-mid_back_rounded_vowel.ogg";
    ]
  ~f:(fun (name, url) ->
      let (sample_rate, x) = read_sound_file (sprintf "datasets/%s.ogg" name) url in
      let x' = hamming_window x in
      let (ar, sigma2) = levinson @@ autocorr ~max_lag:60 @@ Array.map ~f:snd x' in
      let spec_env = spectrum_envelope ~sample_rate ~n:(Array.length x') ar sigma2 in
      match extract_peaks spec_env with
      | f1 :: f2 :: _ -> Some (name, f1, f2)
      | _ -> None)


Out[15]:
val formants : (string * float * float) Core.List.t =
  [("a", 943.739999999999895, 1323.);
   ("i", 211.679999999999978, 2584.25999999999976); ("w", 246.96, 1111.32);
   ("e", 299.88, 1852.2); ("o", 432.18, 590.94)]

In [16]:
let vp = A.init ["iocaml"] in
A.Axes.box vp ;
A.Viewport.xrange vp 0. 1000. ;
A.Viewport.yrange vp 0. 3000. ;
A.Viewport.xlabel vp "F1 frequency (Hz)" ;
A.Viewport.ylabel vp "F2 frequency (Hz)" ;

A.set_color vp A.Color.red ;
A.List.xy_pairs vp ~style:(`Markers "x") (List.map formants ~f:(fun (_, f1, f2) -> (f1, f2))) ;

A.set_color vp A.Color.black ;
List.iter formants ~f:(fun (name, f1, f2) -> A.Viewport.text vp (f1 +. 20.0) f2 name) ;

A.close vp


Out[16]:
- : unit = ()

In [ ]: