ASR TensorFlow Testbed


In [1]:
# Run the demo routine of the `audio` module (presumably a project-local
# module -- confirm). The call prints its diagnostics to stdout; the text
# captured below this cell is that output.
import audio
audio.example()


The file is 23248 samples long
The sample rate is 16000 Hz
The file is 1453 milliseconds long
(1, 144, 14, 1)
magspec (144, 257)
spe (23520, 1)
sof (23248, 1)
(1, 23248, 2)
[23248]
[  1 144 400   2]
[  1 144 400   2]
[  1 144   2]
[  1 144 257   2]
The DC offset was: [[[  3.14912476e-05  -6.29824952e-05]]]
The frame length was: 400
The frame shift was: 160
The padding was: 272
The total padding was: 272
There were 144 frames
The padded length was 23520 samples
The framed audio was [[[[ -5.80824418e-04   1.16164884e-03]
   [ -7.02898456e-04   1.40579691e-03]
   [ -1.25223163e-03   2.50446325e-03]
   ..., 
   [ -9.72738107e-07   1.94547621e-06]
   [ -3.06157833e-04   6.12315666e-04]
   [ -2.45120814e-04   4.90241628e-04]]

  [[ -7.94453984e-04   1.58890797e-03]
   [ -6.41861437e-04   1.28372287e-03]
   [ -5.19787399e-04   1.03957480e-03]
   ..., 
   [ -1.95415734e-03   3.90831469e-03]
   [ -1.92363884e-03   3.84727767e-03]
   [ -1.86260182e-03   3.72520363e-03]]

  [[  1.46391572e-03  -2.92783143e-03]
   [  1.58598975e-03  -3.17197951e-03]
   [  1.67754528e-03  -3.35509057e-03]
   ..., 
   [ -9.77565041e-04   1.95513008e-03]
   [ -8.24972494e-04   1.64994499e-03]
   [ -6.11342928e-04   1.22268586e-03]]

  ..., 
  [[ -7.53904458e-03   1.50780892e-02]
   [ -6.40985973e-03   1.28197195e-02]
   [ -6.31830420e-03   1.26366084e-02]
   ..., 
   [ -2.56060022e-02   5.12120044e-02]
   [ -2.90240752e-02   5.80481505e-02]
   [ -3.10688154e-02   6.21376308e-02]]

  [[  1.38400468e-01  -2.76800935e-01]
   [  1.33761654e-01  -2.67523309e-01]
   [  1.32388321e-01  -2.64776643e-01]
   ..., 
   [  7.46473014e-02  -1.49294603e-01]
   [  7.23278947e-02  -1.44655789e-01]
   [  7.06493767e-02  -1.41298753e-01]]

  [[  3.47901281e-02  -6.95802561e-02]
   [  3.60719055e-02  -7.21438109e-02]
   [  3.56446463e-02  -7.12892927e-02]
   ..., 
   [  0.00000000e+00   0.00000000e+00]
   [  0.00000000e+00   0.00000000e+00]
   [  0.00000000e+00   0.00000000e+00]]]]
The framed audio was [[[[ -5.80824418e-04   1.16164884e-03]
   [ -1.39498770e-04   2.78997541e-04]
   [ -5.70420124e-04   1.14084025e-03]
   ..., 
   [  2.07191497e-04  -4.14382994e-04]
   [ -3.05214277e-04   6.10428554e-04]
   [  5.18522840e-05  -1.03704568e-04]]

  [[ -2.31054299e-04   4.62108598e-04]
   [  1.28758928e-04  -2.57517856e-04]
   [  1.02818195e-04  -2.05636390e-04]
   ..., 
   [  2.07801867e-04  -4.15603735e-04]
   [ -2.81062109e-05   5.62124217e-05]
   [  3.32785390e-06  -6.65570780e-06]]

  [[  6.95182464e-04  -1.39036493e-03]
   [  1.65991509e-04  -3.31983019e-04]
   [  1.39135221e-04  -2.78270442e-04]
   ..., 
   [ -8.85328596e-05   1.77065719e-04]
   [  1.23265596e-04  -2.46531192e-04]
   [  1.88880392e-04  -3.77760783e-04]]

  ..., 
  [[  9.94611587e-05  -1.98922317e-04]
   [  9.03013513e-04  -1.80602703e-03]
   [ -1.00740263e-04   2.01480527e-04]
   ..., 
   [ -2.75157800e-03   5.50315599e-03]
   [ -4.18625313e-03   8.37250625e-03]
   [ -2.91546239e-03   5.83092478e-03]]

  [[  1.96139542e-03  -3.92279084e-03]
   [ -4.86799408e-04   9.73598817e-04]
   [  2.63951670e-03  -5.27903340e-03]
   ..., 
   [  7.84033872e-05  -1.56806774e-04]
   [ -7.99876770e-05   1.59975354e-04]
   [  4.91318820e-04  -9.82637641e-04]]

  [[  2.10941019e-03  -4.21882039e-03]
   [  2.32548124e-03  -4.65096248e-03]
   [  6.54898031e-04  -1.30979606e-03]
   ..., 
   [  0.00000000e+00   0.00000000e+00]
   [  0.00000000e+00   0.00000000e+00]
   [  0.00000000e+00   0.00000000e+00]]]]
The framed audio energy for the 50th frame was [ 0.83252431  2.21881867]
The framed audio energy for the 50th frame was [-1.13134505  0.25494931]
Average s_of error: %.f 1.68444613283e-12
Average s_pe error: %.f 6.26711224067e-08
There were 2 frames
Spec: [[[[-55.98958051 -49.9689806 ]
   [-47.58722305 -41.56662314]
   [-50.10849483 -44.08789492]
   ..., 
   [-54.64914964 -48.62854973]
   [-53.85024177 -47.82964185]
   [-55.32373949 -49.30313958]]

  [[-62.4307727  -56.41017279]
   [-48.91753461 -42.89693469]
   [-58.76472313 -52.74412322]
   ..., 
   [-62.54247586 -56.52187595]
   [-65.68224616 -59.66164625]
   [-62.11119467 -56.09059475]]

  [[-42.37092751 -36.3503276 ]
   [-43.62969315 -37.60909324]
   [-46.66933316 -40.64873324]
   ..., 
   [-61.5389449  -55.51834498]
   [-61.6454395  -55.62483959]
   [-75.22067826 -69.20007835]]

  ..., 
  [[-21.66504185 -15.64444194]
   [-15.37637416  -9.35577425]
   [ -9.37091478  -3.35031487]
   ..., 
   [-41.16453324 -35.14393333]
   [-58.47637546 -52.45577555]
   [-54.93463645 -48.91403654]]

  [[-18.1644427  -12.14384278]
   [-18.86583411 -12.8452342 ]
   [-11.9306115   -5.91001159]
   ..., 
   [-43.13261264 -37.11201273]
   [-45.81837683 -39.79777692]
   [-48.33789855 -42.31729864]]

  [[-18.03328712 -12.0126872 ]
   [-13.4938477   -7.47324779]
   [ -9.67952555  -3.65892564]
   ..., 
   [-48.01533996 -41.99474004]
   [-51.46253751 -45.4419376 ]
   [-56.08418498 -50.06358507]]]]
Spec size: (1, 144, 257, 2)

In [2]:
import numpy as np
import specplotting
import matplotlib.pyplot as plt
# Load the reference spectrograms saved in ./scratch/kaldispec.npy.
# The lookup by string key below implies the file stores a pickled dict wrapped
# in a 0-d object array. NumPy >= 1.16.3 refuses to load object arrays unless
# allow_pickle=True is passed explicitly, so without it this cell raises
# ValueError on a fresh kernel.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced locally or come from a trusted source.
d = np.load("./scratch/kaldispec.npy", allow_pickle=True)

# .item() unwraps the 0-d object array into the Python object it holds.
# "gasstation" is presumably an utterance/recording id -- confirm against the
# script that wrote the file.
spec = d.item()["gasstation"]

# Plot the selected spectrogram and render the figure.
specplotting.plot_spec(spec, sample_rate=16000, title="Kaldi Spectrogram")
plt.show()