In [34]:
from keras.datasets import mnist
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

In [43]:
# Training hyperparameters shared by the generator and model cells below.
epochs = 20        # number of passes requested from fit_generator
num_classes = 10   # width of the softmax output layer / one-hot labels
batch_size = 32    # samples per batch yielded by the generator

In [44]:
def my_generator():
    """Yield MNIST training batches forever, in the form Keras generators expect.

    MNIST stands in for the planned audio data (see the TODOs) so that the
    generator / ``fit_generator`` plumbing can be exercised first.

    Reads the notebook-level ``batch_size`` and ``num_classes`` settings.
    The number of batches per pass is derived from the data, so changing
    ``batch_size`` no longer produces empty or mis-sized batches. A trailing
    partial batch (when the sample count is not a multiple of ``batch_size``)
    is dropped so every batch has the same size.

    Yields:
        tuple: ``(data_batch, label_batch)`` -- a float32 array of flattened
        images scaled to [0, 1], and the matching one-hot label array with
        ``num_classes`` columns.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is larger than the
            training set (no full batch could ever be produced).
    """
    # TODO: load wav data from the audio-file directory instead of MNIST.
    # The base (pre-augmentation) audio files could probably all be kept in memory.
    (X_train, y_train), _ = mnist.load_data()  # the test split is not needed here

    # Flatten each image to a vector and scale pixel values to [0, 1].
    X_train = X_train.reshape(len(X_train), -1).astype('float32') / 255
    y_train = np_utils.to_categorical(y_train, num_classes)

    # Number of full batches per pass over the data (60000 // 32 == 1875).
    steps = len(X_train) // batch_size
    if steps == 0:
        # Without this guard the loop below would spin forever yielding nothing.
        raise ValueError('batch_size is larger than the number of training samples')

    # Yield (data, labels) tuples one batch at a time, cycling endlessly.
    while True:
        for i in range(steps):
            data_batch = X_train[i * batch_size:(i + 1) * batch_size]
            label_batch = y_train[i * batch_size:(i + 1) * batch_size]

            # TODO: audio augmentation (e.g. adding noise) + feature extraction goes here.

            yield data_batch, label_batch

In [45]:
gen = my_generator()

In [46]:
gen


Out[46]:
<generator object my_generator at 0x13b924620>

In [47]:
# Pull one batch from the generator to inspect it (next() is the idiomatic
# way to advance a generator; it calls gen.__next__() under the hood).
images, labels = next(gen)

In [48]:
images.shape


Out[48]:
(32, 784)

In [49]:
labels.shape


Out[49]:
(32, 10)

In [51]:
# Fully-connected classifier over flattened 784-value inputs: two 512-unit
# ReLU layers, each followed by 20% dropout, then a softmax over num_classes.
model = Sequential([
    Dense(512, activation='relu', input_shape=(784,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

In [53]:
# Categorical cross-entropy matches the one-hot labels produced by the
# generator; RMSprop is used with its default settings and accuracy is
# reported during training.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [54]:
# steps_per_epoch: the number of batches drawn from the generator before one
# epoch is declared finished and the next one starts. Typically this is the
# number of unique samples in the data divided by the batch size, so it is
# derived from batch_size here instead of being hard-coded (60000 // 32 == 1875).
num_train_samples = 60000  # size of the MNIST training split served by my_generator
model.fit_generator(my_generator(),
                    steps_per_epoch=num_train_samples // batch_size,
                    epochs=epochs, verbose=1)


Epoch 1/20
1875/1875 [==============================] - 19s - loss: 0.2161 - acc: 0.9362    
Epoch 2/20
1875/1875 [==============================] - 19s - loss: 0.1287 - acc: 0.9685    
Epoch 3/20
1875/1875 [==============================] - 18s - loss: 0.1214 - acc: 0.9732    
Epoch 4/20
1875/1875 [==============================] - 18s - loss: 0.1152 - acc: 0.9765    
Epoch 5/20
1875/1875 [==============================] - 19s - loss: 0.1113 - acc: 0.9783    
Epoch 6/20
1875/1875 [==============================] - 17s - loss: 0.1130 - acc: 0.9791    
Epoch 7/20
1875/1875 [==============================] - 16s - loss: 0.1071 - acc: 0.9814    
Epoch 8/20
1875/1875 [==============================] - 17s - loss: 0.1031 - acc: 0.9826    
Epoch 9/20
1875/1875 [==============================] - 20s - loss: 0.1074 - acc: 0.9825    
Epoch 10/20
1875/1875 [==============================] - 17s - loss: 0.1082 - acc: 0.9829    
Epoch 11/20
1875/1875 [==============================] - 19s - loss: 0.1092 - acc: 0.9836    
Epoch 12/20
1875/1875 [==============================] - 20s - loss: 0.1080 - acc: 0.9851    
Epoch 13/20
1875/1875 [==============================] - 17s - loss: 0.1125 - acc: 0.9837    
Epoch 14/20
1875/1875 [==============================] - 17s - loss: 0.1102 - acc: 0.9847    
Epoch 15/20
1875/1875 [==============================] - 16s - loss: 0.1084 - acc: 0.9854    
Epoch 16/20
1875/1875 [==============================] - 17s - loss: 0.1093 - acc: 0.9858    
Epoch 17/20
1875/1875 [==============================] - 19s - loss: 0.1113 - acc: 0.9864    
Epoch 18/20
1875/1875 [==============================] - 16s - loss: 0.1038 - acc: 0.9870    
Epoch 19/20
1875/1875 [==============================] - 18s - loss: 0.1078 - acc: 0.9863    
Epoch 20/20
1875/1875 [==============================] - 16s - loss: 0.0985 - acc: 0.9879    
Out[54]:
<keras.callbacks.History at 0x10611aef0>

音声用のGenerator


In [55]:
import os
import glob
import numpy as np
import librosa

In [88]:
def pad(y, length):
    """Return ``y`` forced to exactly ``length`` samples.

    Shorter inputs are right-padded with zeros; longer inputs are truncated.

    Args:
        y: 1-D sequence of samples.
        length: number of samples in the result.

    Returns:
        A new array of ``length`` values created with ``np.zeros`` (so it has
        NumPy's default float dtype regardless of the dtype of ``y``).
    """
    out = np.zeros(length)
    # Copy as many leading samples as fit; anything beyond stays zero.
    n_copy = min(len(y), length)
    out[:n_copy] = y[:n_copy]
    return out

def extract_melspectrogram(wave_batch, sr, length, n_fft, hop_length, n_mels):
    """Convert a batch of waveforms into dB-scaled mel spectrograms.

    Args:
        wave_batch: iterable of waveforms, one per sample.
        sr: sampling rate passed to ``librosa.feature.melspectrogram``.
        length: not used by this function; kept so the signature is unchanged.
        n_fft: FFT window size passed to librosa.
        hop_length: hop size passed to librosa.
        n_mels: number of mel bands passed to librosa.

    Returns:
        np.ndarray built from one spectrogram per input waveform, in input order.
    """
    def to_db_melgram(wave):
        power = librosa.feature.melspectrogram(
            y=wave, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
        # ref=np.max expresses dB relative to each spectrogram's own peak.
        return librosa.power_to_db(power, ref=np.max)

    return np.array([to_db_melgram(wave) for wave in wave_batch])

def add_noise(wave_batch, sr, ratio=0.05):
    """Return a copy of ``wave_batch`` with Gaussian noise added to each waveform.

    The noise is zero-mean with a standard deviation of ``ratio`` times the
    standard deviation of the waveform it is added to, so louder signals get
    proportionally louder noise.

    Args:
        wave_batch: iterable of 1-D waveforms, one per sample.
        sr: sampling rate; not used in the computation, kept so the function
            can be called with the same arguments as other batch helpers.
        ratio: noise standard deviation as a fraction of the signal's
            standard deviation. Must be >= 0.

    Returns:
        np.ndarray with one noisy waveform per input waveform, in input order.
        Floating-point inputs keep their dtype (e.g. float32 stays float32).
    """
    noise_wave_batch = []

    for y in wave_batch:
        y = np.asarray(y)
        noise = np.random.normal(0, np.std(y) * ratio, y.shape[0])
        noisy = y + noise
        if np.issubdtype(y.dtype, np.floating):
            # The float64 noise would otherwise silently upcast float32 audio.
            noisy = noisy.astype(y.dtype)
        noise_wave_batch.append(noisy)

    return np.array(noise_wave_batch)


def add_nose(wave_batch, sr, ratio=0.05):
    """Backward-compatible alias for :func:`add_noise` (original misspelled name)."""
    return add_noise(wave_batch, sr, ratio=ratio)

In [89]:
def sound_generator(sound_dir, batch_size=10, sr=16000, length=3,
                    n_fft=512, hop_length=256, n_mels=128, augment=None):
    """Yield ``(mel-spectrogram batch, label batch)`` tuples forever.

    All ``*.wav`` files directly inside ``sound_dir`` are loaded once, forced
    to ``sr * length`` samples with ``pad``, and kept in memory. The generator
    then cycles over them in fixed-size batches, extracting features per batch.

    The label of a file is parsed from its name: the part before the first
    ``.`` is split on ``_``, the first two characters of the last piece are
    read as an integer, and 1 is subtracted.

    Args:
        sound_dir: directory containing the wav files.
        batch_size: number of samples per yielded batch. The final batch of a
            pass is smaller when the file count is not a multiple of it.
        sr: sampling rate passed to ``librosa.load``.
        length: clip length in seconds; every waveform is padded/truncated
            to ``sr * length`` samples.
        n_fft, hop_length, n_mels: forwarded to ``extract_melspectrogram``.
        augment: optional callable ``augment(wave_batch, sr)`` returning the
            augmented waveform batch (e.g. a noise-adding function). When
            ``None`` (default) the waveforms are used unchanged.

    Yields:
        tuple: ``(data_batch, label_batch)`` -- the array returned by
        ``extract_melspectrogram`` for the batch, and the int32 label array.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is not positive or
            no wav file is found (the loop would otherwise spin forever
            without yielding).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    wav_paths = glob.glob(os.path.join(sound_dir, '*.wav'))
    if not wav_paths:
        raise ValueError('no .wav files found in {!r}'.format(sound_dir))

    wave_data = []
    labels = []

    # Load all waveforms and labels from the directory up front.
    # If memory becomes a problem the file I/O could move inside the batch
    # loop instead, but roughly 80k files come to about 4 GB, so loading
    # everything first is expected to be more efficient.
    for fpath in wav_paths:
        fname = os.path.basename(fpath)

        # librosa returns the rate it actually used; rebinding sr keeps the
        # padding length consistent with the loaded audio.
        y, sr = librosa.load(fpath, sr=sr)
        wave_data.append(pad(y, int(sr * length)))

        # Extract the label from the file name (see docstring).
        label = int(fname.split('.')[0].split('_')[-1][:2]) - 1
        labels.append(label)

    wave_data = np.array(wave_data, dtype=np.float32)
    labels = np.array(labels, dtype=np.int32)

    print(wave_data.dtype, wave_data.shape)
    print(labels.dtype, labels.shape)

    while True:
        # Step through the data by batch start index so the number of batches
        # follows from the sample count rather than from batch_size itself.
        for start in range(0, len(wave_data), batch_size):
            wave_batch = wave_data[start:start + batch_size]
            label_batch = labels[start:start + batch_size]

            # Optional waveform-level augmentation (e.g. adding noise).
            if augment is not None:
                wave_batch = augment(wave_batch, sr)

            # Feature extraction is done one batch at a time.
            data_batch = extract_melspectrogram(wave_batch, sr, length, n_fft, hop_length, n_mels)

            yield data_batch, label_batch

In [90]:
gen = sound_generator('../../Projects/speech-emotion-recognition/data/sounds/sample/')

In [91]:
# Fetch the first (features, labels) batch; next() is the idiomatic way to
# advance a generator.
next(gen)


float32 (100, 48000)
int32 (100,)
0 (10, 48000) (10,)
Out[91]:
(array([[[-32.30982954, -34.984229  , -27.52332604, ..., -80.        ,
          -80.        , -80.        ],
         [-28.24036922, -33.31718039, -27.09672633, ..., -80.        ,
          -80.        , -80.        ],
         [-26.06931372, -32.02468645, -26.63682338, ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ]],
 
        [[-57.16541029, -75.87655468, -75.61125944, ..., -80.        ,
          -80.        , -80.        ],
         [-57.04721801, -76.0775156 , -62.90697491, ..., -80.        ,
          -80.        , -80.        ],
         [-56.86804705, -76.23355357, -59.8938694 , ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-76.12353534, -77.38806989, -77.89556493, ..., -80.        ,
          -80.        , -80.        ],
         [-78.86884334, -77.32854848, -75.84517533, ..., -80.        ,
          -80.        , -80.        ],
         [-78.72209156, -75.86473667, -78.52149499, ..., -80.        ,
          -80.        , -80.        ]],
 
        [[-61.97431102, -72.47778458, -73.61469134, ..., -80.        ,
          -80.        , -80.        ],
         [-64.3430686 , -69.40057677, -71.45922721, ..., -80.        ,
          -80.        , -80.        ],
         [-70.23575022, -67.51506397, -69.92955489, ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-78.13379463, -76.90200217, -77.26110145, ..., -80.        ,
          -80.        , -80.        ],
         [-78.56864999, -77.81839419, -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-74.83485817, -78.20930691, -78.73502546, ..., -80.        ,
          -80.        , -80.        ]],
 
        ..., 
        [[-70.73205876, -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-72.15036921, -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-74.28830432, -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ]],
 
        [[-28.60478476, -29.03776449, -31.47523103, ..., -80.        ,
          -80.        , -80.        ],
         [-26.21652872, -31.86810023, -33.74495046, ..., -80.        ,
          -80.        , -80.        ],
         [-24.58681048, -44.41322579, -39.00203586, ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-70.83477328, -76.24278936, -78.24521161, ..., -80.        ,
          -80.        , -80.        ],
         [-69.75237358, -74.36421852, -78.1617174 , ..., -80.        ,
          -80.        , -80.        ],
         [-71.28257172, -77.64345535, -80.        , ..., -80.        ,
          -80.        , -80.        ]],
 
        [[-40.8662439 , -32.65086604, -34.87308953, ..., -80.        ,
          -80.        , -80.        ],
         [-34.16580248, -33.69276653, -35.86661242, ..., -80.        ,
          -80.        , -80.        ],
         [-31.53119448, -35.05190479, -37.13938307, ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ]]]),
 array([4, 0, 6, 1, 6, 6, 6, 3, 3, 7], dtype=int32))

S/N比


In [11]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
y1, sr = librosa.load('data/hand.wav', sr=16000)

In [4]:
y2, sr = librosa.load('data/ear.wav', sr=16000)

In [5]:
y1[:10]


Out[5]:
array([ 0.00244141,  0.00311279,  0.00396729,  0.00375366,  0.003479  ,
        0.00445557,  0.00491333,  0.00393677,  0.00360107,  0.00357056], dtype=float32)

In [6]:
y1.shape


Out[6]:
(16518,)

In [7]:
y2.shape


Out[7]:
(18276,)

In [8]:
plt.plot(y1)


Out[8]:
[<matplotlib.lines.Line2D at 0x114777160>]

In [9]:
plt.plot(y2)


Out[9]:
[<matplotlib.lines.Line2D at 0x11494f128>]

In [12]:
print(np.mean(y2))
print(np.std(y2))


1.51953e-07
0.0032909

In [22]:
import numpy as np
noise = np.random.normal(0, np.std(y2) * 0.08, y2.shape[0])

In [23]:
noise.shape


Out[23]:
(18276,)

In [24]:
noise


Out[24]:
array([  1.76203666e-04,   9.41245502e-05,   1.41392443e-04, ...,
         3.87761069e-04,   3.28565603e-05,  -2.13289317e-04])

In [25]:
plt.plot(noise)


Out[25]:
[<matplotlib.lines.Line2D at 0x118471f28>]

In [26]:
# Save the generated noise on its own as a 16 kHz wav file for listening.
# NOTE(review): the `librosa.output` module was removed in librosa 0.8.0, so
# this call only works on older releases -- confirm the pinned version, or
# switch to `soundfile.write` when upgrading.
librosa.output.write_wav('noise.wav', noise, sr=16000)

In [27]:
plt.plot(y2 + noise)


Out[27]:
[<matplotlib.lines.Line2D at 0x118e4d2b0>]

In [28]:
# Save the signal with the noise mixed in as a 16 kHz wav file for listening.
# NOTE(review): the `librosa.output` module was removed in librosa 0.8.0, so
# this call only works on older releases -- confirm the pinned version, or
# switch to `soundfile.write` when upgrading.
librosa.output.write_wav('noise_sound.wav', y2 + noise, sr=16000)

In [29]:
y2


Out[29]:
array([ -1.83105469e-04,  -1.52587891e-04,  -9.15527344e-05, ...,
         6.10351562e-05,   6.10351562e-05,   6.10351562e-05], dtype=float32)

In [30]:
plt.plot(y2)


Out[30]:
[<matplotlib.lines.Line2D at 0x118f485f8>]

In [ ]:


In [ ]: