In [34]:
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
In [43]:
# Hyperparameters for the MNIST generator demo below.
batch_size = 32   # samples yielded per generator batch
num_classes = 10  # MNIST digit classes 0-9
epochs = 20       # passes over the training set
In [44]:
def my_generator():
    """Yield (data_batch, label_batch) tuples from MNIST indefinitely.

    Loads the full MNIST training set into memory once, then loops forever
    yielding consecutive batches of ``batch_size`` samples, suitable for
    Keras ``fit_generator``.
    """
    # TODO: read wav data from the audio file directory instead of MNIST.
    # The base audio files used for augmentation could all fit in memory.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # Flatten 28x28 images to 784-dim float vectors scaled to [0, 1].
    X_train = X_train.reshape(60000, 784).astype('float32') / 255
    X_test = X_test.reshape(10000, 784).astype('float32') / 255
    # One-hot encode training labels (was a hard-coded 10; use num_classes).
    y_train = np_utils.to_categorical(y_train, num_classes)
    # Full batches per epoch; was hard-coded as 1875 (= 60000 / 32).
    steps = len(X_train) // batch_size
    # Yield (data, label) tuples one batch at a time, forever.
    while True:
        for i in range(steps):
            data_batch = X_train[i * batch_size:(i + 1) * batch_size]
            label_batch = y_train[i * batch_size:(i + 1) * batch_size]
            # TODO: audio augmentation (e.g. noise) + feature extraction here.
            yield data_batch, label_batch
In [45]:
# Instantiate the MNIST batch generator (body runs lazily on first next()).
gen = my_generator()
In [46]:
# Confirm we got a generator object.
gen
Out[46]:
In [47]:
# Pull one batch to sanity-check its shapes below.
images, labels = gen.__next__()
In [48]:
images.shape
Out[48]:
In [49]:
labels.shape
Out[49]:
In [51]:
# Simple MLP classifier: 784 -> 512 -> 512 -> num_classes,
# with dropout after each hidden layer.
layers = [
    Dense(512, activation='relu', input_shape=(784,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(layers)
In [53]:
# Compile with RMSprop; categorical cross-entropy matches the one-hot
# labels produced by the generator.
optimizer = RMSprop()
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
In [54]:
# steps_per_epoch: total number of samples (batches of samples) produced by the
# generator between the start of one epoch and the start of the next.
# Typically the number of unique samples in the dataset divided by the batch size.
# NOTE(review): fit_generator is deprecated in recent Keras (model.fit accepts
# generators directly) — confirm the installed version.
model.fit_generator(my_generator(), steps_per_epoch=1875, epochs=epochs, verbose=1)
Out[54]:
In [55]:
import os
import glob
import numpy as np
import librosa
In [88]:
def pad(y, length):
    """Return `y` truncated or zero-padded to exactly `length` samples."""
    out = np.zeros(length)
    n = min(len(y), length)
    out[:n] = y[:n]
    return out
def extract_melspectrogram(wave_batch, sr, length, n_fft, hop_length, n_mels):
    """Compute a dB-scaled mel spectrogram for each waveform in the batch.

    Returns a stacked ndarray with one spectrogram per input waveform.
    NOTE(review): `length` is accepted but never used here — confirm
    whether it was meant to constrain the spectrogram width.
    """
    spectrograms = [
        librosa.power_to_db(
            librosa.feature.melspectrogram(
                y=wave, sr=sr, n_fft=n_fft,
                hop_length=hop_length, n_mels=n_mels),
            ref=np.max)
        for wave in wave_batch
    ]
    return np.array(spectrograms)
def add_noise(wave_batch, sr, ratio=0.05):
    """Return a copy of each waveform with additive Gaussian noise.

    The noise standard deviation is `ratio` times the waveform's own
    standard deviation, so louder signals receive proportionally more noise.

    Note: the original definition was named ``add_nose`` (typo) while the
    caller in ``sound_generator`` expects ``add_noise``; it also never
    appended or returned anything. `sr` is accepted for signature
    compatibility but is not used by this noise model.
    """
    noisy_batch = []
    for y in wave_batch:
        noise = np.random.normal(0, np.std(y) * ratio, y.shape[0])
        noisy_batch.append(y + noise)
    return np.array(noisy_batch)
In [89]:
def sound_generator(sound_dir, batch_size=10, sr=16000, length=3,
                    n_fft=512, hop_length=256, n_mels=128):
    """Yield (mel-spectrogram batch, label batch) pairs forever.

    Loads every ``*.wav`` under `sound_dir` into memory once, then loops
    over the data in batches, adding Gaussian noise for augmentation and
    extracting mel-spectrogram features per batch.

    Labels are parsed from the two digits after the last underscore of
    the filename stem (e.g. ``xxx_01.wav`` -> label 0).
    """
    wave_data = []
    labels = []
    # Load all data and labels from the directory up front.
    # If memory becomes a problem, I/O could instead happen inside the
    # while loop; ~80k files are only about 4 GB, so preloading is
    # more efficient.
    for fpath in glob.glob(os.path.join(sound_dir, '*.wav')):
        fname = os.path.basename(fpath)
        y, sr = librosa.load(fpath, sr=sr)
        y = pad(y, sr * length)
        wave_data.append(y)
        # Extract the class label from the filename.
        label = int(fname.split('.')[0].split('_')[-1][:2]) - 1
        labels.append(label)
    wave_data = np.array(wave_data, dtype=np.float32)
    labels = np.array(labels, dtype=np.int32)
    print(wave_data.dtype, wave_data.shape)
    print(labels.dtype, labels.shape)
    # Full batches per pass; the original looped `range(batch_size)`,
    # which only works when sample_count == batch_size ** 2.
    num_batches = len(wave_data) // batch_size
    while True:
        for i in range(num_batches):
            wave_batch = wave_data[i * batch_size:(i + 1) * batch_size]
            label_batch = labels[i * batch_size:(i + 1) * batch_size]
            print(i, wave_batch.shape, label_batch.shape)
            # Augmentation: add noise. The original called
            # add_noise(wave_batch, snr) with `snr` undefined (NameError)
            # and then discarded the result.
            noise_wave_batch = add_noise(wave_batch, sr)
            # Per-batch feature extraction on the augmented audio.
            data_batch = extract_melspectrogram(noise_wave_batch, sr, length,
                                                n_fft, hop_length, n_mels)
            yield data_batch, label_batch
In [90]:
# Build the sound generator over a sample directory (hard-coded local path).
gen = sound_generator('../../Projects/speech-emotion-recognition/data/sounds/sample/')
In [91]:
# Pull one (features, labels) batch to verify the pipeline runs end to end.
gen.__next__()
Out[91]:
In [11]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Load two sample recordings, resampled to 16 kHz.
y1, sr = librosa.load('data/hand.wav', sr=16000)
In [4]:
y2, sr = librosa.load('data/ear.wav', sr=16000)
In [5]:
# Peek at the first samples of the first waveform.
y1[:10]
Out[5]:
In [6]:
y1.shape
Out[6]:
In [7]:
y2.shape
Out[7]:
In [8]:
# Plot the raw waveforms.
plt.plot(y1)
Out[8]:
In [9]:
plt.plot(y2)
Out[9]:
In [12]:
# Basic statistics of the second recording (used to scale noise below).
print(np.mean(y2))
print(np.std(y2))
In [22]:
# Gaussian noise scaled to 8% of the signal's standard deviation.
import numpy as np
noise = np.random.normal(0, np.std(y2) * 0.08, y2.shape[0])
In [23]:
noise.shape
Out[23]:
In [24]:
noise
Out[24]:
In [25]:
plt.plot(noise)
Out[25]:
In [26]:
# Listen to the noise alone.
# NOTE(review): librosa.output was removed in librosa 0.8 — newer versions
# need soundfile.write instead; confirm the installed librosa version.
librosa.output.write_wav('noise.wav', noise, sr=16000)
In [27]:
# Visualize the noisy signal.
plt.plot(y2 + noise)
Out[27]:
librosa.output.write_wav('noise_sound.wav', y2 + noise, sr=16000)
In [29]:
y2
Out[29]:
In [30]:
# Clean signal again, for visual comparison with the noisy plot above.
plt.plot(y2)
Out[30]:
In [ ]:
In [ ]: