Created by Judit Acs
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
import os
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from keras.layers import Input, Dense, Bidirectional, Dropout, Conv1D, MaxPooling1D
from keras.layers.recurrent import LSTM, GRU
from keras.models import Model
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import EarlyStopping
Data source on Kaggle.
The data is split into train and test sets. One wave file is one sample.
We first load each wave file as a vector of integers, collecting them into a list of lists.
In [2]:
def read_dir(dirname, X, y, label):
    """Load every wav file in `dirname`, appending the raw sample vector
    to X and `label` to y (in-place).

    Each file must be sampled at 16 kHz; the assert guards against
    accidentally mixing sample rates.
    """
    for fname in os.listdir(dirname):
        rate, samples = wavfile.read(os.path.join(dirname, fname))
        assert rate == 16000
        X.append(samples)
        y.append(label)
# One (X, y) pair of lists per split/class; label 1 = cat, 0 = dog.
X_train_cat, y_train_cat = [], []
X_train_dog, y_train_dog = [], []
X_test_cat, y_test_cat = [], []
X_test_dog, y_test_dog = [], []

read_dir("data/cat_dog/train/cat", X_train_cat, y_train_cat, 1)
read_dir("data/cat_dog/train/dog", X_train_dog, y_train_dog, 0)
read_dir("data/cat_dog/test/cat", X_test_cat, y_test_cat, 1)
read_dir("data/cat_dog/test/dog", X_test_dog, y_test_dog, 0)

len(X_train_cat), len(X_train_dog)
Out[2]:
We discard some of the cat files to balance the classes, then merge the cat and dog matrices.
In [3]:
# Keep the same number of samples from each class (drop the surplus),
# then concatenate cats and dogs into single train/test lists.
n_per_class = min(len(X_train_cat), len(X_train_dog))
X_train = X_train_cat[:n_per_class] + X_train_dog[:n_per_class]
y_train = y_train_cat[:n_per_class] + y_train_dog[:n_per_class]
print(n_per_class, len(X_train))

n_per_class = min(len(X_test_cat), len(X_test_dog))
X_test = X_test_cat[:n_per_class] + X_test_dog[:n_per_class]
y_test = y_test_cat[:n_per_class] + y_test_dog[:n_per_class]
n_per_class
Out[3]:
In [4]:
def create_padded_mtx(list_of_lists, labels, maxlen, pad=0):
    """Cut every 1-D signal into chunks of roughly `maxlen` samples and
    build a fixed-width sample matrix plus the matching label vector.

    Parameters:
        list_of_lists: iterable of 1-D numpy arrays (one per wav file).
        labels: per-signal labels; every chunk inherits its signal's label.
        maxlen: target number of samples (columns) per row.
        pad: scalar value used to front-pad chunks shorter than maxlen.

    Returns:
        (X, y): X with shape (total_chunks, maxlen), y with one label per row.
    """
    X = []
    y = []
    for i, x in enumerate(list_of_lists):
        # max(1, ...) guards signals shorter than maxlen:
        # np.array_split raises ValueError when asked for 0 sections.
        x_mult = np.array_split(x, max(1, x.shape[0] // maxlen))
        for chunk in x_mult:
            pad_size = maxlen - chunk.shape[0]
            if pad_size > 0:
                # BUGFIX: the original referenced an undefined name `l` here
                # (NameError on this path) and clobbered the `pad` parameter.
                # Front-pad with the requested pad value instead.
                padding = np.full(pad_size, pad)
                chunk = np.concatenate((padding, chunk))
            # Chunks longer than maxlen keep only their last maxlen samples.
            X.append(chunk[-maxlen:])
        y.extend([labels[i]] * len(x_mult))
    return np.array(X), np.array(y)
# Split every recording into fixed-width rows of sample_len samples
# (short remainders are front-padded). NOTE: X_train/y_train are
# overwritten in place, so this cell is not idempotent on re-run.
sample_len = 10000
X_train, y_train = create_padded_mtx(X_train, y_train, sample_len)
X_test, y_test = create_padded_mtx(X_test, y_test, sample_len)
Wav samples vary over a large range; we prefer values closer to zero.
StandardScaler scales all values so that the dataset has a mean of 0 and a standard deviation of 1.
Note that we fit StandardScaler on the train data only and use those value to transform both the train and the test matrices.
In [5]:
# Standardize with statistics computed on the training set only; reusing
# the same transform on the test set avoids leaking test statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
np.mean(X_train), np.std(X_train), np.mean(X_test), np.std(X_test)
Out[5]:
In [6]:
# Shuffle samples and labels with one shared permutation so pairs stay aligned.
perm = np.random.permutation(X_train.shape[0])
X_train = X_train[perm]
y_train = y_train[perm]
X_train.shape, X_test.shape
Out[6]:
The number of unique cat and dog samples:
In [7]:
# np.unique sorts the labels, so cnt[0] counts dogs (label 0) and cnt[1] cats (label 1).
values, cnt = np.unique(y_train, return_counts=True)
print("Number of vaus: {}\nNumber of mieuws: {}".format(cnt[0], cnt[1]))
In [8]:
# Simple feed-forward classifier over the scaled waveform rows.
input_layer = Input(batch_shape=(None, X_train.shape[1]))
layer = Dense(100, activation="sigmoid")(input_layer)
# randomly disable 20% of the neurons, prevents or reduces overfitting
layer = Dropout(.2)(layer)
# BUGFIX: this Dense was wired to input_layer, which silently dropped the
# first Dense + Dropout from the computation graph; it must consume the
# previous layer instead.
layer = Dense(100, activation="sigmoid")(layer)
layer = Dropout(.2)(layer)
# Single sigmoid unit -> probability of class 1 (cat).
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=input_layer, outputs=layer)
model.compile("rmsprop", loss="binary_crossentropy")
In [9]:
%time
ea = EarlyStopping(patience=2)
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_split=.1, callbacks=[ea])
Out[9]:
In [10]:
pred = model.predict(X_test)
# Threshold at 0.5 instead of np.round: round() returns floats and rounds
# ties-to-even (0.5 -> 0); this matches the CNN evaluation cell below.
labels = (pred > 0.5).astype(int)
In [11]:
# Per-class metrics; classes are ordered by sorted label value,
# so index 0 = label 0 (dog) and index 1 = label 1 (cat).
prec, rec, F, _ = precision_recall_fscore_support(y_test, labels)
print("Dog\n===========\nprec:{}\nrec:{}\nF-score:{}".format(prec[0], rec[0], F[0]))
print("\nCat\n===========\nprec:{}\nrec:{}\nF-score:{}".format(prec[1], rec[1], F[1]))
CNNs are not only good at image processing but also at handling long temporal data such as audio files.
CNNs and RNNs require 3D tensors instead of 2D tensors (normal matrices).
3D tensors are usually shaped as batch_size x timestep x feature_number, where batch_size is the number of samples fed to the network at once, timestep is the number of time steps the samples cover and feature_number is the dimension of the feature vectors. Audio files are one dimensional, so feature_number is 1.
Reshaping X_train and X_test:
In [12]:
# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
X_train_3d = X_train[:, :, np.newaxis]
X_test_3d = X_test[:, :, np.newaxis]
In [13]:
# list of convolutional layers, try adding more or changing the parameters
# list of convolutional layers, try adding more or changing the parameters
conv_layers = [
    {'filters': 200, 'kernel_size': 40, 'strides': 2, 'padding': "same", 'activation': "relu"},
    {'filters': 200, 'kernel_size': 10, 'strides': 10, 'padding': "same", 'activation': "relu"},
    {'filters': 50, 'kernel_size': 10, 'strides': 10, 'padding': "same", 'activation': "relu"},
]

input_layer = Input(batch_shape=(None, X_train_3d.shape[1], 1))

# Stack every configured Conv1D on top of the input in order.
layer = input_layer
for cfg in conv_layers:
    layer = Conv1D(**cfg)(layer)

layer = Dropout(.2)(layer)
# reduce the number of parameters
layer = MaxPooling1D(2, padding="same")(layer)
# LSTM summarizes the downsampled feature sequence into one vector.
layer = LSTM(128)(layer)
out = Dense(1, activation="sigmoid")(layer)

m = Model(inputs=input_layer, outputs=out)
m.compile("adam", loss='binary_crossentropy')
In [14]:
%%time
# Train the CNN+LSTM; EarlyStopping halts once val_loss stops improving for 2 epochs,
# so epochs=1000 is only an upper bound.
ea = EarlyStopping(patience=2)
hist = m.fit(X_train_3d, y_train, epochs=1000, batch_size=128, validation_split=.1, callbacks=[ea])
In [15]:
# Convert sigmoid probabilities to hard 0/1 labels at the 0.5 threshold.
pred = m.predict(X_test_3d)
labels = np.where(pred > 0.5, 1, 0)
In [16]:
# Per-class metrics; classes are ordered by sorted label value,
# so index 0 = label 0 (dog) and index 1 = label 1 (cat).
prec, rec, F, _ = precision_recall_fscore_support(y_test, labels)
print("Dog\n===========\nprec:{}\nrec:{}\nF-score:{}".format(prec[0], rec[0], F[0]))
print("\nCat\n===========\nprec:{}\nrec:{}\nF-score:{}".format(prec[1], rec[1], F[1]))