In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose
# from lib.non_local_concatenation import NONLocalBlock2D
# from lib.non_local_gaussian import NONLocalBlock2D
# from non_local_embedded_gaussian import NONLocalBlock2D
from non_local_dot_product import NONLocalBlock2D
import tensorboardX
from tqdm import tqdm
import matplotlib.pyplot as plt
In [2]:
import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)
In [3]:
class IdentificationDataset(Dataset):

    def __init__(self, path, train, transform=None):
        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])

        if train:
            phases = [1, 2]
        else:
            phases = [3]

        mask = split['phase'].isin(phases)
        self.dataset = split['path'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # path
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'train', track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)

        # extract the label from a path like 'id10003/L9_sh8msGV59/00001.wav';
        # subtract 1 because PyTorch expects class indices in [0, 1251-1]
        label = int(track_path.split('/')[0].replace('id1', '')) - 1

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25  # ms
        Ts = 10  # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap duration (samples)
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # FFT size: next power of 2 >= Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97

        # pre-emphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

        # remove the DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither

        if self.train:
            # segment selection: crop a random 3-second segment
            segment_len = 3  # sec
            upper_bound = len(samples) - segment_len * rate
            start = np.random.randint(0, upper_bound)
            end = start + segment_len * rate
            samples = samples[start:end]
            #samples = np.hstack((np.hstack((samples,samples)),samples)) # hstack 3 times

        # spectrogram
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft,
                                        mode='magnitude', return_onesided=False)

        # multiplying by 1600 (= rate / 10 at 16 kHz) makes the spectrograms here
        # and in the paper "the same"
        spec *= rate / 10

        if self.transform:
            spec = self.transform(spec)

        return label, spec
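As a quick sanity check of the preprocessing above, the sketch below reproduces the spectrogram arithmetic on a synthetic 3-second clip (assuming 16 kHz audio, as in VoxCeleb; no dataset files are needed) and prints the resulting (freq, time) shape.
In [ ]:
# minimal sketch: verify the spectrogram shape produced by the preprocessing above
# (assumes a 16 kHz sampling rate; uses random noise instead of a real .wav file)
rate = 16000
samples = np.random.randn(3 * rate)                 # fake 3-second training segment
Nw = int(rate * 25 * 1e-3)                          # 400-sample (25 ms) frames
Ns = int(rate * (25 - 10) * 1e-3)                   # 240-sample overlap -> 10 ms hop
nfft = 2 ** (Nw - 1).bit_length()                   # 512
_, _, spec = signal.spectrogram(samples, rate, 'hamming', Nw, Ns, nfft,
                                mode='magnitude', return_onesided=False)
print(spec.shape)                                   # (512, ~298) = (freq bins, frames)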
In [4]:
class Normalize(object):
    """Normalizes a voice spectrogram (mean-variance)."""

    def __call__(self, spec):
        # (Freq, Time)
        # mean-variance normalization for every spectrogram (not batch-wise)
        mu = spec.mean(axis=1).reshape(512, 1)
        sigma = spec.std(axis=1).reshape(512, 1)
        spec = (spec - mu) / sigma
        return spec


class ToTensor(object):
    """Converts a spectrogram to a Tensor."""

    def __call__(self, spec):
        F, T = spec.shape
        # specs are 2D (Freq, Time) but have to be 3D (add a channel dim)
        spec = spec.reshape(1, F, T)
        # cast the ndarray to a proper dtype (was float64)
        spec = spec.astype(np.float32)
        return torch.from_numpy(spec)
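A small usage example of the two transforms on a dummy spectrogram of the shape produced above: after Normalize and ToTensor every sample becomes a float32 tensor with an explicit channel dimension.
In [ ]:
# apply the transform pipeline to a dummy (Freq, Time) spectrogram
dummy_spec = np.abs(np.random.randn(512, 298))
out = Compose([Normalize(), ToTensor()])(dummy_spec)
print(out.shape, out.dtype)          # torch.Size([1, 512, 298]) torch.float32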
In [5]:
class VoiceNet(nn.Module):

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.attention_1 = NONLocalBlock2D(in_channels=256)
        #self.attention_2 = NONLocalBlock2D(in_channels=256)

        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))

        # a Conv2d with a kernel of size (H, 1) on an (H, W) feature map is
        # equivalent to a fully connected layer applied to every time frame
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)

    def forward(self, x):
        B, C, H, W = x.size()

        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        #x = self.attention_96(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        #x = self.attention_256(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.attention_1(x)
        x = self.relu(self.bn6(self.fc6(x)))

        # average pooling over the (variable-length) time dimension
        _, _, _, W = x.size()
        x = F.avg_pool2d(x, kernel_size=(1, W))

        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))
        x = self.fc8(x)

        # during training there is no need for SoftMax because CrossEntropyLoss applies it
        if self.training:
            return x
        else:
            return self.softmax(x)
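The sketch below is a CPU-only shape check for VoiceNet: a 3-second clip yields a (1, 512, ~298) spectrogram, and a batch of those should come out as (B, num_classes) scores. It assumes the NONLocalBlock2D imported in the first cell is available.
In [ ]:
# minimal sketch: forward a dummy batch through VoiceNet and check the output shape
_net = VoiceNet(num_classes=1251)
_net.eval()                                  # BatchNorm uses running statistics
with torch.no_grad():
    _scores = _net(torch.randn(2, 1, 512, 298))
print(_scores.shape)                         # torch.Size([2, 1251])
del _net, _scores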
In [6]:
DATASET_PATH = '/data/hktxt/voxceleb/'
LOG_PATH = '/data/hktxt/Condadev/voxpy/logs/vgg_atten'
EPOCH_NUM = 30
# in the shared code B = 100, but PyTorch throws CUDA out of memory at B = 97,
# even though B = 96 takes only 90.6% of the GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# B = 96
# with
torch.backends.cudnn.deterministic = True
# B = 100 fits as well
B = 64
WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4  # 1e-8
# lr scheduler parameter
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
#DEVICE = "5,6,7"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = 20
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)
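Since StepLR with step_size=1 multiplies the learning rate by gamma every epoch, the schedule is LR_INIT * gamma**k; the short check below confirms it decays from LR_INIT to LR_LAST over EPOCH_NUM epochs.
In [ ]:
# the exponential decay implied by gamma: LR_INIT at epoch 0, LR_LAST at the last epoch
lrs = [LR_INIT * gamma ** k for k in range(EPOCH_NUM)]
print('first / last lr: {:.2e} / {:.2e}'.format(lrs[0], lrs[-1]))  # 1.00e-02 / 1.00e-04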
In [7]:
#os.environ["CUDA_VISIBLE_DEVICES"] = DEVICE
net = VoiceNet(num_classes=1251)
#net.to(DEVICE)
#net = nn.DataParallel(net).cuda()
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)
net.to(DEVICE)
#net.cuda()
transforms = Compose([
    Normalize(),
    ToTensor()
])
trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)
testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS*2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
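A quick optional check that the model was built as intended: count its trainable parameters.
In [ ]:
# count trainable parameters of the (possibly DataParallel-wrapped) network
n_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
print('trainable parameters: {:.1f}M'.format(n_params / 1e6))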
In [8]:
train_start = time.time()

for epoch_num in range(EPOCH_NUM):
    lr_scheduler.step()

    # train
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()

    for iter_num, (labels, specs) in tqdm(enumerate(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        #labels, specs = labels.cuda(), specs.cuda()
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()

        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        TBoard.add_scalar('gMetrics/lr', lr_scheduler.get_lr()[0], step_num)
        # TBoard.add_scalar('weights/conv1', net.conv1.weight.mean(), step_num)
        # TBoard.add_scalar('weights/conv5', net.conv5.weight.mean(), step_num)
        # TBoard.add_scalar('weights/fc6', net.fc6.weight.mean(), step_num)
        # TBoard.add_scalar('weights/fc7', net.fc7.weight.mean(), step_num)
        # TBoard.add_scalar('weights/fc8', net.fc8.weight.mean(), step_num)
        # TBoard.add_scalar('grads/conv1', net.conv1.weight.grad.mean(), step_num)
        # TBoard.add_scalar('grads/conv5', net.conv5.weight.grad.mean(), step_num)
        # TBoard.add_scalar('grads/fc6', net.fc6.weight.grad.mean(), step_num)
        # TBoard.add_scalar('grads/fc7', net.fc7.weight.grad.mean(), step_num)
        # TBoard.add_scalar('grads/fc8', net.fc8.weight.grad.mean(), step_num)

    # test
    net.eval()
    top5_accuracy = 0
    top1_accuracy = 0

    for _, (label, spec) in tqdm(enumerate(testsetloader)):
        label, spec = label.to(DEVICE), spec.to(DEVICE)
        #label, spec = label.cuda(), spec.cuda()
        probs = net(spec)

        # calculate Top-5 and Top-1 accuracy
        pred_top5 = probs.topk(5)[1].view(5)

        if label in pred_top5:
            # increment Top-5 accuracy
            top5_accuracy += 1

            if label == pred_top5[0]:
                # increment Top-1 accuracy
                top1_accuracy += 1

    top5_accuracy /= len(testsetloader)
    top1_accuracy /= len(testsetloader)

    TBoard.add_scalar('gMetrics/test_top5', top5_accuracy, epoch_num)
    TBoard.add_scalar('gMetrics/test_top1', top1_accuracy, epoch_num)

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))

# when the training is finished, save the model
torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot.txt'))
TBoard.close()

print('top 1 accuracy @ the end: {}'.format(round(top1_accuracy, 3)))
print('top 5 accuracy @ the end: {}'.format(round(top5_accuracy, 3)))
print('loss @ the end: {}'.format(round(loss.item(), 3)))
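To reuse the saved snapshot later, a sketch like the one below should work: rebuild the model and load the weights, stripping the 'module.' prefix that nn.DataParallel adds to the state_dict keys (if the model was not wrapped, the stripping is a no-op).
In [ ]:
# minimal sketch: restore the trained weights into a plain (non-DataParallel) VoiceNet
state = torch.load(os.path.join(LOG_PATH, 'model_snapshot.txt'), map_location='cpu')
state = {k.replace('module.', '', 1): v for k, v in state.items()}
restored = VoiceNet(num_classes=1251)
restored.load_state_dict(state)
restored.eval()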
In [ ]: