VGG-M model with a non_local_dot_product attention block, trained on VoxCeleb1


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose
# from lib.non_local_concatenation import NONLocalBlock2D
# from lib.non_local_gaussian import NONLocalBlock2D
# from non_local_embedded_gaussian import NONLocalBlock2D
from non_local_dot_product import NONLocalBlock2D

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt
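
For reference, the dot-product variant of the non-local block (Wang et al., 2018) embeds every spatial position with 1x1 convolutions, computes pairwise dot-product affinities between all positions, aggregates the value embeddings with those affinities, and adds the result back through a residual connection. The following is an illustrative sketch only; the notebook imports the real NONLocalBlock2D from non_local_dot_product.py, and the class name and C/2 bottleneck width here are assumptions for exposition:

import torch
import torch.nn as nn

class DotProductNonLocal2D(nn.Module):
    """Sketch of a dot-product non-local block (Wang et al., 2018); illustrative only."""
    def __init__(self, in_channels):
        super().__init__()
        inter = in_channels // 2                        # bottleneck width
        self.theta = nn.Conv2d(in_channels, inter, 1)   # query embedding
        self.phi = nn.Conv2d(in_channels, inter, 1)     # key embedding
        self.g = nn.Conv2d(in_channels, inter, 1)       # value embedding
        self.w_z = nn.Conv2d(inter, in_channels, 1)     # project back to in_channels

    def forward(self, x):
        b, _, h, w = x.size()
        n = h * w
        theta = self.theta(x).view(b, -1, n).permute(0, 2, 1)   # (B, N, C')
        phi = self.phi(x).view(b, -1, n)                        # (B, C', N)
        g = self.g(x).view(b, -1, n).permute(0, 2, 1)           # (B, N, C')
        f = torch.matmul(theta, phi) / n       # dot-product affinities, scaled by N
        y = torch.matmul(f, g)                 # aggregate values over all positions
        y = y.permute(0, 2, 1).contiguous().view(b, -1, h, w)
        return self.w_z(y) + x                 # residual connection

# z = DotProductNonLocal2D(256)(torch.randn(2, 256, 9, 8))  # shape is preserved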

In [2]:
import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

In [3]:
class IdentificationDataset(Dataset):
    
    def __init__(self, path, train, transform=None):
        iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        
        if train:
            phases = [1, 2]
        
        else:
            phases = [3]
            
        mask = split['phase'].isin(phases)
        self.dataset = split['path'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # path
        track_path = self.dataset[idx]
        audio_path = os.path.join(self.path, 'train', track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)
        # extract the label from a path like id10003/L9_sh8msGV59/00001.wav
        # subtract 1 because PyTorch expects class labels in [0, 1251 - 1]
        label = int(track_path.split('/')[0].replace('id1', '')) - 1

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlap duration (samples): window minus step
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # FFT size: next power of 2 >= Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97
        
        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        
        # remove the DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither
        
        if self.train:
            # select a random 3-second segment (assumes every clip is longer than 3 s)
            segment_len = 3 # sec
            upper_bound = len(samples) - segment_len * rate
            start = np.random.randint(0, upper_bound)
            end = start + segment_len * rate
            samples = samples[start:end]
            #samples = np.hstack((np.hstack((samples,samples)),samples)) # hstack 3 times
        
        # spectrogram
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                        mode='magnitude', return_onesided=False)
        
        # multiplying by rate / 10 (= 1600 at 16 kHz) makes the spectrograms here
        # match the scaling of those in the paper
        spec *= rate / 10
        
        if self.transform:
            spec = self.transform(spec)

        return label, spec
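
For VoxCeleb1's 16 kHz audio these parameters come out to a 400-sample (25 ms) Hamming window with a 240-sample (15 ms) overlap, i.e. a 10 ms hop, and a 512-point FFT; with return_onesided=False the spectrogram therefore has 512 frequency bins, which is the 512 hard-coded in Normalize below. A quick sanity check (a sketch, assuming 16 kHz input):

rate = 16000                        # VoxCeleb1 audio is 16 kHz
Tw, Ts = 25, 10                     # window width / step in ms
Nw = int(rate * Tw * 1e-3)          # 400-sample (25 ms) window
Ns = int(rate * (Tw - Ts) * 1e-3)   # 240-sample (15 ms) overlap -> 10 ms hop
nfft = 2 ** (Nw - 1).bit_length()   # next power of two >= Nw
print(Nw, Ns, nfft)                 # 400 240 512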

In [4]:
class Normalize(object):
    """Normalizes voice spectrogram (mean-varience)"""
    
    def __call__(self, spec):
        
        # (Freq, Time)
        # mean-variance normalization per spectrogram (not batch-wise); 512 = nfft frequency bins
        mu = spec.mean(axis=1).reshape(512, 1)
        sigma = spec.std(axis=1).reshape(512, 1)
        spec = (spec - mu) / sigma

        return spec

class ToTensor(object):
    """Convert spectogram to Tensor."""
    
    def __call__(self, spec):
        F, T = spec.shape
        
        # specs are 2D (Freq, Time) but need a leading channel dim to be 3D
        spec = spec.reshape(1, F, T)
        
        # make the ndarray to be of a proper type (was float64)
        spec = spec.astype(np.float32)
        
        return torch.from_numpy(spec)
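
A quick shape check of the transform pipeline on a random (Freq, Time) array (a sketch; a 3-second training segment yields roughly 300 time frames):

fake_spec = np.random.rand(512, 300)                # (Freq, Time)
out = Compose([Normalize(), ToTensor()])(fake_spec)
print(out.shape, out.dtype)                         # torch.Size([1, 512, 300]) torch.float32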

In [5]:
class VoiceNet(nn.Module):

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        
        self.attention_1 = NONLocalBlock2D(in_channels=256)
        #self.attention_2 = NONLocalBlock2D(in_channels=256)
        
        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)
        
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))
        
        # a Conv2d whose kernel spans the full height H acts as an FC layer applied per time step
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)
        
    def forward(self, x):
        B, C, H, W = x.size()
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        #x = self.attention_96(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        #x = self.attention_256(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.attention_1(x)
        x = self.relu(self.bn6(self.fc6(x)))
        
        # average over the (variable-length) time axis
        _, _, _, W = x.size()
        x = F.avg_pool2d(x, kernel_size=(1, W))
        
        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))
        x = self.fc8(x)
        
        # during training there's no need for a softmax because CrossEntropyLoss computes it;
        # self.training is the correct flag (self.train is a method and is always truthy)
        if self.training:
            return x
        
        else:
            return self.softmax(x)
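
A hedged shape walk-through with a dummy 3-second input: (1, 1, 512, 300) shrinks to (1, 256, 9, 8) after mpool5, the (9, 1) fc6 kernel collapses the frequency axis to 1, and the average pool removes the variable-length time axis:

net_check = VoiceNet(num_classes=1251)
net_check.eval()                          # eval mode: BatchNorm accepts a single sample
with torch.no_grad():
    dummy = torch.randn(1, 1, 512, 300)   # (B, C, Freq, Time) ~ one 3 s spectrogram
    out = net_check(dummy)
print(out.shape)                          # torch.Size([1, 1251])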

In [6]:
DATASET_PATH = '/data/hktxt/voxceleb/'
LOG_PATH = '/data/hktxt/Condadev/voxpy/logs/vgg_atten'
EPOCH_NUM = 30

# the shared code uses B = 100, but PyTorch throws CUDA out of memory at B = 97,
# even though B = 96 takes only 90.6% of GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# with torch.backends.cudnn.deterministic = True, B = 100 fits; B = 64 is used here
torch.backends.cudnn.deterministic = True
B = 64

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4  # alternative: 1e-8
# lr scheduler parameter
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
#DEVICE = "5,6,7"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = 20
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)
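
gamma is chosen so that the per-epoch geometric decay (applied via StepLR below with step_size=1) runs from LR_INIT down to exactly LR_LAST over the 30 epochs; a one-line check:

lrs = [LR_INIT * gamma ** e for e in range(EPOCH_NUM)]
print(lrs[0], lrs[-1])   # 0.01 and ~0.0001, i.e. LR_INIT and LR_LAST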

In [7]:
#os.environ["CUDA_VISIBLE_DEVICES"] = DEVICE
net = VoiceNet(num_classes=1251)
#net.to(DEVICE)
#net = nn.DataParallel(net).cuda()
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # DataParallel splits the batch along dim 0, e.g. [30, ...] -> 3 x [10, ...] on 3 GPUs
    net = nn.DataParallel(net)

net.to(DEVICE)
#net.cuda()

transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)

testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS*2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)


Let's use 2 GPUs!

In [8]:
train_start = time.time()
for epoch_num in range(EPOCH_NUM):
    # pre-1.1 PyTorch convention: step the scheduler at the start of each epoch;
    # on PyTorch >= 1.1 it should be called after the epoch's optimizer steps
    lr_scheduler.step()
    
    # train
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()
    
    for iter_num, (labels, specs) in tqdm(enumerate(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        #labels, specs = labels.cuda(), specs.cuda()
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        
        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        TBoard.add_scalar('gMetrics/lr', lr_scheduler.get_lr()[0], step_num)
        
#         TBoard.add_scalar('weights/conv1', net.conv1.weight.mean(), step_num)
#         TBoard.add_scalar('weights/conv5', net.conv5.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc6', net.fc6.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc7', net.fc7.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc8', net.fc8.weight.mean(), step_num)
#         TBoard.add_scalar('grads/conv1', net.conv1.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/conv5', net.conv5.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc6', net.fc6.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc7', net.fc7.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc8', net.fc8.weight.grad.mean(), step_num)
        
    
    # test
    net.eval()
    
    top5_accuracy = 0
    top1_accuracy = 0

    for _, (label, spec) in tqdm(enumerate(testsetloader)):
        label, spec = label.to(DEVICE), spec.to(DEVICE)
        #labels, specs = labels.cuda(), specs.cuda()
        probs = net(spec)

        # Top-5 predictions (batch size is 1 at test time)
        pred_top5 = probs.topk(5)[1].view(5)

        if label in pred_top5:
            # count a top-5 hit
            top5_accuracy += 1

            if label == pred_top5[0]:
                # count a top-1 hit (every top-1 hit is also a top-5 hit)
                top1_accuracy += 1
    
    top5_accuracy /= len(testsetloader)
    top1_accuracy /= len(testsetloader)

    TBoard.add_scalar('gMetrics/test_top5', top5_accuracy, epoch_num)
    TBoard.add_scalar('gMetrics/test_top1', top1_accuracy, epoch_num)

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))    

# when training is finished, save the model weights
# (keys get a 'module.' prefix because net is wrapped in nn.DataParallel)
torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot.pt'))
TBoard.close()
print('top 1 accuracy @ the end: {}'.format(round(top1_accuracy, 3)))
print('top 5 accuracy @ the end: {}'.format(round(top5_accuracy, 3)))
print('loss @ the end: {}'.format(round(loss.item(), 3)))


Epoch 1/30
2270it [06:39,  6.01it/s]
8251it [01:30, 91.16it/s] 
Epoch 2/30
2270it [06:29,  6.27it/s]
8251it [01:23, 98.33it/s] 
Epoch 3/30
2270it [06:30,  6.20it/s]
8251it [01:31, 89.97it/s]
Epoch 4/30
2270it [06:31,  6.16it/s]
8251it [01:25, 96.66it/s] 
Epoch 5/30
2270it [06:31,  6.05it/s]
8251it [01:21, 101.23it/s]
Epoch 6/30
2270it [06:29,  6.11it/s]
8251it [01:23, 98.94it/s] 
Epoch 7/30
2270it [06:31,  6.14it/s]
8251it [01:18, 105.38it/s]
Epoch 8/30
2270it [06:28,  6.20it/s]
8251it [01:20, 102.13it/s]
Epoch 9/30
2270it [06:30,  6.17it/s]
8251it [01:18, 105.51it/s]
Epoch 10/30
2270it [06:29,  6.30it/s]
8251it [01:20, 111.04it/s]
Epoch 11/30
2270it [06:52,  6.05it/s]
8251it [01:22, 111.32it/s]
Epoch 12/30
2270it [08:03,  6.07it/s]
8251it [01:19, 103.46it/s]
Epoch 13/30
2270it [06:34,  6.06it/s]
8251it [01:32, 91.17it/s]
Epoch 14/30
2270it [06:31,  6.19it/s]
8251it [01:20, 102.54it/s]
Epoch 15/30
2270it [06:31,  6.34it/s]
8251it [01:18, 104.56it/s]
Epoch 16/30
2270it [06:28,  6.29it/s]
8251it [01:19, 107.92it/s]
Epoch 17/30
2270it [06:29,  6.26it/s]
8251it [01:21, 101.74it/s]
Epoch 18/30
2270it [06:32,  6.10it/s]
8251it [01:28, 93.57it/s] 
Epoch 19/30
2270it [06:31,  6.19it/s]
8251it [01:23, 98.78it/s] 
Epoch 20/30
2270it [06:32,  6.13it/s]
8251it [01:31, 90.57it/s] 
Epoch 21/30
2270it [06:33,  6.04it/s]
8251it [01:25, 96.95it/s] 
Epoch 22/30
2270it [06:34,  6.01it/s]
8251it [01:21, 98.56it/s]
Epoch 23/30
2270it [06:33,  6.22it/s]
8251it [01:27, 94.63it/s]
Epoch 24/30
2270it [06:33,  6.08it/s]
8251it [01:28, 93.21it/s]
Epoch 25/30
2270it [06:32,  6.05it/s]
8251it [01:24, 111.44it/s]
Epoch 26/30
2270it [06:33,  6.27it/s]
8251it [01:33, 88.58it/s]
Epoch 27/30
2270it [06:33,  6.06it/s]
8251it [01:23, 98.29it/s] 
Epoch 28/30
2270it [06:34,  6.25it/s]
8251it [01:22, 109.25it/s]
Epoch 29/30
2270it [06:32,  6.09it/s]
8251it [01:24, 100.26it/s]
Epoch 30/30
2270it [06:35,  6.17it/s]
8251it [01:30, 91.08it/s] 
Training complete in 242m 55s
top 1 accuracy @ the end: 0.793
top 5 accuracy @ the end: 0.921
loss @ the end: 0.164
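
To reuse the snapshot outside this notebook, note that the state_dict keys carry a 'module.' prefix because the model was saved from inside nn.DataParallel; a minimal reload sketch for single-device inference (assuming the path saved above):

state = torch.load(os.path.join(LOG_PATH, 'model_snapshot.pt'), map_location='cpu')
state = {k.replace('module.', '', 1): v for k, v in state.items()}  # strip the DataParallel prefix
net_single = VoiceNet(num_classes=1251)
net_single.load_state_dict(state)
net_single.eval()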

In [ ]: