ResNet-34: drop utterances shorter than 1 s, then randomly crop three 1 s segments and concatenate them into 3 s for training
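
The pipeline: read AISHELL-2 wavs (utterances shorter than 1 s are dropped via droped_wav.csv), pre-emphasize and dither, crop three random 1 s segments and concatenate them into a 3 s clip, turn the clip into a magnitude spectrogram, and train a 1991-way ResNet-34 speaker classifier for 35 epochs on two GPUs.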


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore') # scipy emits FutureWarnings from fft (known issue)

In [3]:
import resource
# raise the open-file limit: the 64 DataLoader workers each hold wav file handles open
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

In [4]:
!ulimit -n


2048

In [5]:
class IdentificationDataset(Dataset):
    
    def __init__(self, path, train, transform=None):
        #iden_split_path = os.path.join(path, 'iden_split.txt')
        # csv of wav files with utterances shorter than 1 s already dropped
        split = pd.read_csv('droped_wav.csv', sep=',')
        split = split.sample(frac=1) # shuffle
        
        if train:
            phases = [1, 2]
        
        else:
            phases = [3]
            
        mask = split['phase'].isin(phases)
        self.split = split
        self.dataset = split['file'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # path
        track_path = self.dataset[idx] # 'wav/C0688/IC0688W0488.wav'
        audio_path = os.path.join(self.path, track_path)

        # read .wav
        rate, samples = wavfile.read(audio_path)
        #wav_len = samples.shape[0] / rate # length (s) of the wav file
        # extract label
        label = int(self.split.loc[self.split['file'] == track_path].label)

        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlapped duration (samples)
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # FFT size: the next power of 2 above Nw
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97
        
        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        
        # removes DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither
        
        if self.train:
            
            #if wav_len >=3:
            # segment selection: crop three random 1 s windows and concatenate to 3 s
            segment_len = 1 # sec
            # assumes len(samples) > rate: clips shorter than 1 s were dropped upstream
            upper_bound = len(samples) - segment_len * rate
            start = np.random.randint(0, upper_bound, size=3)
            end = start + segment_len * rate
            samples = np.concatenate([samples[s:e] for s, e in zip(start, end)])

        
        # spectrogram
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                        mode='magnitude', return_onesided=False)
        
        # scaling by rate / 10 (= 1600 at 16 kHz) matches the spectrogram
        # magnitudes used in the paper
        spec *= rate / 10
        
        if self.transform:
            spec = self.transform(spec)

        return label, spec
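
A quick sanity check of the spectrogram geometry (a sketch, assuming 16 kHz AISHELL-2 audio; the Out[10] shape below is consistent with this):

rate = 16000
Nw = int(rate * 25 * 1e-3)         # 400 samples per 25 ms frame
Ns = int(rate * 15 * 1e-3)         # 240 samples of overlap
hop = Nw - Ns                      # 160 samples = a 10 ms step
nfft = 2 ** (Nw - 1).bit_length()  # 512 two-sided frequency bins
n_frames = (3 * rate - Nw) // hop + 1
print(nfft, n_frames)              # 512 298 -> spec of shape (1, 512, 298)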

In [6]:
class Normalize(object):
    """Normalizes a voice spectrogram (per-spectrogram mean-variance)."""
    
    def __call__(self, spec):
        
        # (Freq, Time)
        # mean-variance normalization for every spectrogram (not batch-wise)
        mu = spec.mean(axis=1).reshape(512, 1)
        sigma = spec.std(axis=1).reshape(512, 1)
        spec = (spec - mu) / sigma

        return spec

class ToTensor(object):
    """Converts a spectrogram to a Tensor."""
    
    def __call__(self, spec):
        F, T = spec.shape
        
        # specs are 2D (Freq, Time); add a leading channel dimension
        spec = spec.reshape(1, F, T)
        
        # cast to a proper dtype for the network (was float64)
        spec = spec.astype(np.float32)
        
        return torch.from_numpy(spec)
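
The hard-coded 512 in Normalize is the two-sided nfft from the dataset above; a shape-agnostic variant (a sketch, not the code that was run here) would use keepdims and guard against silent frames:

class NormalizePerSpec(object):
    """Per-spectrogram mean-variance normalization for any bin count."""
    def __call__(self, spec):
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        return (spec - mu) / (sigma + 1e-8)  # eps avoids division by zero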

In [7]:
import torch.nn as nn
import torch.utils.model_zoo as model_zoo


__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super(ResNet, self).__init__()
        self.inplanes = 64
        # in_channels = 1 for single-channel spectrograms (torchvision uses 3 for RGB)
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model
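
Note that conv1 above takes 1 input channel, so pretrained=True cannot load torchvision's 3-channel ImageNet weights as-is. A minimal adaptation sketch (not used in this run, which sets pretrained=False):

state = model_zoo.load_url(model_urls['resnet34'])
state['conv1.weight'] = state['conv1.weight'].mean(dim=1, keepdim=True)  # RGB -> 1 channel
del state['fc.weight'], state['fc.bias']  # the head here is 1991-way, not 1000-way
net = resnet34(num_classes=1991)
net.load_state_dict(state, strict=False)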

In [8]:
DATASET_PATH = '/data/hktxt/AISHELL-2/iOS/data/'
LOG_PATH = '/data/hktxt/Condadev/voxpy/CN/logs/Res34_ori_1s'
EPOCH_NUM = 35

# in the shared code B = 100, but PyTorch throws CUDA out of memory at B = 97,
# even though B = 96 takes only 90.6% of the GPU memory (possibly a bug):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# B = 96
# with deterministic cudnn a larger batch fits:
torch.backends.cudnn.deterministic = True
B = 120

WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter: per-epoch decay factor taking LR_INIT to LR_LAST over EPOCH_NUM epochs
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
#DEVICE = "5,6,7"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = 64
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)
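
A quick check of the schedule: gamma = 10 ** (-2 / 34) ≈ 0.873, so 34 StepLR decays take the learning rate from 1e-2 down to exactly 1e-4:

assert np.isclose(LR_INIT * gamma ** (EPOCH_NUM - 1), LR_LAST)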

In [9]:
#os.environ["CUDA_VISIBLE_DEVICES"] = DEVICE
net = resnet34(pretrained=False, num_classes=1991)
#net.to(DEVICE)
#net = nn.DataParallel(net).cuda()
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)

net.to(DEVICE)

transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)

testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
# batch_size=1: test utterances keep their full, variable length, so they cannot be batched
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)


Let's use 2 GPUs!
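
With two visible GPUs, DataParallel splits each batch of 120 along dim 0 into two chunks of 60, one per GPU.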

In [10]:
trainset[1][1].shape


Out[10]:
torch.Size([1, 512, 298])

In [11]:
trainset[1][0]


Out[11]:
274
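
Labels are integer speaker indices in [0, 1990], matching the 1991-way classifier head.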

In [12]:
train_start = time.time()
for epoch_num in range(EPOCH_NUM):
    # stepping the scheduler at the start of the epoch follows the pre-1.1 PyTorch
    # convention; in PyTorch >= 1.1 it belongs after the epoch's optimizer steps
    lr_scheduler.step()
    
    # train
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()
    
    for iter_num, (labels, specs) in tqdm(enumerate(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        
        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        TBoard.add_scalar('gMetrics/lr', lr_scheduler.get_lr()[0], step_num)
        
#         TBoard.add_scalar('weights/conv1', net.conv1.weight.mean(), step_num)
#         TBoard.add_scalar('weights/conv5', net.conv5.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc6', net.fc6.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc7', net.fc7.weight.mean(), step_num)
#         TBoard.add_scalar('weights/fc8', net.fc8.weight.mean(), step_num)
#         TBoard.add_scalar('grads/conv1', net.conv1.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/conv5', net.conv5.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc6', net.fc6.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc7', net.fc7.weight.grad.mean(), step_num)
#         TBoard.add_scalar('grads/fc8', net.fc8.weight.grad.mean(), step_num)
        
    
    # test
    net.eval()
    
    top5_accuracy = 0
    top1_accuracy = 0

    with torch.no_grad():  # no gradients needed at test time
        for _, (label, spec) in tqdm(enumerate(testsetloader)):
            label, spec = label.to(DEVICE), spec.to(DEVICE)
            probs = net(spec)

            # calculate Top-5 and Top-1 accuracy (the test batch size is 1)
            pred_top5 = probs.topk(5)[1].view(5)

            if label in pred_top5:
                top5_accuracy += 1

                # the top-1 prediction is always among the top 5,
                # so the nested check loses nothing
                if label == pred_top5[0]:
                    top1_accuracy += 1
    
    top5_accuracy /= len(testsetloader)
    top1_accuracy /= len(testsetloader)

    TBoard.add_scalar('gMetrics/test_top5', top5_accuracy, epoch_num)
    TBoard.add_scalar('gMetrics/test_top1', top1_accuracy, epoch_num)
    
    # save a snapshot every epoch (DataParallel prefixes the keys with 'module.')
    torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.pkl'.format(epoch_num+1)))

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))    

# when the training is finished save the model
#torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot.txt'))
TBoard.close()
print('top 1 accuracy @ the end: {}'.format(round(top1_accuracy, 3)))
print('top 5 accuracy @ the end: {}'.format(round(top5_accuracy, 3)))
print('loss @ the end: {}'.format(round(loss.item(), 3)))


Epoch 1/35
6724it [1:59:22,  1.64it/s]
202286it [1:32:16, 36.54it/s]
Epoch 2/35
6724it [1:52:15,  1.67it/s]
202286it [1:30:16, 37.35it/s]
Epoch 3/35
6724it [1:53:54,  1.67it/s]
202286it [1:31:07, 37.00it/s]
Epoch 4/35
6724it [1:50:26,  1.66it/s]
202286it [1:31:04, 37.02it/s]
Epoch 5/35
6724it [1:47:36,  1.67it/s]
202286it [1:30:14, 37.36it/s]
Epoch 6/35
6724it [1:41:33,  1.66it/s]
202286it [1:30:24, 37.29it/s]
Epoch 7/35
6724it [1:46:12,  1.67it/s]
202286it [1:30:30, 37.25it/s]
Epoch 8/35
6724it [1:48:02,  1.66it/s]
202286it [1:29:56, 41.88it/s]
Epoch 9/35
6724it [1:49:19,  1.67it/s]
202286it [1:30:06, 37.42it/s]
Epoch 10/35
6724it [1:49:06,  1.67it/s]
202286it [1:30:11, 37.38it/s]
Epoch 11/35
6724it [1:48:36,  1.67it/s]
202286it [1:29:58, 37.47it/s]
Epoch 12/35
6724it [1:48:51,  1.66it/s]
202286it [1:30:14, 37.36it/s]
Epoch 13/35
6724it [1:49:22,  1.67it/s]
202286it [1:30:16, 37.34it/s]
Epoch 14/35
6724it [1:50:50,  1.67it/s]
202286it [1:29:44, 37.57it/s]
Epoch 15/35
6724it [1:51:11,  1.67it/s]
202286it [1:30:05, 42.59it/s]
Epoch 16/35
6724it [1:52:39,  1.68it/s]
202286it [1:30:15, 37.35it/s]
Epoch 17/35
6724it [1:51:43,  1.66it/s]
202286it [1:30:32, 42.35it/s]
Epoch 18/35
6724it [1:51:38,  1.67it/s]
202286it [1:30:11, 37.38it/s]
Epoch 19/35
6724it [1:50:36,  1.67it/s]
202286it [1:30:14, 37.36it/s]
Epoch 20/35
6724it [1:52:14,  1.66it/s]
202286it [1:30:16, 39.70it/s]
Epoch 21/35
6724it [1:50:08,  1.68it/s]
202286it [1:30:16, 37.35it/s]
Epoch 22/35
6724it [1:33:36,  1.67it/s]
202286it [1:25:35, 39.39it/s]
Epoch 23/35
6724it [1:23:25,  1.63it/s]
202286it [1:29:35, 37.63it/s]
Epoch 24/35
6724it [1:24:21,  1.65it/s]
202286it [1:34:18, 35.75it/s]
Epoch 25/35
6724it [1:24:38,  1.65it/s]
202286it [1:27:10, 38.67it/s]
Epoch 26/35
6724it [1:39:27,  1.66it/s]
202286it [1:28:51, 42.39it/s]
Epoch 27/35
6724it [1:32:21,  1.66it/s]
202286it [1:24:26, 39.92it/s]
Epoch 28/35
6724it [1:25:18,  1.64it/s]
202286it [1:29:14, 38.21it/s]
Epoch 29/35
6724it [1:24:53,  1.67it/s]
202286it [1:25:34, 39.40it/s]
Epoch 30/35
6724it [1:24:48,  1.64it/s]
202286it [1:26:44, 38.87it/s]
Epoch 31/35
6724it [1:26:57,  1.66it/s]
202286it [1:37:06, 34.72it/s]
Epoch 32/35
6724it [2:13:26,  1.66it/s]
202286it [1:57:19, 28.74it/s]
Epoch 33/35
6724it [2:15:27,  1.54it/s]
202286it [1:56:29, 28.94it/s]
Epoch 34/35
6724it [2:21:56,  1.61it/s]
202286it [1:55:24, 29.21it/s]
Epoch 35/35
6724it [2:22:57,  1.60it/s]
202286it [1:55:46, 35.39it/s]
Training complete in 7009m 19s
top 1 accuracy @ the end: 0.998
top 5 accuracy @ the end: 1.0
loss @ the end: 0.138
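
To reuse a snapshot, the DataParallel 'module.' prefix has to be stripped from the keys; a minimal restore sketch (assuming the final epoch-35 snapshot):

net = resnet34(num_classes=1991)
state = torch.load(os.path.join(LOG_PATH, 'model_snapshot_35.pkl'), map_location='cpu')
state = {k.replace('module.', '', 1): v for k, v in state.items()}
net.load_state_dict(state)
net.eval()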

In [ ]: