ResNet-34: utterances shorter than 1 s are dropped; for training, three random 1 s crops are concatenated into a 3 s sample.

import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'  # must be set before torch is imported
import time

import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

import resource

# Raise the soft limit on open file descriptors to 2048.
# (Presumably needed for multi-worker data loading -- confirm.)
# The request is clamped to the hard limit, because asking for more than the
# hard limit makes setrlimit raise ValueError, and an already-higher soft
# limit is never lowered.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
_soft_limit, _hard_limit = rlimit
_target_limit = 2048
if _hard_limit != resource.RLIM_INFINITY:
    _target_limit = min(_target_limit, _hard_limit)
if _soft_limit != resource.RLIM_INFINITY and _soft_limit < _target_limit:
    resource.setrlimit(resource.RLIMIT_NOFILE, (_target_limit, _hard_limit))

# Python equivalent of the shell command `ulimit -n`: show the effective
# soft limit.
print(resource.getrlimit(resource.RLIMIT_NOFILE)[0])


class IdentificationDataset(Dataset):
    """Speaker-identification dataset yielding ``(label, spectrogram)`` pairs.

    The split table is a CSV file with at least the columns ``file`` (wav
    path relative to ``path``), ``label`` (integer class id) and ``phase``.
    Rows whose phase is 1 or 2 form the training set; rows whose phase is 3
    form the test set.

    Args:
        path: root directory that the ``file`` column is relative to.
        train: if True, use the training phases and return three random
            1-second crops concatenated into a 3-second signal; otherwise
            use the test phase and return the whole utterance.
        transform: optional callable applied to the spectrogram.
        split_file: path of the split CSV (defaults to the original
            hard-coded ``'droped_wav.csv'`` in the working directory).
    """

    def __init__(self, path, train, transform=None, split_file='droped_wav.csv'):
        split = pd.read_csv(split_file, sep=',')
        split = split.sample(frac=1)  # shuffle the rows
        phases = [1, 2] if train else [3]
        mask = split['phase'].isin(phases)
        self.split = split
        self.dataset = split['file'][mask].reset_index(drop=True)
        # Labels aligned with ``self.dataset`` so that __getitem__ is a direct
        # positional lookup instead of a scan over the whole table.
        self.labels = split['label'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(label, spec)`` for the ``idx``-th utterance.

        ``spec`` is the two-sided magnitude spectrogram (``nfft`` rows) of
        the pre-processed signal, passed through ``self.transform`` if set.

        Raises:
            ValueError: in training mode, if the utterance is shorter than
                one second.
        """
        track_path = self.dataset[idx]  # e.g. 'wav/C0688/IC0688W0488.wav'
        audio_path = os.path.join(self.path, track_path)

        # NOTE(review): the code below indexes ``samples`` as a 1-D signal,
        # i.e. it assumes mono recordings -- confirm for the data set.
        rate, samples = wavfile.read(audio_path)
        label = int(self.labels[idx])

        # ---- spectrogram parameters ----
        window = 'hamming'
        Tw = 25  # window width, ms
        Ts = 10  # window step, ms
        Nw = int(rate * Tw * 1e-3)  # frame length in samples
        n_overlap = int(rate * (Tw - Ts) * 1e-3)  # overlap between frames
        nfft = 2 ** (Nw - 1).bit_length()  # next power of two >= Nw
        pre_emphasis = 0.97

        # pre-emphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
        # remove the DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither

        if self.train:
            segment_len = 1  # sec
            segment_samples = segment_len * rate
            upper_bound = len(samples) - segment_samples
            if upper_bound < 0:
                raise ValueError(
                    'utterance {!r} is shorter than {} s'.format(track_path, segment_len))
            # ``high`` is exclusive, so +1 keeps the last valid start position
            # and lets a clip of exactly one second through.
            starts = np.random.randint(0, upper_bound + 1, size=3)
            samples = np.concatenate(
                [samples[start:start + segment_samples] for start in starts])

        _, _, spec = signal.spectrogram(samples, rate, window=window, nperseg=Nw,
                                        noverlap=n_overlap, nfft=nfft,
                                        mode='magnitude', return_onesided=False)
        # just multiplying it by 1600 makes spectrograms in the paper and here "the same"
        spec *= rate / 10
        if self.transform:
            spec = self.transform(spec)

        return label, spec

class Normalize(object):
    """Mean-variance normalisation of a single (Freq, Time) spectrogram.

    Every frequency row is shifted to zero mean and scaled to unit standard
    deviation over the time axis. The statistics are computed per
    spectrogram, not per batch, and the number of frequency bins is taken
    from the input rather than fixed at 512.
    """

    def __call__(self, spec):
        # keepdims=True leaves (Freq, 1) columns that broadcast over time.
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        return (spec - mu) / sigma

class ToTensor(object):
    """Turn a 2-D (Freq, Time) spectrogram array into a float32 tensor."""

    def __call__(self, spec):
        n_freq, n_time = spec.shape
        # Cast first (the input was float64), then prepend the channel axis
        # so the result is (1, Freq, Time).
        as_float32 = spec.astype(np.float32)
        with_channel = as_float32.reshape(1, n_freq, n_time)
        return torch.from_numpy(with_channel)

import torch.nn as nn
import torch.utils.model_zoo as model_zoo

# Public names of the ResNet section of this file.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']

# ImageNet checkpoints published for the torchvision ResNet models.
# NOTE(review): the URLs were empty strings in this file and have been
# restored from the torchvision reference implementation -- confirm before
# relying on ``pretrained=True``.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}

def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    layer = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    return layer

def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 (pointwise) convolution."""
    layer = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return layer

class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    ``downsample``, when given, is applied to the input before it is added
    back to the convolutional branch.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # parameter names and initialisation order are unchanged.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # convolutional branch
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))

        # shortcut branch: the input itself unless a projection was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        branch += shortcut
        return self.relu(branch)

class Bottleneck(nn.Module):
    """Residual block: 1x1 reduce, 3x3, then 1x1 expand by ``expansion``.

    ``downsample``, when given, is applied to the input before it is added
    back to the convolutional branch.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        out_planes = planes * self.expansion
        # Sub-modules are registered in the same order as before so that
        # parameter names and initialisation order are unchanged.
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, out_planes)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # convolutional branch
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.relu(self.bn2(self.conv2(branch)))
        branch = self.bn3(self.conv3(branch))

        # shortcut branch: the input itself unless a projection was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        branch += shortcut
        return self.relu(branch)

class ResNet(nn.Module):
    """ResNet backbone whose stem takes single-channel input.

    Args:
        block: residual block class (``BasicBlock`` or ``Bottleneck``); it
            must expose an ``expansion`` class attribute.
        layers: four integers, the number of blocks in each of the four
            stages.
        num_classes: output size of the final linear layer.
        zero_init_residual: if True, zero the weight of the last BatchNorm
            in every residual branch so each block starts as an identity.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super(ResNet, self).__init__()
        self.inplanes = 64
        # 1 input channel (not 3 as in the ImageNet models).
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # Adaptive pooling collapses whatever spatial size is left to 1x1.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block applies ``stride``. It gets a 1x1 projection
        shortcut whenever the stride or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of inputs to ``(batch, num_classes)`` scores."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, features)
        x = self.fc(x)

        return x

def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        # NOTE(review): the ImageNet checkpoints were trained on 3-channel
        # images; confirm they are compatible with this model before use.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model

def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        # NOTE(review): the ImageNet checkpoints were trained on 3-channel
        # images; confirm they are compatible with this model before use.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model

def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        # NOTE(review): the ImageNet checkpoints were trained on 3-channel
        # images; confirm they are compatible with this model before use.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        # NOTE(review): the ImageNet checkpoints were trained on 3-channel
        # images; confirm they are compatible with this model before use.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model

def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``ResNet`` constructor.
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        # NOTE(review): the ImageNet checkpoints were trained on 3-channel
        # images; confirm they are compatible with this model before use.
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model

# ---- experiment configuration ----
DATASET_PATH = '/data/hktxt/AISHELL-2/iOS/data/'
LOG_PATH = '/data/hktxt/Condadev/voxpy/CN/logs/Res34_ori_1s'

# Author's note: the shared reference code uses B = 100, but PyTorch ran out
# of CUDA memory at B = 97 although B = 96 took only 90.6% of the GPU memory
# (bug?). With cudnn.deterministic enabled, B = 100 fits.
torch.backends.cudnn.deterministic = True
B = 120  # batch size

# The four constants below are used further down but were not defined
# anywhere in this file.
EPOCH_NUM = 35  # matches the "Epoch 1/35" progress output recorded for this run
NUM_WORKERS = 4  # TODO(review): original value unknown; placeholder -- confirm
MOMENTUM = 0.9  # TODO(review): original value unknown; placeholder -- confirm
WEIGHT_DECAY = 5e-4  # TODO(review): original value unknown; placeholder -- confirm

LR_INIT = 1e-2
LR_LAST = 1e-4
# Per-epoch decay factor chosen so that the learning rate goes from LR_INIT
# to LR_LAST in EPOCH_NUM - 1 equal multiplicative steps.
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

# ---- model ----
net = resnet34(pretrained=False, num_classes=1991)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)
net.to(DEVICE)

# ---- data ----
# Normalise each spectrogram, then convert it to a (1, Freq, Time) tensor.
transforms = Compose([
    Normalize(),
    ToTensor(),
])

trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)

# Test utterances are not cropped to a common length, hence batch_size=1.
testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS)

# ---- optimisation ----
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

Let's use 2 GPUs!

torch.Size([1, 512, 298])

# Train for EPOCH_NUM epochs. After every epoch, evaluate top-1 / top-5
# accuracy on the test set, log to TensorBoard and save a checkpoint.
train_start = time.time()
for epoch_num in range(EPOCH_NUM):
    # ---- train ----
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()
    for iter_num, (labels, specs) in tqdm(enumerate(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()

        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        # Read the rate from the optimizer: this is the value actually in
        # use, whatever the scheduler API version.
        TBoard.add_scalar('gMetrics/lr', optimizer.param_groups[0]['lr'], step_num)

    # Decay once per epoch, after the optimizer updates, so that the first
    # epoch runs at LR_INIT.
    lr_scheduler.step()

    # ---- test ----
    net.eval()
    top5_accuracy = 0
    top1_accuracy = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for _, (label, spec) in tqdm(enumerate(testsetloader)):
            label, spec = label.to(DEVICE), spec.to(DEVICE)
            probs = net(spec)

            # calculate Top-5 and Top-1 accuracy
            # (.view(5) relies on the test loader's batch size of 1)
            pred_top5 = probs.topk(5)[1].view(5)

            if label in pred_top5:
                # increment top-5 accuracy
                top5_accuracy += 1

                if label == pred_top5[0]:
                    # increment top-1 accuracy
                    top1_accuracy += 1
    top5_accuracy /= len(testsetloader)
    top1_accuracy /= len(testsetloader)

    TBoard.add_scalar('gMetrics/test_top5', top5_accuracy, epoch_num)
    TBoard.add_scalar('gMetrics/test_top1', top1_accuracy, epoch_num)
    # save model every epoch
    torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.pkl'.format(epoch_num+1)))

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))

# when the training is finished save the model
torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot.txt'))
print('top 1 accuracy @ the end: {}'.format(round(top1_accuracy, 3)))
print('top 5 accuracy @ the end: {}'.format(round(top5_accuracy, 3)))
print('loss @ the end: {}'.format(round(loss.item(), 3)))

Epoch 1/35
6724it [1:59:22,  1.64it/s]
202286it [1:32:16, 36.54it/s]
Epoch 2/35
6724it [1:52:15,  1.67it/s]
202286it [1:30:16, 37.35it/s]
Epoch 3/35
6724it [1:53:54,  1.67it/s]
202286it [1:31:07, 37.00it/s]
Epoch 4/35
6724it [1:50:26,  1.66it/s]
202286it [1:31:04, 37.02it/s]
Epoch 5/35
6724it [1:47:36,  1.67it/s]
202286it [1:30:14, 37.36it/s]
Epoch 6/35
6724it [1:41:33,  1.66it/s]
202286it [1:30:24, 37.29it/s]
Epoch 7/35
6724it [1:46:12,  1.67it/s]
202286it [1:30:30, 37.25it/s]
Epoch 8/35
6724it [1:48:02,  1.66it/s]
202286it [1:29:56, 41.88it/s]
Epoch 9/35
6724it [1:49:19,  1.67it/s]
202286it [1:30:06, 37.42it/s]
Epoch 10/35
6724it [1:49:06,  1.67it/s]
202286it [1:30:11, 37.38it/s]
Epoch 11/35
6724it [1:48:36,  1.67it/s]
202286it [1:29:58, 37.47it/s]
Epoch 12/35
6724it [1:48:51,  1.66it/s]
202286it [1:30:14, 37.36it/s]
Epoch 13/35
6724it [1:49:22,  1.67it/s]
202286it [1:30:16, 37.34it/s]
Epoch 14/35
6724it [1:50:50,  1.67it/s]
202286it [1:29:44, 37.57it/s]
Epoch 15/35
6724it [1:51:11,  1.67it/s]
202286it [1:30:05, 42.59it/s]
Epoch 16/35
6724it [1:52:39,  1.68it/s]
202286it [1:30:15, 37.35it/s]
Epoch 17/35
6724it [1:51:43,  1.66it/s]
202286it [1:30:32, 42.35it/s]
Epoch 18/35
6724it [1:51:38,  1.67it/s]
202286it [1:30:11, 37.38it/s]
Epoch 19/35
6724it [1:50:36,  1.67it/s]
202286it [1:30:14, 37.36it/s]
Epoch 20/35
6724it [1:52:14,  1.66it/s]
202286it [1:30:16, 39.70it/s]
Epoch 21/35
6724it [1:50:08,  1.68it/s]
202286it [1:30:16, 37.35it/s]
Epoch 22/35
6724it [1:33:36,  1.67it/s]
202286it [1:25:35, 39.39it/s]
Epoch 23/35
6724it [1:23:25,  1.63it/s]
202286it [1:29:35, 37.63it/s]
Epoch 24/35
6724it [1:24:21,  1.65it/s]
202286it [1:34:18, 35.75it/s]
Epoch 25/35
6724it [1:24:38,  1.65it/s]
202286it [1:27:10, 38.67it/s]
Epoch 26/35
6724it [1:39:27,  1.66it/s]
202286it [1:28:51, 42.39it/s]
Epoch 27/35
6724it [1:32:21,  1.66it/s]
202286it [1:24:26, 39.92it/s]
Epoch 28/35
6724it [1:25:18,  1.64it/s]
202286it [1:29:14, 38.21it/s]
Epoch 29/35
6724it [1:24:53,  1.67it/s]
202286it [1:25:34, 39.40it/s]
Epoch 30/35
6724it [1:24:48,  1.64it/s]
202286it [1:26:44, 38.87it/s]
Epoch 31/35
6724it [1:26:57,  1.66it/s]
202286it [1:37:06, 34.72it/s]
Epoch 32/35
6724it [2:13:26,  1.66it/s]
202286it [1:57:19, 28.74it/s]
Epoch 33/35
6724it [2:15:27,  1.54it/s]
202286it [1:56:29, 28.94it/s]
Epoch 34/35
6724it [2:21:56,  1.61it/s]
202286it [1:55:24, 29.21it/s]
Epoch 35/35
6724it [2:22:57,  1.60it/s]
202286it [1:55:46, 35.39it/s]
Training complete in 7009m 19s
top 1 accuracy @ the end: 0.998
top 5 accuracy @ the end: 1.0
loss @ the end: 0.138

