In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose
import tensorboardX
from tqdm import tqdm
import matplotlib.pyplot as plt
In [2]:
import warnings
# Added to hide the FutureWarnings scipy emits from its FFT code (known bug).
# NOTE(review): called without a category, so this hides *all* warnings raised
# after this cell, not just scipy's.
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)
In [3]:
import resource
# Raise the per-process open-file soft limit to at least 2048 descriptors.
# The limit is only ever raised: a soft limit that is already higher (or
# unlimited) is left untouched, and the request is capped at the hard limit so
# setrlimit() cannot fail with ValueError on hosts whose hard limit is < 2048.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)  # (soft, hard)
_soft_nofile, _hard_nofile = rlimit
_wanted_nofile = 2048
if _hard_nofile != resource.RLIM_INFINITY:
    _wanted_nofile = min(_wanted_nofile, _hard_nofile)
if _soft_nofile != resource.RLIM_INFINITY and _soft_nofile < _wanted_nofile:
    resource.setrlimit(resource.RLIMIT_NOFILE, (_wanted_nofile, _hard_nofile))
In [4]:
# Print the open-file limit as reported by a subshell (IPython shell escape).
!ulimit -n
In [5]:
class IdentificationDataset(Dataset):
    """Speaker-identification dataset yielding ``(label, spectrogram)`` pairs.

    The split table is a CSV with the columns ``file`` (wav path relative to
    ``path``), ``label`` (integer class id) and ``phase``. Rows with phase 1 or
    2 form the training set; rows with phase 3 form the test set.

    Args:
        path: Root directory that the ``file`` column is relative to.
        train: If True use phases 1 and 2 and return three random 1-second
            crops concatenated into a 3-second signal; otherwise use phase 3
            and return the spectrogram of the whole utterance.
        transform: Optional callable applied to the spectrogram.
        split_file: Path of the split CSV. Defaults to ``'droped_wav.csv'``
            in the current working directory.
    """

    def __init__(self, path, train, transform=None, split_file='droped_wav.csv'):
        split = pd.read_csv(split_file, sep=',')
        split = split.sample(frac=1)  # shuffle the rows

        phases = [1, 2] if train else [3]
        mask = split['phase'].isin(phases)

        self.split = split
        self.dataset = split['file'][mask].reset_index(drop=True)
        # Labels aligned row-by-row with self.dataset, so __getitem__ does not
        # have to scan the whole table for every sample.
        self.labels = split['label'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        self.transform = transform

    def __len__(self):
        """Number of utterances in the selected phase(s)."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Load utterance ``idx`` and return ``(label, spectrogram)``.

        The spectrogram is a two-sided magnitude spectrogram of shape
        (nfft, frames) unless ``transform`` changes it.
        """
        track_path = self.dataset[idx]  # e.g. 'wav/C0688/IC0688W0488.wav'
        audio_path = os.path.join(self.path, track_path)

        rate, samples = wavfile.read(audio_path)
        label = int(self.labels[idx])

        # --- analysis parameters ---
        window = 'hamming'
        frame_ms = 25  # window width
        step_ms = 10   # hop between consecutive windows
        # window length in samples
        nperseg = int(rate * frame_ms * 1e-3)
        # overlap between consecutive windows in samples (window - hop)
        noverlap = int(rate * (frame_ms - step_ms) * 1e-3)
        # FFT size: next power of two >= nperseg
        nfft = 2 ** (nperseg - 1).bit_length()
        pre_emphasis = 0.97

        # pre-emphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

        # remove the DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither

        if self.train:
            # Take three random 1-second crops and concatenate them.
            segment_len = 1  # sec
            segment_samples = segment_len * rate
            if len(samples) < segment_samples:
                # Clip shorter than one segment: repeat it cyclically so the
                # crop below is always possible instead of crashing.
                samples = np.resize(samples, segment_samples)
            # max(..., 1) keeps randint valid when the clip is exactly one
            # segment long; for longer clips the bound is unchanged.
            upper_bound = max(len(samples) - segment_samples, 1)
            start = np.random.randint(0, upper_bound, size=3)
            end = start + segment_samples
            samples = np.concatenate([samples[s:e] for s, e in zip(start, end)])

        # spectrogram
        _, _, spec = signal.spectrogram(samples, rate, window, nperseg, noverlap, nfft,
                                        mode='magnitude', return_onesided=False)

        # rate / 10 (1600 at 16 kHz): empirical scale that makes these
        # spectrograms match the ones in the reference paper
        spec *= rate / 10

        if self.transform:
            spec = self.transform(spec)

        return label, spec
In [6]:
class Normalize(object):
    """Mean-variance normalization of a voice spectrogram.

    Each frequency bin (row) is normalized independently over time, per
    spectrogram (not batch-wise). Works for any number of frequency bins.
    """

    def __call__(self, spec):
        """Return ``(spec - mean) / std`` computed along the time axis.

        Args:
            spec: 2-D array laid out as (Freq, Time).
        """
        # keepdims keeps the statistics as (Freq, 1) columns so they broadcast
        # over time without hard-coding the number of frequency bins.
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        return (spec - mu) / sigma
class ToTensor(object):
    """Turn a 2-D spectrogram array into a float32 tensor with a channel axis."""

    def __call__(self, spec):
        """Return a (1, Freq, Time) float32 ``torch.Tensor`` built from ``spec``."""
        n_freq, n_time = spec.shape
        # prepend the channel dimension, then cast (the input is float64)
        with_channel = spec.reshape(1, n_freq, n_time)
        as_float32 = with_channel.astype(np.float32)
        return torch.from_numpy(as_float32)
In [7]:
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
# Names exported by `from <module> import *` if this cell is ever moved into a module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# Checkpoint URLs looked up by the resnetXX(pretrained=True) factories below.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 (pointwise) convolution."""
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
class BasicBlock(nn.Module):
    """Two-layer residual block (3x3 conv -> BN -> ReLU -> 3x3 conv -> BN).

    The block input is added back to the result before the final ReLU. When
    ``downsample`` is given, it is applied to the input first.
    """

    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # only the first convolution carries the stride
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(residual_branch(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Three-layer residual block (1x1 -> 3x3 -> 1x1 convolutions, each with BN).

    The last 1x1 convolution widens the output to ``planes * expansion``
    channels. The block input is added back before the final ReLU. When
    ``downsample`` is given, it is applied to the input first.
    """

    expansion = 4  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        out_planes = planes * self.expansion
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        # the stride sits on the 3x3 convolution
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, out_planes)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(residual_branch(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet classifier whose stem accepts ``in_channels`` input planes.

    Args:
        block: Residual block class (``BasicBlock`` or ``Bottleneck``); must
            expose an ``expansion`` class attribute.
        layers: Sequence of four ints, the number of blocks in each stage.
        num_classes: Size of the final linear layer's output.
        zero_init_residual: If True, zero the weight of the last BatchNorm in
            every residual branch so each block starts as an identity mapping.
        in_channels: Number of channels of the input. Defaults to 1, the value
            that used to be hard-coded (single-channel spectrograms).
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 in_channels=1):
        super(ResNet, self).__init__()
        self.inplanes = 64  # running input width, updated by _make_layer
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # adaptive pooling makes the head independent of the input's spatial size
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage made of ``blocks`` residual blocks.

        Only the first block receives ``stride``. It also gets a 1x1-conv + BN
        projection on its shortcut whenever the stride or the channel count
        changes between the stage's input and output.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a (N, in_channels, H, W) batch to (N, num_classes) scores."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # flatten (N, C, 1, 1) -> (N, C)
        x = self.fc(x)

        return x
def _load_pretrained(model, arch):
    """Initialise ``model`` from the ImageNet checkpoint registered for ``arch``.

    The checkpoints were trained on 3-channel images with 1000 classes, so a
    plain ``load_state_dict`` fails with a size mismatch for this notebook's
    single-channel stem (and for any other ``num_classes``). Instead:

    * tensors whose shape matches are copied as-is;
    * ``conv1.weight`` is summed over its input channels when the model has a
      single-channel stem;
    * every other incompatible tensor (e.g. ``fc`` for a different number of
      classes) keeps the model's fresh initialisation.
    """
    pretrained_state = model_zoo.load_url(model_urls[arch])
    model_state = model.state_dict()
    for name, weight in pretrained_state.items():
        if name not in model_state:
            continue
        target = model_state[name]
        if weight.shape == target.shape:
            model_state[name] = weight
        elif (name == 'conv1.weight'
              and target.shape[1] == 1
              and weight.shape[0] == target.shape[0]
              and weight.shape[2:] == target.shape[2:]):
            # collapse the RGB filters into one input plane
            model_state[name] = weight.sum(dim=1, keepdim=True)
    # model_state still holds every key of the model, so a strict load is safe
    model.load_state_dict(model_state)


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, initialise the compatible layers from the
            ImageNet checkpoint (needs network access on first use).
        **kwargs: Forwarded to ``ResNet``.
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        _load_pretrained(model, 'resnet18')
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, initialise the compatible layers from the
            ImageNet checkpoint (needs network access on first use).
        **kwargs: Forwarded to ``ResNet``.
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        _load_pretrained(model, 'resnet34')
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, initialise the compatible layers from the
            ImageNet checkpoint (needs network access on first use).
        **kwargs: Forwarded to ``ResNet``.
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        _load_pretrained(model, 'resnet50')
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, initialise the compatible layers from the
            ImageNet checkpoint (needs network access on first use).
        **kwargs: Forwarded to ``ResNet``.
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        _load_pretrained(model, 'resnet101')
    return model


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, initialise the compatible layers from the
            ImageNet checkpoint (needs network access on first use).
        **kwargs: Forwarded to ``ResNet``.
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        _load_pretrained(model, 'resnet152')
    return model
In [8]:
# Root directory that the wav paths in the split table are relative to.
DATASET_PATH = '/data/hktxt/AISHELL-2/iOS/data/'
# TensorBoard event files and per-epoch model snapshots are written here.
LOG_PATH = '/data/hktxt/Condadev/voxpy/CN/logs/Res34_ori_1s'
EPOCH_NUM = 35
# Batch-size history (kept from the original author):
# the shared reference code used B = 100, but PyTorch threw CUDA out of memory
# at B = 97 even though B = 96 took only 90.6% of the GPU memory (bug?):
# https://discuss.pytorch.org/t/lesser-memory-consumption-with-a-larger-batch-in-multi-gpu-setup/29087
# With cuDNN in deterministic mode (next line) B = 100 did fit.
torch.backends.cudnn.deterministic = True
# Batch size actually used by this notebook.
B = 120
WEIGHT_DECAY = 5e-4
LR_INIT = 1e-2
LR_LAST = 1e-4
# Per-epoch multiplicative LR decay: gamma ** (EPOCH_NUM - 1) == LR_LAST / LR_INIT,
# i.e. a log-linear decay from LR_INIT to LR_LAST over EPOCH_NUM - 1 scheduler steps.
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
# First visible CUDA device if any, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Number of DataLoader worker processes.
NUM_WORKERS = 64
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)
In [9]:
# --- model ---
net = resnet34(pretrained=False, num_classes=1991)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # DataParallel splits each batch along dim 0 across the visible GPUs,
    # e.g. [30, ...] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)
net.to(DEVICE)

# --- data ---
# per-spectrogram normalization, then conversion to a float32 tensor
transforms = Compose([Normalize(), ToTensor()])

trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=B,
    num_workers=NUM_WORKERS,
    shuffle=True,
)

# test utterances are not cropped, so they are fed one at a time
testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = torch.utils.data.DataLoader(
    testset,
    batch_size=1,
    num_workers=NUM_WORKERS,
)

# --- optimisation ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=LR_INIT,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)
# multiply the learning rate by `gamma` after every scheduler step
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
In [10]:
# Sanity check: shape of the transformed spectrogram of training sample 1.
trainset[1][1].shape
Out[10]:
In [11]:
# Sanity check: integer label of training sample 1.
trainset[1][0]
Out[11]:
In [12]:
# Train for EPOCH_NUM epochs. After every epoch: evaluate top-1 / top-5 accuracy
# on the test set, log it to TensorBoard and save a model snapshot.
train_start = time.time()

for epoch_num in range(EPOCH_NUM):
    # NOTE(review): stepping the scheduler at the *start* of the epoch matches
    # older PyTorch releases; on PyTorch >= 1.1 the documented order is
    # optimizer.step() before lr_scheduler.step(), and this placement skips the
    # initial learning rate. Left unchanged; confirm against the installed version.
    lr_scheduler.step()

    # ---- train ----
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()

    # wrap the loader (not enumerate) so tqdm knows the total and shows progress
    for iter_num, (labels, specs) in enumerate(tqdm(trainsetloader)):
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        scores = net(specs)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()

        # TBoard
        step_num = epoch_num * len(trainsetloader) + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        # read the LR actually used by the optimizer; this is correct on every
        # PyTorch version, unlike calling lr_scheduler.get_lr() outside step()
        TBoard.add_scalar('gMetrics/lr', optimizer.param_groups[0]['lr'], step_num)

    # ---- test ----
    net.eval()

    top5_accuracy = 0
    top1_accuracy = 0

    # no gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time
    with torch.no_grad():
        for label, spec in tqdm(testsetloader):
            label, spec = label.to(DEVICE), spec.to(DEVICE)
            probs = net(spec)  # raw class scores for the single test utterance

            # calculate Top-5 and Top-1 accuracy
            pred_top5 = probs.topk(5)[1].view(5)

            if label in pred_top5:
                # increment top-5 accuracy
                top5_accuracy += 1

                if label == pred_top5[0]:
                    # increment top-1 accuracy
                    top1_accuracy += 1

    # the test loader uses batch_size=1, so its length is the number of utterances
    top5_accuracy /= len(testsetloader)
    top1_accuracy /= len(testsetloader)

    TBoard.add_scalar('gMetrics/test_top5', top5_accuracy, epoch_num)
    TBoard.add_scalar('gMetrics/test_top1', top1_accuracy, epoch_num)

    # save model every epoch
    # (when `net` is wrapped in nn.DataParallel the keys carry a 'module.' prefix)
    torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.pkl'.format(epoch_num+1)))

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))

TBoard.close()
print('top 1 accuracy @ the end: {}'.format(round(top1_accuracy, 3)))
print('top 5 accuracy @ the end: {}'.format(round(top5_accuracy, 3)))
print('loss @ the end: {}'.format(round(loss.item(), 3)))
In [ ]: