In [1]:
import numpy as np
import librosa
In [3]:
# load audio
audio_manifest = librosa.util.find_files("pcsnpny-20150204-mkj")
sig1, sr1 = librosa.load(audio_manifest[0], sr=None)  # sr=None keeps the native sampling rate
sig2, sr2 = librosa.load(audio_manifest[1], sr=None)
print(sig1.shape, sr1, sig2.shape, sr2)
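If the two files happened to be recorded at different native sampling rates, the spectrogram settings below would not mean the same thing for both signals. A minimal sketch (only needed when the rates actually differ) that brings them to a common rate with librosa.resample:

# Resample to a common rate so n_fft / hop_length cover the same amount of
# time in both signals (skip this when sr1 == sr2).
if sr1 != sr2:
    target_sr = min(sr1, sr2)
    sig1 = librosa.resample(sig1, orig_sr=sr1, target_sr=target_sr)
    sig2 = librosa.resample(sig2, orig_sr=sr2, target_sr=target_sr)
    sr1 = sr2 = target_sr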
In [8]:
win_len = 1024
hop_len = win_len // 2
gram1 = librosa.feature.melspectrogram(y=sig1, sr=sr1, n_fft=win_len, hop_length=hop_len)
gram2 = librosa.feature.melspectrogram(y=sig2, sr=sr2, n_fft=win_len, hop_length=hop_len)
gram1 = librosa.power_to_db(gram1, ref=np.max)
gram2 = librosa.power_to_db(gram2, ref=np.max)
# shift each spectrogram so its minimum value is 0 (power_to_db with ref=np.max leaves values in [-80, 0])
gram1 -= gram1.min()
gram2 -= gram2.min()
print(gram1.shape, gram2.shape, gram1.min(), gram2.min())
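As a quick sanity check on the shapes above: with librosa's default center=True padding, each spectrogram should have 1 + len(signal) // hop_length frames. A small sketch, assuming those defaults:

# Expected frame count for a centered STFT (librosa's default): the signal
# is padded by n_fft // 2 on each side, giving 1 + n_samples // hop_length frames.
for sig, gram in [(sig1, gram1), (sig2, gram2)]:
    assert gram.shape[1] == 1 + len(sig) // hop_len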
In [56]:
import torch
import torch.nn as nn
from torch.autograd import Variable
BATCH_SIZE = 2
MAX_LENGTH = max([g.shape[1] for g in [gram1, gram2]])
HIDDEN_SIZE = 20
N_LAYERS = 3
def pad2d(t, length):
    """Zero-pad a (features, frames) tensor along the time axis to `length` frames."""
    if t.size(1) == length:
        return t
    return torch.cat((t, t.new(t.size(0), length - t.size(1)).zero_()), 1)
seq_lens = [g.shape[1] for g in [gram1, gram2]]
batch_in = [pad2d(torch.Tensor(g), MAX_LENGTH) for g in [gram1, gram2]]
batch_in = torch.stack(batch_in).transpose(1, 2)  # (batch, max_frames, n_mels)
in_size = batch_in.size(2)  # number of mel bands fed to the RNN at each timestep
print(seq_lens, batch_in.size(), in_size)
batch_in = Variable(batch_in)  # no-op on PyTorch >= 0.4, kept for older versions
# pack_padded_sequence expects the batch sorted by decreasing length
# (or enforce_sorted=False on recent PyTorch versions)
pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, seq_lens, batch_first=True)
print("pack size:", pack.data.size())
rnn = nn.GRU(in_size, HIDDEN_SIZE, N_LAYERS, batch_first=True)  # in_size = 128 mel bands
h0 = Variable(torch.randn(N_LAYERS, BATCH_SIZE, HIDDEN_SIZE))
print("h0:", h0.size())
# forward
out, hidden_new = rnn(pack, h0)
print("out size:", out.data.size(), hidden_new.size())
unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
print(unpacked.size(), unpacked_len)
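One reason to pack the batch is so the zero-padded tail of the shorter sequence does not leak into its final state. A minimal sketch (assuming a recent PyTorch where Tensors and Variables are merged) that reads each sequence's output at its last valid frame from the unpacked result:

# Gather the output at the last valid timestep of every sequence,
# ignoring the padded frames at the end of the shorter one.
lengths = torch.as_tensor(unpacked_len)                        # (BATCH_SIZE,)
idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, unpacked.size(2))
last_out = unpacked.gather(1, idx).squeeze(1)                  # (BATCH_SIZE, HIDDEN_SIZE)
print(last_out.size())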
In [55]:
from torch.utils.data import TensorDataset, DataLoader
DataLoader(TensorDataset(batch_in.data, torch.LongTensor(seq_lens)), batch_size=BATCH_SIZE)
Out[55]:
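For a real dataset with many variable-length spectrograms, the padding above can live in a collate_fn so the DataLoader pads each batch only to that batch's longest sequence. A minimal sketch (pad_collate is a hypothetical helper; it reuses pad2d from above):

# Per-batch padding: each dataset item is a (n_mels, n_frames) array.
def pad_collate(batch):
    specs = [torch.Tensor(g) for g in batch]
    lengths = torch.LongTensor([s.size(1) for s in specs])
    max_len = int(lengths.max())
    padded = torch.stack([pad2d(s, max_len) for s in specs]).transpose(1, 2)
    return padded, lengths

loader = DataLoader([gram1, gram2], batch_size=BATCH_SIZE, collate_fn=pad_collate)
padded_batch, batch_lengths = next(iter(loader))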