In [1]:
import numpy as np
import librosa

In [3]:
# load two clips at their native sample rate (sr=None keeps the file's rate)
audio_manifest = librosa.util.find_files("pcsnpny-20150204-mkj")
sig1, sr1 = librosa.core.load(audio_manifest[0], sr=None)
sig2, sr2 = librosa.core.load(audio_manifest[1], sr=None)
print(sig1.shape, sr1, sig2.shape, sr2)


(78000,) 16000 (74000,) 16000
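Both clips happen to share a 16 kHz native rate, so no resampling is needed. If the files disagreed, the rate would have to be unified before batching; a minimal sketch, not run here (TARGET_SR is an assumed name):

# sketch: force a common sample rate before feature extraction
TARGET_SR = 16000
if sr1 != TARGET_SR:
    sig1 = librosa.resample(sig1, sr1, TARGET_SR)
    sr1 = TARGET_SR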

In [8]:
win_len = 1024          # STFT window length in samples (64 ms at 16 kHz)
hop_len = win_len // 2  # 50% frame overlap
gram1 = librosa.feature.melspectrogram(y=sig1, sr=sr1, n_fft=win_len, hop_length=hop_len)
gram2 = librosa.feature.melspectrogram(y=sig2, sr=sr2, n_fft=win_len, hop_length=hop_len)

# convert power to dB, referenced to each clip's maximum (values become <= 0)
gram1 = librosa.power_to_db(gram1, ref=np.max)
gram2 = librosa.power_to_db(gram2, ref=np.max)

# shift so the minimum is exactly 0
gram1 -= gram1.min()
gram2 -= gram2.min()

print(gram1.shape, gram2.shape, gram1.min(), gram2.min())


(128, 153) (128, 145) 0.0 0.0
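The shift above only pins each clip's minimum to 0; the maxima still differ per clip. If the model preferred a fixed input range, a per-clip scaling would do it; a sketch, not used downstream:

# optional: scale each spectrogram into [0, 1]
gram1_norm = gram1 / gram1.max()
gram2_norm = gram2 / gram2.max()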

In [56]:
import torch
import torch.nn as nn
from torch.autograd import Variable

BATCH_SIZE = 2
MAX_LENGTH = max([g.shape[1] for g in [gram1, gram2]])
HIDDEN_SIZE = 20
N_LAYERS = 3

def pad2d(t, length):
    """Right-pad a (mels, frames) tensor with zeros along the time axis."""
    if t.size(1) == length:
        return t
    return torch.cat((t, t.new(t.size(0), length - t.size(1)).zero_()), 1)

seq_lens = [g.shape[1] for g in [gram1, gram2]]  # frames per clip, already descending
batch_in = [pad2d(torch.Tensor(g), MAX_LENGTH) for g in [gram1, gram2]]
batch_in = torch.stack(batch_in).transpose(1, 2)  # (batch, time, mels)
in_size = batch_in[0].size()                      # per-example (time, mels) shape
print(seq_lens, batch_in.size(), in_size)
batch_in = Variable(batch_in)

# pack the padded batch; this PyTorch version requires lengths sorted descending
pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, seq_lens, batch_first=True)
print("pack size:", pack.data.size())

rnn = nn.GRU(128, HIDDEN_SIZE, N_LAYERS, batch_first=True)  # input size = 128 mel bands
h0 = Variable(torch.randn(N_LAYERS, BATCH_SIZE, HIDDEN_SIZE))
print("h0:", h0.size())
# forward
out, hidden_new = rnn(pack, h0)

print("out size:", out.data.size(), hidden_new.size())

unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

print(unpacked.size(), unpacked_len)


[153, 145] torch.Size([2, 153, 128]) torch.Size([153, 128])
pack size: torch.Size([298, 128])
h0: torch.Size([3, 2, 20])
out size: torch.Size([298, 20]) torch.Size([3, 2, 20])
torch.Size([2, 153, 20]) [153, 145]
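The packed size checks out: 298 = 153 + 145, i.e. packing stores only the valid frames of both sequences, and pad_packed_sequence restores the zero-padded (batch, time, hidden) layout. Because frames past each clip's true length come back as zeros, the last meaningful output per sequence has to be indexed by length, not by -1. A minimal sketch against the same tensors:

# sketch: last valid GRU output per sequence, indexed by true length
last_out = torch.stack([unpacked[i][l - 1] for i, l in enumerate(unpacked_len)])
print(last_out.size())  # (BATCH_SIZE, HIDDEN_SIZE)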

In [55]:
from torch.utils.data import TensorDataset, DataLoader
TensorDataset  # inspect the class; Out shows its qualified name


Out[55]:
torch.utils.data.dataset.TensorDataset
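A plausible next step (not executed above) is to pair the padded batch with its lengths so DataLoader can serve minibatches; a sketch, assuming the two-tensor TensorDataset API of this PyTorch version:

# sketch: wrap padded spectrograms and lengths for minibatching
dataset = TensorDataset(batch_in.data, torch.LongTensor(seq_lens))
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
for grams, lens in loader:
    print(grams.size(), lens)

shuffle is left off because pack_padded_sequence here expects lengths sorted descending within each batch; a custom collate_fn would normally handle that sort.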