In [ ]:
import cPickle
import numpy as np
import scipy

In [ ]:
SIZE = 108 - 21 + 1
def padzeros(lst, front=True, return_mask=False):
    # TODO add docs for ``front``
    """Given a list of arrays, pad every array with up front  zeros until they
    reach unit length.
    Each element of `lst` can have a different first dimension, but has to be
    equal on the other dimensions.
    """
    n_items = len(lst)
    # Get the longest item.
    maxlength = max(len(i) for i in lst)
    restshape = list(lst[0].shape)[1:]
    item_shape = [maxlength] + restshape
    total_shape = [n_items] + item_shape

    data = scipy.zeros(total_shape, dtype=lst[0].dtype)
    if return_mask:
        mask = scipy.zeros(total_shape, dtype=lst[0].dtype)
    for i in range(n_items):
        # Iterate over indices because we work in place of the list.
        thislength = lst[i].shape[0]
        if front:
            data[i][-thislength:] = lst[i]
            if return_mask:
                mask[i][-thislength:] = 1
        else:
            data[i][:thislength] = lst[i]
            if return_mask:
                mask[i][:thislength] = 1

    if return_mask:
        return data, scipy.asarray(mask)
    return data

def interleave(lst):
    """Given a list of arrays, interleave the arrays in a way that the
    first dimension represents the first dimension of every array.
    This is useful for time series, where multiple time series should be
    processed in a single swipe."""
    arr = scipy.asarray(lst)
    return scipy.swapaxes(arr, 0, 1)

def split(X, maxlength):
    """Return a list of sequences where each sequence has a length of at most
    `maxlength`.
    Given a list of sequences `X`, the sequences are split accordingly."""
    new_X = []
    for seq in X:
        n_new_seqs, rest = divmod(seq.shape[0], maxlength)
        if rest:
            n_new_seqs += 1
        for i in range(n_new_seqs):
            new_X.append(seq[i * maxlength:(i + 1) * maxlength])
    return new_X

def masked(idxs):
    x = np.zeros(SIZE)
    x[(np.array(idxs) - 21).tolist()] = 1
    return x


def rolls_from_sequences(seqs):
    x = []
    for seq in seqs:
        x.append([])
        for item in seq:
            x[-1].append(masked(item))
        x[-1] = np.array(x[-1])
    return x


def load_data(handle):
    with open('%s.pickle' % handle) as fp:
        data = cPickle.load(fp)

    train, valid, test = data['train'], data['valid'], data['test']

    x = rolls_from_sequences(data['train'])
    vx = rolls_from_sequences(data['valid'])
    tx = rolls_from_sequences(data['test'])

    print 'size of x', sum([i.size for i in x])
    print 'size of vx', sum([i.size for i in vx])

    x = split(x, 100)
    vx = split(vx, 100)

    # Standardize.
    #scaler = StandardScaler()
    #x, (vx, tx) = static_transform([scaler], x, [vx, tx])

    X = interleave(padzeros(x, False))
    VX = interleave(padzeros(vx, False))

    print 'size of X', X.size
    print 'size of VX', VX.size

    Z, VZ, tz = X[1:], VX[1:], [i[1:] for i in tx]
    X, VX, tx = X[:-1], VX[:-1], [i[:-1] for i in tx]

    X, VX, Z, VZ = [i.astype('float32') for i in (X, VX, Z, VZ)]
    tx = [i.astype('float32') for i in tx]
    tz = [i.astype('float32') for i in tz]

    print 'training shape', X.shape
    print 'validation shape', VX.shape

    return X, Z, VX, VZ, tx, tz

X, Z, VX, VZ, tx, tz = load_data('/Users/eder/Copy/python/data/midi/JSBChorales')
X = X.transpose(1,0,2)
Z = Z.transpose(1,0,2)
VX = VX.transpose(1,0,2)
VZ = VZ.transpose(1,0,2)

In [ ]:
with open('%s.pickle' % '/Users/eder/Copy/python/data/midi/JSBChorales') as fp:
        data = cPickle.load(fp)

train, valid, test = data['train'], data['valid'], data['test']

x = rolls_from_sequences(data['train'])
vx = rolls_from_sequences(data['valid'])
tx = rolls_from_sequences(data['test'])

print 'size of x', sum([i.size for i in x])
print 'size of vx', sum([i.size for i in vx])

MID dataset test


In [ ]:
from mid import MID
from pylearn2.sandbox.rnn.space import SequenceSpace
from pylearn2.space import VectorSpace

In [ ]:
dataset = MID('train', '/Users/eder/Copy/python/data/midi/JSBChorales')
dataset.X_space

In [ ]:
for d in dataset.iterator(mode='sequential', batch_size=27, num_batches=243/27, 
                          data_specs=(dataset.X_space.components,('data','mask'))):
    print d.shape

In [ ]:
dataset.X_space.components

In [ ]:
from pylearn2.space import VectorSpace