In [ ]:
import codecs
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


/Users/pasquale/anaconda3/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5
  return f(*args, **kwds)
/Users/pasquale/anaconda3/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Load data in Tensorflow.


In [3]:
root = "../"
training_data_folder = '%straining_data/web-radio/output/rec' % root
embDir = '%sembeddings' % root
what = 'artist'

uri_file = '%s/%s.emb.u' % (embDir, what)
vector_file = '%s/%s.emb.v' % (embDir, what)
# header_file = '%s/%s.emb.h' % (embDir, what)
training_file = '%s/%s.dat' % (training_data_folder, what)

vectors = np.array([line.strip().split(' ') for line in codecs.open(vector_file, 'r', 'utf-8')])
# heads = np.array([line.strip() for line in codecs.open(header_file, 'r', 'utf-8')])
uris = np.array([line.strip() for line in codecs.open(uri_file, 'r', 'utf-8')])

train_array = np.array([line.strip().split(' ') for line in codecs.open(training_file, 'r', 'utf-8')])
pd.DataFrame(train_array, columns=['seed', 'target', 'score']).head()


Out[3]:
seed target score
0 http://data.doremus.org/artist/d33ebb23-7b8d-3... http://data.doremus.org/artist/6329cd86-d47a-3... 1
1 http://data.doremus.org/artist/01915146-b964-3... http://data.doremus.org/artist/6329cd86-d47a-3... 1
2 http://data.doremus.org/artist/01915146-b964-3... http://data.doremus.org/artist/d33ebb23-7b8d-3... 1
3 http://data.doremus.org/artist/72b3b303-5c15-3... http://data.doremus.org/artist/6329cd86-d47a-3... 1
4 http://data.doremus.org/artist/72b3b303-5c15-3... http://data.doremus.org/artist/d33ebb23-7b8d-3... 1

Data pre-processing: I want to substitute the seed and target with their embeddings


In [4]:
def get_embs(x):
    # uri to embedding
    v = vectors[np.argwhere(uris == x)]
    if v.size == 0:
        result = -2. * np.ones(vectors[0].size)
    else:
        result = v[0][0]
    return result.astype('float32')

In [5]:
col1 = np.array([get_embs(xi) for xi in train_array[:, 0]])
col2 = np.array([get_embs(xi) for xi in train_array[:, 1]])
col1 = np.concatenate((col1, [12., 45., 73.] * np.ones((train_array.shape[0], 3))), axis=1)
col2 = np.concatenate((col2, [12., 45., 73.] * np.ones((train_array.shape[0], 3))), axis=1)
col3 = np.array(train_array[:, 2]).astype('float32')
col3 = col3.reshape((col3.size, 1))

In [6]:
def next_batch(num, data, labels):
    """
    Return a total of `num` random samples and labels. 
    """
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = data[idx]
    labels_shuffle = labels[idx]
    return data_shuffle, labels_shuffle

In [ ]:


In [7]:
training_vector = np.concatenate((col1, col2, col3), axis=1)
training_vector.shape


Out[7]:
(12333, 35)

Split test and train


In [8]:
train, test = train_test_split(training_vector, train_size=0.7)

train_vector = train[:, : -1]
train_label = train[:, -1]
train_label = train_label.reshape((len(train_label), 1))

test_vector = test[:, :-1]
test_label = test[:, -1]
test_label = test_label.reshape((len(test_label), 1))


/Users/pasquale/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)

In [9]:
print('Train')
print(train_vector.shape)
print(train_label.shape)
print('Test')
print(test_vector.shape)
print(test_label.shape)


Train
(8633, 34)
(8633, 1)
Test
(3700, 34)
(3700, 1)

In [10]:
# Parameters
learning_rate = 0.1
num_steps = 1000
batch_size = 64
display_step = 100

In [14]:
# Network Parameters
n_hidden_1 = 256  # 1st layer number of neurons
n_hidden_2 = 256  # 2nd layer number of neurons
num_input = train_vector[0].size
num_output = col1[0].size
num_output_wrap = train_label[0].size

# tf Graph input
X = tf.placeholder(tf.float32, [None, num_input], name="X")
Y = tf.placeholder(tf.float32, [None, num_output_wrap], name="Y")

Network


In [78]:
def weighted_l2(a, b, w):
    with tf.name_scope('weighted_l2') as scope:
        # https://stackoverflow.com/a/8861999/1218213
        q = tf.subtract(a, b, name="q")
        # return np.sqrt((w * q * q).sum())
        pow_q = tf.cast(tf.pow(q, 2), tf.float32, name="q-power")
        return tf.reduce_sum(tf.multiply(w, pow_q), name="o", keepdims=True)

In [91]:
def compute_penalty(expected, taken, total):
    with tf.name_scope('penalty') as scope:
        penalty = tf.divide(tf.subtract(expected, taken), total)
        return tf.cast(penalty, tf.float32)

w = tf.Variable(tf.random_normal([1, num_output]), name='w')

def neural_net_wrap(x):        
        seed, target = tf.split(x, [num_output, num_output], axis=1)
        
        bs = tf.equal(seed, -2.)
        bt = tf.equal(target, -2.)

        _ones = tf.ones_like(w, tf.float32)
        max_distance = weighted_l2(_ones, _ones * -1., w)

        bad_mask = tf.logical_or(bs, bt)
        good_mask = tf.logical_not(bad_mask)

        bs_count = tf.count_nonzero(tf.logical_not(bs), axis=1, keepdims=True)
        good_count = tf.count_nonzero(good_mask, axis=1, keepdims=True)

        _zeros = tf.zeros_like(seed, tf.float32)
        _seed = tf.where(good_mask, seed, _zeros)
        _target = tf.where(good_mask, target, _zeros)

        # distance
        d = weighted_l2(_seed, _target, w)

        # how much info I am not finding
        penalty = compute_penalty(bs_count, good_count, num_output)
        multiplier = tf.subtract(1., penalty)
        
        # score
        s = tf.divide(tf.subtract(max_distance, d), max_distance)
        return tf.multiply(s, multiplier)

In [92]:
# Construct model
logits = neural_net_wrap(X)


(1, 17)
(1, 17)
(1, 17)
(?, 17)

In [93]:
Y.shape


Out[93]:
TensorShape([Dimension(None), Dimension(1)])

In [94]:
logits.shape


Out[94]:
TensorShape([Dimension(None), Dimension(1)])

In [95]:
# Define loss and optimizer
# loss_op = MSE
loss_op = tf.reduce_mean(tf.square(tf.subtract(logits, Y)))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.less(tf.subtract(logits, Y), 0.1)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [ ]:


In [96]:
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [99]:
with tf.Session() as sess:
    writer = tf.summary.FileWriter("output", sess.graph)

    # Run the initializer
    sess.run(init)

    print("Start learning")
    for step in range(1, num_steps + 1):
        batch_x, batch_y = next_batch(batch_size, train_vector, train_label)

        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            preds, my_weights, loss, acc = sess.run([logits, w, loss_op, accuracy],
                                                    feed_dict={X: batch_x, Y: batch_y})
            
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))
            # print("Predictions %s VS %s" % (preds[0], batch_y[0]))
            np.set_printoptions(precision=2)
            print("My weights %s" % np.mean(my_weights, axis=0))

    print("Optimization Finished!")

    print("Testing Accuracy:",
          sess.run(accuracy, feed_dict={X: test_vector, Y: test_label}))
    writer.close()


Start learning
Step 1, Minibatch Loss= 0.0880, Training Accuracy= 0.953
My weights [-0.26  3.04 -0.96 -0.5   0.17  0.24  1.49 -1.61 -0.63 -0.48  1.69  2.2
 -2.43 -0.67  0.72  0.02  0.54]
Step 100, Minibatch Loss= 0.0820, Training Accuracy= 0.969
My weights [-0.31  2.99 -0.94 -0.51  0.2   0.01  1.55 -1.57 -0.61 -0.39  1.77  2.3
 -2.33 -0.57  0.82  0.12  0.64]
Step 200, Minibatch Loss= 0.0588, Training Accuracy= 0.953
My weights [-0.37  2.94 -0.94 -0.52  0.21 -0.14  1.58 -1.56 -0.61 -0.35  1.8   2.36
 -2.27 -0.51  0.87  0.17  0.69]
Step 300, Minibatch Loss= 0.0503, Training Accuracy= 0.984
My weights [-0.42  2.89 -0.95 -0.53  0.2  -0.26  1.59 -1.56 -0.62 -0.33  1.81  2.38
 -2.25 -0.49  0.9   0.2   0.71]
Step 400, Minibatch Loss= 0.0633, Training Accuracy= 0.953
My weights [-0.45  2.86 -0.96 -0.55  0.19 -0.34  1.59 -1.56 -0.63 -0.32  1.81  2.39
 -2.24 -0.48  0.91  0.21  0.73]
Step 500, Minibatch Loss= 0.0634, Training Accuracy= 0.969
My weights [-0.48  2.83 -0.95 -0.55  0.2  -0.4   1.6  -1.56 -0.63 -0.31  1.82  2.41
 -2.22 -0.46  0.93  0.23  0.74]
Step 600, Minibatch Loss= 0.0592, Training Accuracy= 0.969
My weights [-0.51  2.8  -0.95 -0.55  0.2  -0.46  1.6  -1.56 -0.63 -0.3   1.83  2.43
 -2.21 -0.45  0.94  0.24  0.76]
Step 700, Minibatch Loss= 0.0511, Training Accuracy= 0.406
My weights [-0.53  2.78 -0.94 -0.55  0.2  -0.48  1.61 -1.56 -0.62 -0.29  1.84  2.44
 -2.19 -0.43  0.95  0.25  0.77]
Step 800, Minibatch Loss= 0.0685, Training Accuracy= 0.953
My weights [-0.54  2.78 -0.92 -0.53  0.21 -0.48  1.62 -1.55 -0.61 -0.27  1.86  2.45
 -2.18 -0.42  0.97  0.27  0.79]
Step 900, Minibatch Loss= 0.0441, Training Accuracy= 0.969
My weights [-0.55  2.76 -0.92 -0.52  0.22 -0.5   1.62 -1.54 -0.61 -0.27  1.86  2.46
 -2.17 -0.41  0.98  0.28  0.79]
Step 1000, Minibatch Loss= 0.0945, Training Accuracy= 0.922
My weights [-0.56  2.75 -0.91 -0.52  0.22 -0.52  1.63 -1.54 -0.61 -0.26  1.87  2.47
 -2.16 -0.4   0.98  0.28  0.8 ]
Optimization Finished!
Testing Accuracy: 0.038108107

In [ ]:


In [ ]:


In [ ]:


In [ ]: