In [1]:
import numpy as np
import tensorflow as tf

In [2]:
tf.__version__


Out[2]:
'1.2.0'

First, let's generate 3 synthetic regression problems (tasks). The first problem has 3 outputs, the second has 2, and the third has 5. Samples from all tasks are represented as 20-dimensional feature vectors.


In [3]:
T = 3 # number of tasks
O = [3, 2, 5] # number of outputs for every task
N = 100 # number of training samples
D = 20 # dimension of feature vector -- assumed to be the same for all tasks

In [4]:
X = [np.random.randn(N,D) for _ in range(T)]
Y = [x.dot(np.random.randn(D,o)) for x,o in zip(X,O)]
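
Each task's targets are a noise-free linear function of its inputs, i.e. Y[t] = X[t] · W*_t for a random ground-truth matrix W*_t of shape (D, O[t]). As a quick sanity check, the shapes of the generated arrays can be inspected (an illustrative snippet, not one of the numbered cells):

# X[t] has shape (N, D) and Y[t] has shape (N, O[t]) for each task t
for x, y in zip(X, Y):
    print(x.shape, y.shape)
# (100, 20) (100, 3)
# (100, 20) (100, 2)
# (100, 20) (100, 5)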

Second, let's train a regression model for each task independently. Each regression model is realised by a 3-layer neural network whose hidden layer has 10 neurons.
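Concretely, each task's prediction is y_hat = sigmoid(x·W0 + b0)·W1 + b1, where all four parameters (W0, b0, W1, b1) belong to that task alone, so at this stage nothing is shared across tasks.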


In [5]:
sess = tf.InteractiveSession()

In [6]:
H = 10
W_input_to_hidden = [tf.Variable(tf.truncated_normal(shape=[D, H])) for _ in range(T)]
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]

In [7]:
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _ in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, o]) for o in O]

In [8]:
Y_hat = [tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(x,w0,b0)),w1,b1) 
         for x,w0,b0,w1,b1 in zip(X_placeholder, W_input_to_hidden, b_input_to_hidden, W_hidden_to_output, b_hidden_to_output)]

In [9]:
MSE = [tf.reduce_mean(tf.squared_difference(y,y_hat)) for y,y_hat in zip(Y_placeholder,Y_hat)]

In [10]:
loss = tf.reduce_mean(MSE)

In [11]:
opt = tf.train.AdamOptimizer(learning_rate=0.01)

In [12]:
train = opt.minimize(loss)

In [13]:
sess.run(tf.global_variables_initializer())

In [14]:
# Map each task's placeholder to its corresponding data array
feed_dict = dict(list(zip(X_placeholder,X))+list(zip(Y_placeholder,Y)))

In [15]:
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))


29.3576
9.27657
2.90455
1.48431
0.951992
0.685612
0.505882
0.38454
0.302834
0.243541

In this example, the 'shareable' layer is the input-to-hidden layer. Let's do soft sharing using the method proposed in the paper: the per-task input-to-hidden weights are kept as slices of a single D x H x T tensor, and a tensor trace norm penalty on that tensor is added to the loss so that the tasks are softly encouraged to share structure.


In [16]:
# We can reuse the parameters learned above for initialisation, though this is optional
W_init = np.stack(sess.run(W_input_to_hidden))

In [17]:
# We put the task axis in the last position, so W_init now has shape (D, H, T)
W_init = np.transpose(W_init, axes=[1,2,0])
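
If you actually want to reuse the single-task solution, the shared [D, H, T] variable created in In [22] below could be initialised from W_init instead of a random truncated normal, e.g. (a minimal sketch; the cells below keep the random initialisation):

# Optional alternative for In [22]: initialise the shared weight tensor from the
# single-task parameters prepared above (W_init is already float32, since it was
# read back from float32 variables)
W_input_to_hidden = tf.Variable(W_init)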

In [18]:
sess.close()

In [19]:
tf.reset_default_graph()

In [20]:
sess = tf.InteractiveSession()

In [21]:
from tensor_trace_norm import TensorTraceNorm

In [22]:
# Here we use "TensorTraceNorm" to get the trace norm of shareable layer's parameter (tensor)
W_input_to_hidden = tf.Variable(tf.truncated_normal(shape=[D, H, T]))
Trace_norm_input_to_hidden = TensorTraceNorm(W_input_to_hidden, 'LAF') # Three methods: 'LAF', 'Tucker', and 'TT'
W_input_to_hidden = [W_input_to_hidden[:,:,i] for i in range(T)]
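
Assuming 'LAF' stands for last-axis flattening, the penalty roughly corresponds to the nuclear norm (sum of singular values) of the (D*H) x T matrix obtained by flattening all axes except the task axis. The sketch below only illustrates that idea; it is not the actual implementation in tensor_trace_norm:

# Illustrative sketch of an LAF-style penalty (assumption: last-axis flattening).
# Penalising the nuclear norm of the (D*H) x T unfolding encourages the stacked
# per-task weight matrices to share a low-rank structure across tasks.
def laf_trace_norm_sketch(W, num_tasks):
    unfolding = tf.reshape(W, [-1, num_tasks])          # (D*H) x T matrix
    singular_values = tf.svd(unfolding, compute_uv=False)
    return tf.reduce_sum(singular_values)               # trace (nuclear) norm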

In [23]:
# Nothing changes for bias terms or unshared parameters
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]

In [24]:
# Build the network as usual
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _ in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, o]) for o in O]

Y_hat = [tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(x,w0,b0)),w1,b1) 
         for x,w0,b0,w1,b1 in zip(X_placeholder, W_input_to_hidden, b_input_to_hidden, W_hidden_to_output, b_hidden_to_output)]

MSE = [tf.reduce_mean(tf.squared_difference(y,y_hat)) for y,y_hat in zip(Y_placeholder,Y_hat)]

In [25]:
# Here we add the trace norm as part of the loss
loss = tf.reduce_mean(MSE) + 0.001 * tf.reduce_sum(Trace_norm_input_to_hidden)
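
In other words, the objective is now the average of the per-task MSEs plus 0.001 times the trace norm penalty on the shared input-to-hidden tensor. The tf.reduce_sum simply collapses whatever TensorTraceNorm returns into a single scalar, and the 0.001 weight trades data fit against how strongly the tasks are pushed to share structure.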

In [26]:
opt = tf.train.AdamOptimizer(learning_rate=0.01)

train = opt.minimize(loss)

In [27]:
# Train the model as usual
init_op = tf.global_variables_initializer() # Workaround: https://github.com/tensorflow/tensorflow/issues/6804
sess.run(init_op)

In [28]:
feed_dict = dict(list(zip(X_placeholder,X))+list(zip(Y_placeholder,Y)))

In [29]:
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))


28.6527
8.95926
2.87084
1.55215
0.995847
0.746464
0.603855
0.48874
0.391965
0.320133