In [1]:
import numpy as np
import tensorflow as tf
In [2]:
tf.__version__
Out[2]:
In [3]:
T = 3 # number of tasks
O = [3, 2, 5] # number of outputs for every task
N = 100 # number of training samples
D = 20 # dimension of feature vector -- assumed to be the same for all tasks
In [4]:
# Synthetic data: random features X and linear targets Y for each task
X = [np.random.randn(N,D) for _ in range(T)]
Y = [x.dot(np.random.randn(D,o)) for x,o in zip(X,O)]
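A quick shape check (added here, not part of the original notebook): every task should have an [N, D] feature matrix and an [N, O[t]] target matrix.
for x, y, o in zip(X, Y, O):
    assert x.shape == (N, D) and y.shape == (N, o)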
In [5]:
sess = tf.InteractiveSession()
In [6]:
H = 10 # number of hidden units
# Single-task baseline: each task gets its own independent set of parameters
W_input_to_hidden = [tf.Variable(tf.truncated_normal(shape=[D, H])) for _ in range(T)]
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]
In [7]:
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _ in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, o]) for o in O]
In [8]:
# Per-task two-layer network: sigmoid hidden layer followed by a linear output layer
Y_hat = [tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(x,w0,b0)),w1,b1)
         for x,w0,b0,w1,b1 in zip(X_placeholder, W_input_to_hidden, b_input_to_hidden, W_hidden_to_output, b_hidden_to_output)]
In [9]:
# Per-task mean squared error
MSE = [tf.reduce_mean(tf.squared_difference(y,y_hat)) for y,y_hat in zip(Y_placeholder,Y_hat)]
In [10]:
loss = tf.reduce_mean(MSE)
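Note that passing a Python list of scalar tensors to tf.reduce_mean stacks them first, so this is the unweighted average of the per-task MSEs. If the tasks needed different weights, a hedged sketch (the weights below are purely illustrative) would be:
task_weights = tf.constant([1.0, 1.0, 1.0])  # hypothetical per-task weights
weighted_loss = tf.reduce_sum(task_weights * tf.stack(MSE)) / tf.reduce_sum(task_weights)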
In [11]:
opt = tf.train.AdamOptimizer(learning_rate=0.01)
In [12]:
train = opt.minimize(loss)
In [13]:
sess.run(tf.global_variables_initializer())
In [14]:
feed_dict = dict(list(zip(X_placeholder,X))+list(zip(Y_placeholder,Y)))
In [15]:
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))
In [16]:
# We can reuse the parameters learned above for initialisation, though this is optional
W_init = np.stack(sess.run(W_input_to_hidden))
In [17]:
# We put the task axis in the last position
W_init = np.transpose(W_init, axes=[1,2,0])
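Sanity check (added): stacking the T learned matrices gives a [T, D, H] array, and the transpose above moves the task axis to the end.
assert W_init.shape == (D, H, T)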
In [18]:
sess.close()
In [19]:
tf.reset_default_graph()
In [20]:
sess = tf.InteractiveSession()
In [21]:
from tensor_trace_norm import TensorTraceNorm
In [22]:
# Here we use "TensorTraceNorm" to get the trace norm of the shareable layer's parameter tensor
# (Optionally, the W_init computed above could be passed as the initial value instead of a random one)
W_input_to_hidden = tf.Variable(tf.truncated_normal(shape=[D, H, T]))
Trace_norm_input_to_hidden = TensorTraceNorm(W_input_to_hidden, 'LAF') # Three methods: 'LAF', 'Tucker', and 'TT'
# Slice the shared tensor back into one [D, H] matrix per task so the rest of the graph is unchanged
W_input_to_hidden = [W_input_to_hidden[:,:,i] for i in range(T)]
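For intuition only: a common "last axis flattening" construction is the nuclear norm (sum of singular values) of the shared tensor reshaped to [D*H, T]. The sketch below is an assumption for illustration, not the actual tensor_trace_norm implementation, which may differ (for example, it may return one norm per unfolding, hence the tf.reduce_sum applied to it later).
def laf_trace_norm_sketch(W, last_dim):
    # Hypothetical illustration: flatten all axes except the task axis,
    # then take the nuclear norm of the resulting [D*H, T] matrix.
    W_flat = tf.reshape(W, [-1, last_dim])
    singular_values = tf.svd(W_flat, compute_uv=False)
    return tf.reduce_sum(singular_values)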
In [23]:
# Nothing changes for bias terms or unshared parameters
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]
In [24]:
# Build the network as usual
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _ in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, o]) for o in O]
Y_hat = [tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(x,w0,b0)),w1,b1)
         for x,w0,b0,w1,b1 in zip(X_placeholder, W_input_to_hidden, b_input_to_hidden, W_hidden_to_output, b_hidden_to_output)]
MSE = [tf.reduce_mean(tf.squared_difference(y,y_hat)) for y,y_hat in zip(Y_placeholder,Y_hat)]
In [25]:
# Here we add the trace norm as a regularisation term in the loss
loss = tf.reduce_mean(MSE) + 0.001 * tf.reduce_sum(Trace_norm_input_to_hidden)
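The 0.001 factor is the regularisation strength, a hyperparameter. As an optional sketch (not in the original notebook), it could be exposed as a feedable scalar so different strengths can be tried without rebuilding the graph:
reg_strength = tf.placeholder_with_default(0.001, shape=[])  # hypothetical alternative to the fixed 0.001
loss_tunable = tf.reduce_mean(MSE) + reg_strength * tf.reduce_sum(Trace_norm_input_to_hidden)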
In [26]:
opt = tf.train.AdamOptimizer(learning_rate=0.01)
train = opt.minimize(loss)
In [27]:
# Train the model as usual
init_op = tf.global_variables_initializer() # Workaround: https://github.com/tensorflow/tensorflow/issues/6804
sess.run(init_op)
In [28]:
feed_dict = dict(list(zip(X_placeholder,X))+list(zip(Y_placeholder,Y)))
In [29]:
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))