Ch 12: Concept 03

Ranking images

Download the cloth-folding dataset (the download link is missing from this copy of the notebook).

Extract the zip. Keep note of where you extract it; we'll call that location DATASET_DIR.

import tensorflow as tf
import numpy as np
from vgg16 import vgg16
import glob, os
from scipy.misc import imread, imresize

# Directory the dataset zip was extracted to: ~/res/cloth_folding_rgb_vids.
DATASET_DIR = os.path.join(os.path.expanduser('~'), 'res', 'cloth_folding_rgb_vids')
# Number of videos to load. The code below iterates video ids 1..NUM_VIDS and
# splits on NUM_VIDS; 45 matches the recorded shape (45, 224, 224, 3).
NUM_VIDS = 45

def get_img_pair(video_id):
    """Return the first and last frame of one video, each resized to 224x224.

    Args:
        video_id: Name of the video's sub-directory under DATASET_DIR
            (callers pass the numeric id as a string).

    Returns:
        A 2-tuple ``(start_img, end_img)`` holding the resized first and
        last ``*.png`` frames of the video.

    Raises:
        IndexError: If the directory contains no ``*.png`` files.
    """
    # Frames are ordered by sorted file name.
    # NOTE(review): assumes file names sort in temporal order - confirm.
    img_files = sorted(glob.glob(os.path.join(DATASET_DIR, video_id, '*.png')))
    start_img = img_files[0]
    end_img = img_files[-1]
    pair = []
    for image_file in [start_img, end_img]:
        img_original = imread(image_file)
        img_resized = imresize(img_original, (224, 224))
        # Collect the resized frame; without this the function returned ().
        pair.append(img_resized)
    return tuple(pair)

# Load the first and last frame of every video (ids 1..NUM_VIDS) into two
# parallel lists: start_imgs[k] and end_imgs[k] come from the same video.
start_imgs = []
end_imgs= []
for vid_id in range(1, NUM_VIDS + 1):
    start_img, end_img = get_img_pair(str(vid_id))
    # Accumulate the frames; without these appends both lists stayed empty.
    start_imgs.append(start_img)
    end_imgs.append(end_img)
print('Images of starting state {}'.format(np.shape(start_imgs)))
print('Images of ending state {}'.format(np.shape(end_imgs)))

Images of starting state (45, 224, 224, 3)
Images of ending state (45, 224, 224, 3)

# Input placeholder for batches of 224x224, 3-channel images; handed to
# vgg16(...) further down as its image input.
imgs_plc = tf.placeholder(tf.float32, [None, 224, 224, 3])

# Length of each feature vector fed to the ranking network through x1 / x2.
n_features = 4096
# Number of units in the ranking network's hidden layer.
n_hidden = 10 # n_features * 2

with tf.name_scope("input"):
    # x1 and x2 hold the two batches of feature vectors whose scores are compared.
    x1 = tf.placeholder(tf.float32, [None, n_features], name="x1")
    x2 = tf.placeholder(tf.float32, [None, n_features], name="x2")
    # Keep-probability passed to tf.nn.dropout; fed as 0.5 while training and 1
    # when scoring. Its graph name is 'dropout_prob' although it is a keep probability.
    dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_prob')

# Hidden layer: one set of weights (w1, b1) applied to both inputs, so x1 and
# x2 are mapped by the same function. Histogram summaries are registered for
# the parameters and for both activations.
with tf.name_scope("hidden_layer"):
    with tf.name_scope("weights"):
        w1 = tf.Variable(tf.random_normal([n_features, n_hidden]), name="w1")
        tf.summary.histogram("w1", w1)
        b1 = tf.Variable(tf.random_normal([n_hidden]), name="b1")
        tf.summary.histogram("b1", b1)
    with tf.name_scope("output"):
        # ReLU activation followed by dropout, computed separately for x1 and x2.
        h1 = tf.nn.dropout(tf.nn.relu(tf.matmul(x1,w1) + b1), keep_prob=dropout_keep_prob)
        tf.summary.histogram("h1", h1)
        h2 = tf.nn.dropout(tf.nn.relu(tf.matmul(x2, w1) + b1), keep_prob=dropout_keep_prob)
        tf.summary.histogram("h2", h2)

# Output layer: a shared linear map (w2, b2) from the hidden activations to a
# single score per row. s1 scores the x1 batch and s2 scores the x2 batch.
with tf.name_scope("output_layer"):
    with tf.name_scope("weights"):
        w2 = tf.Variable(tf.random_normal([n_hidden, 1]), name="w2")
        tf.summary.histogram("w2", w2)
        b2 = tf.Variable(tf.random_normal([1]), name="b2")
        tf.summary.histogram("b2", b2)
    with tf.name_scope("output"):
        # Each score tensor has one column (w2 is [n_hidden, 1]).
        s1 = tf.matmul(h1, w2) + b2
        s2 = tf.matmul(h2, w2) + b2
# Pairwise ranking loss: penalise pairs where s1 is not sufficiently below s2.
with tf.name_scope("loss"):
    # Score difference per pair; training drives this negative (s1 < s2).
    s12 = s1 - s2
    s12_flat = tf.reshape(s12, [-1])
    # pred and lable_p are not used by the loss below; kept because they are
    # module-level names that other cells may reference.
    pred = tf.sigmoid(s12)
    lable_p = tf.sigmoid(-tf.ones_like(s12))
    # Sigmoid (not softmax) cross-entropy: every pair is an independent binary
    # decision with target 0, i.e. "x1 does not outrank x2". A softmax
    # cross-entropy with all-zero labels evaluates to 0 for any logits, which
    # is why the training log printed a loss of 0.0 every epoch. The "+ 1" on
    # the logits is kept unchanged and acts as a margin on the score gap.
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(s12_flat), logits=s12_flat + 1)
    loss = tf.reduce_mean(cross_entropy)
    tf.summary.scalar("loss", loss)
with tf.name_scope("train_op"):
    # Adam with learning rate 0.001.
    train_op = tf.train.AdamOptimizer(0.001).minimize(loss)

sess = tf.InteractiveSession()
# Initialise the variables defined so far (w1, b1, w2, b2 and the optimizer's
# slots); reading them uninitialised fails at the first training step.
# This runs BEFORE vgg16(...) is constructed so that it cannot touch the
# network's weights.
# NOTE(review): presumably vgg16 assigns its own variables from the .npz file
# using the session it is given - confirm against vgg16.py.
sess.run(tf.global_variables_initializer())

print('Loading model...')
vgg = vgg16(imgs_plc, 'vgg16_weights.npz', sess)
print('Done loading!')

Loading model...
# Embed the start and end frames by running them through the VGG network.
# NOTE(review): the fetched tensor was lost from this copy of the notebook;
# vgg.fc1 is assumed because the recorded embeddings are 4096 wide, matching
# n_features - confirm the attribute name against vgg16.py.
start_imgs_embedded = sess.run(vgg.fc1, feed_dict={vgg.imgs: start_imgs})
end_imgs_embedded = sess.run(vgg.fc1, feed_dict={vgg.imgs: end_imgs})

# Draw a random permutation of the video indices and cut it at the 75% mark:
# the leading part indexes the training videos, the remainder the test videos.
idxs = np.random.choice(NUM_VIDS, NUM_VIDS, replace=False)
train_idxs, test_idxs = np.split(idxs, [int(NUM_VIDS * 0.75)])

# Apply the same index sets to the start and end embeddings so that row k of
# each *_start_imgs array and row k of the matching *_end_imgs array come
# from the same video.
train_start_imgs, train_end_imgs = start_imgs_embedded[train_idxs], end_imgs_embedded[train_idxs]
test_start_imgs, test_end_imgs = start_imgs_embedded[test_idxs], end_imgs_embedded[test_idxs]

# Report the shape of each of the four splits, one line per split.
for caption, split in (
    ('Train start imgs {}', train_start_imgs),
    ('Train end imgs {}', train_end_imgs),
    ('Test start imgs {}', test_start_imgs),
    ('Test end imgs {}', test_end_imgs),
):
    print(caption.format(np.shape(split)))

Train start imgs (33, 4096)
Train end imgs (33, 4096)
Test start imgs (12, 4096)
Test end imgs (12, 4096)

# Column vectors of zeros / ones, one row per training pair. They are not fed
# to the graph anywhere in this block.
train_y1 = np.expand_dims(np.zeros(np.shape(train_start_imgs)[0]), axis=1)
train_y2 = np.expand_dims(np.ones(np.shape(train_end_imgs)[0]), axis=1)
for epoch in range(100):
    # One optimizer step per training pair (batch size 1), with dropout active.
    for i in range(np.shape(train_start_imgs)[0]):
        _, cost_val = sess.run([train_op, loss],
                               feed_dict={x1: train_start_imgs[i:i+1,:],
                                          x2: train_end_imgs[i:i+1,:],
                                          dropout_keep_prob: 0.5})
    # cost_val is the loss of the last pair processed in this epoch.
    print('{}. {}'.format(epoch, cost_val))
    # Evaluate on the held-out pairs with dropout disabled (keep probability 1).
    s1_val, s2_val = sess.run([s1, s2], feed_dict={x1: test_start_imgs, x2: test_end_imgs, dropout_keep_prob: 1})
    # A pair counts as correct when the start frame scores below the end frame.
    print('Accuracy: {}%'.format(100 * np.mean(s1_val < s2_val)))

0. 0.0
Accuracy: 41.66666666666667%
1. 0.0
Accuracy: 75.0%
2. 0.0
Accuracy: 75.0%
3. 0.0
Accuracy: 91.66666666666666%
4. 0.0
Accuracy: 100.0%
def get_img_seq(video_id):
    """Return every frame of one video, each resized to 224x224.

    Args:
        video_id: Name of the video's sub-directory under DATASET_DIR
            (a string such as '1').

    Returns:
        A list with one resized image per ``*.png`` file in the directory,
        in sorted file-name order. The list is empty if there are no files.
    """
    # NOTE(review): assumes file names sort in temporal order - confirm.
    img_files = sorted(glob.glob(os.path.join(DATASET_DIR, video_id, '*.png')))
    imgs = []
    for image_file in img_files:
        img_original = imread(image_file)
        img_resized = imresize(img_original, (224, 224))
        # Collect the resized frame; without this the function returned [].
        imgs.append(img_resized)
    return imgs

# Load every frame of video '1' and show the resulting array shape.
imgs = get_img_seq('1')
print(np.shape(imgs))

(50, 224, 224, 3)

# Embed every frame of the video with the VGG network.
# NOTE(review): the fetched tensor was lost from this copy of the notebook;
# vgg.fc1 is assumed as a 4096-wide layer matching n_features - confirm the
# attribute name against vgg16.py.
imgs_embedded = sess.run(vgg.fc1, feed_dict={vgg.imgs: imgs})
# Score each frame with the trained ranking network, dropout disabled.
# Fetching [s1] returns a one-element list whose item holds the per-frame scores.
scores = sess.run([s1], feed_dict={x1: imgs_embedded, dropout_keep_prob: 1})

%matplotlib inline
from matplotlib import pyplot as plt
plt.title('Utility of cloth-folding over time')
plt.xlabel('time (video frame #)')

[<matplotlib.lines.Line2D at 0x7f7796094ac8>]