Unsupervised dimensionality reduction using a multi-layer perceptron autoencoder, where the label is the input itself (label == ground truth)

For NLP, we can somewhat say that word2vec and autoencoders are similar.

Dimensionality reduction works only if the inputs are correlated (like images from the same domain); it fails if we feed completely random inputs each time we train an autoencoder. In the end, an autoencoder produces a lower-dimensional representation of its input (at the encoder output), much like Principal Component Analysis (PCA) — see the short PCA sketch below, once the one-hot data is built. And since we don't need any labels during training, it's an unsupervised model as well.


In [5]:
import os
from random import randint
from collections import Counter
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
import tensorflow as tf

In [182]:
corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
test_corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
corpus[:10]


Out[182]:
['the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog',
 'from']

In [183]:
def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    for word, _ in count:
        dictionary[word] = index
        index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [184]:
vocabulary, reverse_vocabulary = build_vocab(corpus, 100)

In [185]:
vocabulary


Out[185]:
{'UNK': 0,
 'brown': 9,
 'dog': 7,
 'fox': 3,
 'from': 10,
 'jumped': 8,
 'lazy': 6,
 'over': 5,
 'quick': 2,
 'tall': 4,
 'the': 1}

In [186]:
def index_words_in_corpus(corpus):
    return [vocabulary[token] if token in vocabulary else 0 for token in corpus]

In [187]:
corpus = index_words_in_corpus(corpus)
test_corpus = index_words_in_corpus(test_corpus)

In [188]:
test_corpus


Out[188]:
[1, 2, 9, 3, 8, 5, 1, 6, 7, 10, 1, 2, 4, 3]

In [189]:
vocabulary_size = len(vocabulary)
vocabulary_size


Out[189]:
11

In [190]:
def one_hot_encode(index):
    row = np.zeros(vocabulary_size, dtype=np.int32)
    row[index] = 1
    return row

In [191]:
data = np.array([one_hot_encode(i) for i in corpus])
test_data = np.array([one_hot_encode(i) for i in test_corpus])

In [192]:
print("(TRAIN: Total number of words, Vocabulary size):", data.shape)
print("(TEST:  Total number of words, Vocabulary size):", test_data.shape)


(TRAIN: Total number of words, Vocabulary size): (14, 11)
(TEST:  Total number of words, Vocabulary size): (14, 11)

In [193]:
data[randint(0, data.shape[0] - 1)]  # random.randint is inclusive on both ends


Out[193]:
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])
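
As a quick point of comparison (a minimal sketch, assuming scikit-learn is available; not part of the model below), PCA gives a similar low-dimensional code for these one-hot rows by a purely linear projection:

In [ ]:
# Sketch only: PCA as a linear analogue of the autoencoder's encoder.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
data_2d = pca.fit_transform(data.astype(np.float64))
print(data_2d.shape)  # (14, 2): each one-hot row compressed to 2 components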

In [194]:
X = tf.placeholder(tf.float32, shape=(None, vocabulary_size))
Y = tf.placeholder(tf.float32, shape=(None, vocabulary_size))  # only referenced by the commented-out cross-entropy loss; the autoencoder target is X itself

In [196]:
# Encoder: 11-dim one-hot input -> 1000 -> 250
w1 = tf.Variable(tf.random_normal(shape=(vocabulary_size, 1000), stddev=0.01), name='weights1')
b1 = tf.Variable(tf.zeros([1, 1000]), name="bias1")
layer1 = tf.nn.relu(tf.add(tf.matmul(X, w1), b1))

w2 = tf.Variable(tf.random_normal(shape=(1000, 250), stddev=0.01), name='weights2')
b2 = tf.Variable(tf.zeros([1, 250]), name="bias2")
layer2 = tf.nn.relu(tf.add(tf.matmul(layer1, w2), b2))

# Bottleneck: 250 -> 50, the compressed representation
w = tf.Variable(tf.random_normal(shape=(250, 50), stddev=0.01), name='weights')
b = tf.Variable(tf.zeros([1, 50]), name="bias")
code = tf.nn.relu(tf.add(tf.matmul(layer2, w), b))

# Decoder: 50 -> 250 -> 1000 -> 11, sigmoid output in [0, 1]
w3 = tf.Variable(tf.random_normal(shape=(50, 250), stddev=0.01), name='weights3')
b3 = tf.Variable(tf.zeros([1, 250]), name="bias3")
layer3 = tf.nn.relu(tf.add(tf.matmul(code, w3), b3))

w4 = tf.Variable(tf.random_normal(shape=(250, 1000), stddev=0.01), name='weights4')
b4 = tf.Variable(tf.zeros([1, 1000]), name="bias4")
layer4 = tf.nn.relu(tf.add(tf.matmul(layer3, w4), b4))

w5 = tf.Variable(tf.random_normal(shape=(1000, vocabulary_size), stddev=0.01), name='weights5')
b5 = tf.Variable(tf.zeros([1, vocabulary_size]), name="bias5")
decoder = tf.nn.sigmoid(tf.add(tf.matmul(layer4, w5), b5))

In [197]:
# entropy = tf.nn.softmax_cross_entropy_with_logits(logits=decoder, labels=Y)
loss = tf.reduce_mean(tf.pow(X - decoder, 2))  # mean squared reconstruction error against the input X
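
The commented-out line above hints at a cross-entropy alternative. A minimal sketch of that variant (not used in the training below); note it needs the pre-sigmoid logits rather than the sigmoid output decoder:

In [ ]:
# Sketch only: cross-entropy reconstruction loss against the 0/1 one-hot input.
logits = tf.add(tf.matmul(layer4, w5), b5)  # decoder activations before the sigmoid
xent_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits))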

In [198]:
LEARNING_RATE = 0.01
NUM_TRAIN_STEPS = 1000
SKIP_STEP = 10  # how many steps to skip before reporting the loss

In [203]:
optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
init = tf.global_variables_initializer()

In [205]:
with tf.Session() as sess:
    sess.run(init)
    
    for i in range(NUM_TRAIN_STEPS):
        _, loss_val = sess.run([optimizer, loss], feed_dict={X: data})

        if i % SKIP_STEP == 0:
            print("EPOCH {}/{}, LOSS {}".format(i, NUM_TRAIN_STEPS, loss_val))

    test_data_compressed = sess.run(decoder, feed_dict={X: test_data})  # decoder reconstruction of the test data (not the 50-dim code)
#     np.save(outfile, test_data_compressed)


EPOCH 0/1000, LOSS 0.2499999850988388
EPOCH 10/1000, LOSS 0.24945718050003052
EPOCH 20/1000, LOSS 0.24854017794132233
EPOCH 30/1000, LOSS 0.24700087308883667
EPOCH 40/1000, LOSS 0.24443811178207397
EPOCH 50/1000, LOSS 0.24024417996406555
EPOCH 60/1000, LOSS 0.2336183488368988
EPOCH 70/1000, LOSS 0.22381845116615295
EPOCH 80/1000, LOSS 0.21059055626392365
EPOCH 90/1000, LOSS 0.19337916374206543
EPOCH 100/1000, LOSS 0.16469691693782806
EPOCH 110/1000, LOSS 0.10726149380207062
EPOCH 120/1000, LOSS 0.0821770429611206
EPOCH 130/1000, LOSS 0.08011706173419952
EPOCH 140/1000, LOSS 0.07989447563886642
EPOCH 150/1000, LOSS 0.07984154671430588
EPOCH 160/1000, LOSS 0.07981390506029129
EPOCH 170/1000, LOSS 0.07979799807071686
EPOCH 180/1000, LOSS 0.08027677237987518
EPOCH 190/1000, LOSS 0.08026178926229477
EPOCH 200/1000, LOSS 0.08014322817325592
EPOCH 210/1000, LOSS 0.08006715029478073
EPOCH 220/1000, LOSS 0.08003073930740356
EPOCH 230/1000, LOSS 0.08000756800174713
EPOCH 240/1000, LOSS 0.07999226450920105
EPOCH 250/1000, LOSS 0.07997790724039078
EPOCH 260/1000, LOSS 0.07996856421232224
EPOCH 270/1000, LOSS 0.07995875924825668
EPOCH 280/1000, LOSS 0.0799492746591568
EPOCH 290/1000, LOSS 0.07994165271520615
EPOCH 300/1000, LOSS 0.07993904501199722
EPOCH 310/1000, LOSS 0.07992982864379883
EPOCH 320/1000, LOSS 0.07992083579301834
EPOCH 330/1000, LOSS 0.07991596311330795
EPOCH 340/1000, LOSS 0.07991111278533936
EPOCH 350/1000, LOSS 0.07990624010562897
EPOCH 360/1000, LOSS 0.07990201562643051
EPOCH 370/1000, LOSS 0.07989463210105896
EPOCH 380/1000, LOSS 0.07989007979631424
EPOCH 390/1000, LOSS 0.07988661527633667
EPOCH 400/1000, LOSS 0.07988297194242477
EPOCH 410/1000, LOSS 0.07987949997186661
EPOCH 420/1000, LOSS 0.07987655699253082
EPOCH 430/1000, LOSS 0.07987482845783234
EPOCH 440/1000, LOSS 0.07987351715564728
EPOCH 450/1000, LOSS 0.07986708730459213
EPOCH 460/1000, LOSS 0.07986541092395782
EPOCH 470/1000, LOSS 0.07986427843570709
EPOCH 480/1000, LOSS 0.07986196875572205
EPOCH 490/1000, LOSS 0.07985991984605789
EPOCH 500/1000, LOSS 0.07985825836658478
EPOCH 510/1000, LOSS 0.07985662668943405
EPOCH 520/1000, LOSS 0.07985501736402512
EPOCH 530/1000, LOSS 0.07985347509384155
EPOCH 540/1000, LOSS 0.07985194027423859
EPOCH 550/1000, LOSS 0.07985039800405502
EPOCH 560/1000, LOSS 0.0798492431640625
EPOCH 570/1000, LOSS 0.07985011488199234
EPOCH 580/1000, LOSS 0.07984574139118195
EPOCH 590/1000, LOSS 0.0798448771238327
EPOCH 600/1000, LOSS 0.07984477281570435
EPOCH 610/1000, LOSS 0.07984322309494019
EPOCH 620/1000, LOSS 0.07984180003404617
EPOCH 630/1000, LOSS 0.0798407644033432
EPOCH 640/1000, LOSS 0.07983964681625366
EPOCH 650/1000, LOSS 0.07983851432800293
EPOCH 660/1000, LOSS 0.07983747124671936
EPOCH 670/1000, LOSS 0.07983653247356415
EPOCH 680/1000, LOSS 0.0798356905579567
EPOCH 690/1000, LOSS 0.0798349380493164
EPOCH 700/1000, LOSS 0.0798342302441597
EPOCH 710/1000, LOSS 0.0798335000872612
EPOCH 720/1000, LOSS 0.0798327624797821
EPOCH 730/1000, LOSS 0.07983200252056122
EPOCH 740/1000, LOSS 0.07983124256134033
EPOCH 750/1000, LOSS 0.07983052730560303
EPOCH 760/1000, LOSS 0.07982966303825378
EPOCH 770/1000, LOSS 0.07982870191335678
EPOCH 780/1000, LOSS 0.07983026653528214
EPOCH 790/1000, LOSS 0.0798267051577568
EPOCH 800/1000, LOSS 0.079826220870018
EPOCH 810/1000, LOSS 0.07982634752988815
EPOCH 820/1000, LOSS 0.0798252522945404
EPOCH 830/1000, LOSS 0.07982434332370758
EPOCH 840/1000, LOSS 0.07982372492551804
EPOCH 850/1000, LOSS 0.07982297986745834
EPOCH 860/1000, LOSS 0.07982219010591507
EPOCH 870/1000, LOSS 0.07982146739959717
EPOCH 880/1000, LOSS 0.07982077449560165
EPOCH 890/1000, LOSS 0.07982009649276733
EPOCH 900/1000, LOSS 0.0798194408416748
EPOCH 910/1000, LOSS 0.07981882244348526
EPOCH 920/1000, LOSS 0.0798182412981987
EPOCH 930/1000, LOSS 0.07981771230697632
EPOCH 940/1000, LOSS 0.07981720566749573
EPOCH 950/1000, LOSS 0.07981675118207932
EPOCH 960/1000, LOSS 0.07981628179550171
EPOCH 970/1000, LOSS 0.07981587946414948
EPOCH 980/1000, LOSS 0.07981543987989426
EPOCH 990/1000, LOSS 0.07981385290622711

In [206]:
test_data_compressed.shape


Out[206]:
(14, 11)
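
The shape is (14, 11) because we fetched the full decoder output (the reconstruction), not the bottleneck. The actual 50-dimensional compressed representation lives at the `code` tensor; a minimal sketch of fetching it (this has to run inside the `with tf.Session() as sess:` block above, before the session closes):

In [ ]:
# Sketch only: fetch the bottleneck activations instead of the reconstruction.
# Must be executed inside the training session above, before it closes.
test_code = sess.run(code, feed_dict={X: test_data})
print(test_code.shape)  # expected: (14, 50)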

In [207]:
test_data_compressed


Out[207]:
array([[ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098]], dtype=float32)

Since the reconstructed data is in probabilities, we'll threshold it to whole numbers (0/1) so we can look up words


In [208]:
test_data_compressed[test_data_compressed>0] = 1

In [209]:
test_data_compressed


Out[209]:
array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]], dtype=float32)

In [210]:
test_data


Out[210]:
array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

Tadaa!!! And here's our prediction.

This shows how well our compression is able to recover the data.

Remember that autoencoders are a form of lossy compression, which means you will never be able to fully reconstruct the data.


In [212]:
sent = np.ndarray.tolist(test_data_compressed)[0]
print(' '.join([reverse_vocabulary[i] if sent[i] == 1. else "" for i in range(len(sent))]))


UNK the quick fox tall over lazy dog jumped brown from
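
An alternative decoding sketch (not what we did above): keep a copy of the raw decoder probabilities before thresholding (call it raw_reconstruction, a hypothetical name) and take the argmax of each row to pick the single most likely word per position:

In [ ]:
# Sketch only: argmax decoding of the raw, un-thresholded decoder output.
# raw_reconstruction is a hypothetical copy saved before the >0 thresholding above.
predicted_indices = np.argmax(raw_reconstruction, axis=1)
print(' '.join(reverse_vocabulary[i] for i in predicted_indices))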

In [ ]: