In [1]:
#-*- coding: utf-8 -*-
import math
import os
#import ipdb
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle

from tensorflow.models.rnn import rnn_cell
import tensorflow.python.platform
from keras.preprocessing import sequence


Using Theano backend.

In [2]:
class Caption_Generator():

    def init_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev/math.sqrt(float(dim_in))), name=name)

    def init_bias(self, dim_out, name=None):
        return tf.Variable(tf.zeros([dim_out]), name=name)

    def __init__(self, n_words, dim_embed, dim_ctx, dim_hidden, n_lstm_steps, batch_size=200, ctx_shape=[196,512], bias_init_vector=None):
        self.n_words = n_words
        self.dim_embed = dim_embed
        self.dim_ctx = dim_ctx
        self.dim_hidden = dim_hidden
        self.ctx_shape = ctx_shape
        self.n_lstm_steps = n_lstm_steps
        self.batch_size = batch_size

        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_embed], -1.0, 1.0), name='Wemb')

        self.init_hidden_W = self.init_weight(dim_ctx, dim_hidden, name='init_hidden_W')
        self.init_hidden_b = self.init_bias(dim_hidden, name='init_hidden_b')

        self.init_memory_W = self.init_weight(dim_ctx, dim_hidden, name='init_memory_W')
        self.init_memory_b = self.init_bias(dim_hidden, name='init_memory_b')

        self.lstm_W = self.init_weight(dim_embed, dim_hidden*4, name='lstm_W')
        self.lstm_U = self.init_weight(dim_hidden, dim_hidden*4, name='lstm_U')
        self.lstm_b = self.init_bias(dim_hidden*4, name='lstm_b')

        self.image_encode_W = self.init_weight(dim_ctx, dim_hidden*4, name='image_encode_W')

        self.image_att_W = self.init_weight(dim_ctx, dim_ctx, name='image_att_W')
        self.hidden_att_W = self.init_weight(dim_hidden, dim_ctx, name='hidden_att_W')
        self.pre_att_b = self.init_bias(dim_ctx, name='pre_att_b')

        self.att_W = self.init_weight(dim_ctx, 1, name='att_W')
        self.att_b = self.init_bias(1, name='att_b')

        self.decode_lstm_W = self.init_weight(dim_hidden, dim_embed, name='decode_lstm_W')
        self.decode_lstm_b = self.init_bias(dim_embed, name='decode_lstm_b')

        self.decode_word_W = self.init_weight(dim_embed, n_words, name='decode_word_W')

        if bias_init_vector is not None:
            self.decode_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='decode_word_b')
        else:
            self.decode_word_b = self.init_bias(n_words, name='decode_word_b')


    def get_initial_lstm(self, mean_context):
        initial_hidden = tf.nn.tanh(tf.matmul(mean_context, self.init_hidden_W) + self.init_hidden_b)
        initial_memory = tf.nn.tanh(tf.matmul(mean_context, self.init_memory_W) + self.init_memory_b)

        return initial_hidden, initial_memory

    def build_model(self):
        context = tf.placeholder("float32", [self.batch_size, self.ctx_shape[0], self.ctx_shape[1]])
        sentence = tf.placeholder("int32", [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder("float32", [self.batch_size, self.n_lstm_steps])

        h, c = self.get_initial_lstm(tf.reduce_mean(context, 1))

        # TensorFlow cannot matmul a 3D tensor with a matrix directly,
        # so flatten to 2D, multiply, then reshape back to 3D.
        context_flat = tf.reshape(context, [-1, self.dim_ctx]) # (batch_size*196, 512)
        context_encode = tf.matmul(context_flat, self.image_att_W) # (batch_size*196, 512)
        context_encode = tf.reshape(context_encode, [-1, self.ctx_shape[0], self.ctx_shape[1]]) # (batch_size, 196, 512)

        loss = 0.0

        for ind in range(self.n_lstm_steps):

            if ind == 0:
                word_emb = tf.zeros([self.batch_size, self.dim_embed])
            else:
                tf.get_variable_scope().reuse_variables()
                with tf.device("/cpu:0"):
                    word_emb = tf.nn.embedding_lookup(self.Wemb, sentence[:,ind-1])

            x_t = tf.matmul(word_emb, self.lstm_W) + self.lstm_b # (batch_size, hidden*4)

            labels = tf.expand_dims(sentence[:,ind], 1)
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
            concated = tf.concat(1, [indices, labels])
            onehot_labels = tf.sparse_to_dense( concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)
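            # The three lines above build one-hot targets with the pre-1.0
            # TensorFlow API: pair each batch row index with its word index,
            # then scatter 1.0 into a dense (batch_size, n_words) matrix.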

            context_encode = context_encode + \
                 tf.expand_dims(tf.matmul(h, self.hidden_att_W), 1) + \
                 self.pre_att_b

            context_encode = tf.nn.tanh(context_encode)

            # context_encode is 3D here as well, so flatten before the matmul.
            context_encode_flat = tf.reshape(context_encode, [-1, self.dim_ctx]) # (batch_size*196, 512)
            alpha = tf.matmul(context_encode_flat, self.att_W) + self.att_b # (batch_size*196, 1)
            alpha = tf.reshape(alpha, [-1, self.ctx_shape[0]])
            alpha = tf.nn.softmax( alpha )

            weighted_context = tf.reduce_sum(context * tf.expand_dims(alpha, 2), 1)

            lstm_preactive = tf.matmul(h, self.lstm_U) + x_t + tf.matmul(weighted_context, self.image_encode_W)
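            # One fused matmul yields all four LSTM gate pre-activations;
            # split into input, forget, output gates and the cell candidate.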
            i, f, o, new_c = tf.split(1, 4, lstm_preactive)

            i = tf.nn.sigmoid(i)
            f = tf.nn.sigmoid(f)
            o = tf.nn.sigmoid(o)
            new_c = tf.nn.tanh(new_c)

            c = f * c + i * new_c
            h = o * tf.nn.tanh(new_c)

            logits = tf.matmul(h, self.decode_lstm_W) + self.decode_lstm_b
            logits = tf.nn.relu(logits)
            logits = tf.nn.dropout(logits, 0.5)

            logit_words = tf.matmul(logits, self.decode_word_W) + self.decode_word_b
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
            cross_entropy = cross_entropy * mask[:,ind]

            current_loss = tf.reduce_sum(cross_entropy)
            loss = loss + current_loss

        loss = loss / tf.reduce_sum(mask)
        return loss, context, sentence, mask

    def build_generator(self, maxlen):
        context = tf.placeholder("float32", [1, self.ctx_shape[0], self.ctx_shape[1]])
        h, c = self.get_initial_lstm(tf.reduce_mean(context, 1))
        
        #print(context[0].shape)
        #context_encode = tf.matmul(context[0], self.image_att_W)
        context_encode = tf.matmul(tf.squeeze(context,[0]), self.image_att_W)
        generated_words = []
        logit_list = []
        alpha_list = []
        word_emb = tf.zeros([1, self.dim_embed])
        for ind in range(maxlen):
            x_t = tf.matmul(word_emb, self.lstm_W) + self.lstm_b
            context_encode = context_encode + tf.matmul(h, self.hidden_att_W) + self.pre_att_b
            context_encode = tf.nn.tanh(context_encode)

            alpha = tf.matmul(context_encode, self.att_W) + self.att_b
            alpha = tf.reshape(alpha, [-1, self.ctx_shape[0]] )
            alpha = tf.nn.softmax(alpha)

            alpha = tf.reshape(alpha, (self.ctx_shape[0], -1))
            alpha_list.append(alpha)

            weighted_context = tf.reduce_sum(tf.squeeze(context) * alpha, 0)
            weighted_context = tf.expand_dims(weighted_context, 0)

            lstm_preactive = tf.matmul(h, self.lstm_U) + x_t + tf.matmul(weighted_context, self.image_encode_W)

            i, f, o, new_c = tf.split(1, 4, lstm_preactive)

            i = tf.nn.sigmoid(i)
            f = tf.nn.sigmoid(f)
            o = tf.nn.sigmoid(o)
            new_c = tf.nn.tanh(new_c)

            c = f*c + i*new_c
            h = o*tf.nn.tanh(new_c)

            logits = tf.matmul(h, self.decode_lstm_W) + self.decode_lstm_b
            logits = tf.nn.relu(logits)

            logit_words = tf.matmul(logits, self.decode_word_W) + self.decode_word_b

            max_prob_word = tf.argmax(logit_words, 1)

            with tf.device("/cpu:0"):
                word_emb = tf.nn.embedding_lookup(self.Wemb, max_prob_word)

            generated_words.append(max_prob_word)
            logit_list.append(logit_words)

        return context, generated_words, logit_list, alpha_list
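
The flatten → matmul → reshape workaround in build_model is easier to see outside the graph. Below is a minimal NumPy sketch of one soft-attention step, assuming the default ctx_shape=[196, 512] and toy random weights (every name here is illustrative, not taken from the class above):

import numpy as np

batch_size, n_regions, dim_ctx = 2, 196, 512
context = np.random.randn(batch_size, n_regions, dim_ctx)
image_att_W = 0.01 * np.random.randn(dim_ctx, dim_ctx)
att_W = 0.01 * np.random.randn(dim_ctx, 1)

# Flattening lets a plain 2D matmul hit every region of every image at once.
context_flat = context.reshape(-1, dim_ctx)              # (batch*196, 512)
context_encode = np.tanh(context_flat.dot(image_att_W))  # (batch*196, 512)

# One scalar score per region, softmaxed over the 196 regions of each image.
scores = context_encode.dot(att_W).reshape(batch_size, n_regions)
alpha = np.exp(scores - scores.max(axis=1, keepdims=True))
alpha /= alpha.sum(axis=1, keepdims=True)

# Attention-weighted average of the region features.
weighted_context = (context * alpha[:, :, None]).sum(axis=1)  # (batch, 512)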

In [3]:
###### Training parameters ######
n_epochs=20
batch_size=10
dim_embed=256
dim_ctx=2048
dim_hidden=256
ctx_shape=[1,2048]
#############################
###### Miscellaneous parameters ######
annotation_path = 'test_data/caption/caption.p'
feat_path = 'test_data/features/features.p'
model_path = 'test_data/model/'
#############################

In [4]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=0): # borrowed this function from NeuralTalk
    print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, )
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print 'filtered words from %d to %d' % (len(word_counts), len(vocab))

    ixtoword = {}
    ixtoword[0] = '#END#'  # end token at the end of the sentence. make first dimension be end token
    wordtoix = {}
    wordtoix['#START#'] = 0 # make first vector be the start token
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1

    word_counts['#END#'] = nsents
    bias_init_vector = np.array([1.0*word_counts[ixtoword[i]] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range
    return wordtoix, ixtoword, bias_init_vector
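
A quick sanity check of the vocabulary builder on two hypothetical captions (not the recipe data):

toy_captions = ['melt the butter', 'stir in the sugar']
wordtoix, ixtoword, bias_init = preProBuildWordVocab(toy_captions)
print wordtoix['#START#']  # 0: index 0 is reserved for the start token
print ixtoword[0]          # '#END#': index 0 decodes to the end token
print len(bias_init)       # one log-frequency bias per vocabulary entry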

In [5]:
def train(pretrained_model_path=None): # resume from a previously trained model if a path is given
    annotation_data = pickle.load(open(annotation_path,"rb"))
    captions = annotation_data.values()
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)

    learning_rate=0.001
    n_words = len(wordtoix)
    feats = np.load(feat_path)
    maxlen = np.max( map(lambda x: len(x.split(' ')), captions) )

    sess = tf.InteractiveSession()


    caption_generator = Caption_Generator(
            n_words=n_words,
            dim_embed=dim_embed,
            dim_ctx=dim_ctx,
            dim_hidden=dim_hidden,
            n_lstm_steps=maxlen+1, # +1: after predicting w1..wN the model must also predict the final '.'
            batch_size=batch_size,
            ctx_shape=ctx_shape,
            bias_init_vector=bias_init_vector)

    loss, context, sentence, mask = caption_generator.build_model()
    saver = tf.train.Saver(max_to_keep=50)

    
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    tf.initialize_all_variables().run()
    if pretrained_model_path is not None:
        print "Starting with pretrained model"
        saver.restore(sess, pretrained_model_path)

    index = annotation_data.keys()
    #np.random.shuffle(index)
    #annotation_data = annotation_data.ix[index]

    #captions = annotation_data['caption'].values
    image_id = annotation_data.keys()
                                  

    for epoch in range(n_epochs):
        for start, end in zip( \
                range(0, len(captions), batch_size),
                range(batch_size, len(captions), batch_size)):

            current_feats = np.array([feats[x] for x in image_id[start:end] ])
            current_feats = current_feats.reshape(-1, ctx_shape[1], ctx_shape[0]).swapaxes(1,2)

            current_captions = captions[start:end]
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions) # words outside the vocab (e.g. '.') are dropped

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)

            current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run([train_op, loss], feed_dict={
                context:current_feats,
                sentence:current_caption_matrix,
                mask:current_mask_matrix})

            print "Current Cost: ", loss_value
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

In [8]:
train()


preprocessing word counts and creating vocab based on word count threshold 0
filtered words from 1362 to 1362
Current Cost:  5.95998
Current Cost:  5.94182
Current Cost:  6.08181
Current Cost:  5.89515
Current Cost:  5.87336
Current Cost:  6.04251
Current Cost:  5.83051
Current Cost:  5.81027
Current Cost:  5.97738
Current Cost:  5.73983
Current Cost:  5.73094
Current Cost:  5.90942
Current Cost:  5.61007
Current Cost:  5.61181
Current Cost:  5.83284
Current Cost:  5.48976
Current Cost:  5.50577
Current Cost:  5.72968
Current Cost:  5.35807
Current Cost:  5.39083
Current Cost:  5.62616
Current Cost:  5.23496
Current Cost:  5.28849
Current Cost:  5.50061
Current Cost:  5.08424
Current Cost:  5.16502
Current Cost:  5.42624
Current Cost:  4.97845
Current Cost:  5.0459
Current Cost:  5.27823
Current Cost:  4.79709
Current Cost:  4.86554
Current Cost:  5.18148
Current Cost:  4.62536
Current Cost:  4.73223
Current Cost:  5.02811
Current Cost:  4.49578
Current Cost:  4.58808
Current Cost:  4.89913
Current Cost:  4.36374
Current Cost:  4.4395
Current Cost:  4.7526
Current Cost:  4.15757
Current Cost:  4.29297
Current Cost:  4.64205
Current Cost:  3.97815
Current Cost:  4.13749
Current Cost:  4.50258
Current Cost:  3.8999
Current Cost:  3.96775
Current Cost:  4.41955
Current Cost:  3.73163
Current Cost:  3.89565
Current Cost:  4.2278
Current Cost:  3.57264
Current Cost:  3.70824
Current Cost:  4.04702
Current Cost:  3.4147
Current Cost:  3.55959
Current Cost:  3.8858

In [6]:
def test(test_feat="test_data/features/test_feat.p", model_path='test_data/model/model-19'):
    annotation_data = pickle.load(open(annotation_path,"rb"))
    captions = annotation_data.values()
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)
    n_words = len(wordtoix)
    feat = np.load(test_feat)
    feat = np.array([feat[x] for x in ['200744']])
    feat = feat.reshape(-1, ctx_shape[1], ctx_shape[0]).swapaxes(1,2)
    maxlen = np.max( map(lambda x: len(x.split(' ')), captions) )

    sess = tf.InteractiveSession()
    #tf.reset_default_graph()

    caption_generator = Caption_Generator(
            n_words=n_words,
            dim_embed=dim_embed,
            dim_ctx=dim_ctx,
            dim_hidden=dim_hidden,
            n_lstm_steps=maxlen,
            batch_size=batch_size,
            ctx_shape=ctx_shape)

    context, generated_words, logit_list, alpha_list = caption_generator.build_generator(maxlen=maxlen)
    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    print("Model Restored")

    generated_word_index = sess.run(generated_words, feed_dict={context:feat})
    #alpha_list_val = sess.run(alpha_list, feed_dict={context:feat})
    generated_words = [ixtoword[x[0]] for x in generated_word_index]
    #punctuation = np.argmax(np.array(generated_words) == '.')+1

    #generated_words = generated_words[:punctuation]
    #alpha_list_val = alpha_list_val[:punctuation]
    return generated_words
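
Assuming a fresh kernel (the commented-out tf.reset_default_graph() hints that building two graphs in one session clashes) and the checkpoint saved by the run above at test_data/model/model-19, the generator can be invoked as:

words = test()
print ' '.join(words)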

In [5]:
def test2(test_feat="web_data/data/features.p", model_path='test_data/model/model-19'):
    annotation_data = pickle.load(open(annotation_path,"rb"))
    captions = annotation_data.values()
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)
    n_words = len(wordtoix)
    all_features = pickle.load(open(test_feat,"rb"))
    maxlen = np.max( map(lambda x: len(x.split(' ')), captions) )

    sess = tf.InteractiveSession()
    #tf.reset_default_graph()
    
    generated_recs = {}

    caption_generator = Caption_Generator(
            n_words=n_words,
            dim_embed=dim_embed,
            dim_ctx=dim_ctx,
            dim_hidden=dim_hidden,
            n_lstm_steps=maxlen,
            batch_size=batch_size,
            ctx_shape=ctx_shape)

    context, generated_words, logit_list, alpha_list = caption_generator.build_generator(maxlen=maxlen)
    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    print("Model Restored")
    
    for rec in all_features.keys():
        print("processing",rec)
        feat = np.array(all_features[rec])
        feat = feat.reshape(-1, ctx_shape[1], ctx_shape[0]).swapaxes(1,2)
        generated_word_index = sess.run(generated_words, feed_dict={context:feat})
        words = [ixtoword[x[0]] for x in generated_word_index]
        generated_recs[rec] = ' '.join(words)

    return generated_recs

In [ ]:
generated_recs = test2()
pickle.dump( generated_recs, open( "web_data/data/rnn_recipes.p", "wb" ) )

In [7]:
pickle.dump( generated_recs, open( "web_data/data/rnn_recipes.p", "wb" ) )

In [8]:
annotation_data = pickle.load(open(annotation_path,"rb"))

In [10]:
print(annotation_data['200744'])


4 pork chops
1 tablespoon butter
1/2 cup brown sugar
3/4 cup evaporated milk
1/4 cup light corn syrup
1/4 cup chopped pecans
In a large skillet over medium-high heat, cook the pork chops until no longer pink in the center, 5 to 7 minutes per side. An instant-read thermometer inserted into the center should read 145 degrees F (63 degrees C).
Melt the butter in a saucepan; stir the brown sugar into the melted butter until smooth. Add the evaporated milk and stir; bring the mixture to a boil and immediately remove from heat. Stir the corn syrup into the milk mixture; fold the pecans into the sauce. Spoon the sauce over the pork chops to serve.

