In [ ]:
import os
import tensorflow as tf
import numpy as np
In [ ]:
from tensorflow.contrib.tensorboard.plugins import projector

def embedding_to_tensorboard(word_vectors, vocab_arr, name):  # e.g. name='embedding_title'
    # word_vectors : (vocab_size, embedding_dim) array of embedding vectors
    # vocab_arr    : list of vocab_size labels, one per row of word_vectors
    embedding_var = tf.Variable(word_vectors, dtype='float32', name=name)

    # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
    projector_config = projector.ProjectorConfig()

    # You can add multiple embeddings. Here we add only one.
    embedding = projector_config.embeddings.add()
    embedding.tensor_name = embedding_var.name

    # Link this tensor to its metadata file (e.g. labels) : one label per line,
    # in the same order as the rows of word_vectors.
    LOG_DIR = '../../tensorflow.logdir/'
    metadata_file = name + '.tsv'
    with open(os.path.join(LOG_DIR, metadata_file), 'wt') as metadata:
        metadata.writelines("%s\n" % w for w in vocab_arr)
    embedding.metadata_path = os.path.join(os.getcwd(), LOG_DIR, metadata_file)

    # Use the same LOG_DIR where you stored your checkpoint.
    summary_writer = tf.summary.FileWriter(LOG_DIR)

    # The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
    # read this file during startup.
    projector.visualize_embeddings(summary_writer, projector_config)

    # Save the embedding variable itself -- the projector loads its values
    # from this checkpoint.
    saver = tf.train.Saver([embedding_var])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.save(sess, os.path.join(LOG_DIR, metadata_file + '.ckpt'))

    print("Look at the embedding in TensorBoard : http://localhost:8081/#projector")
In [ ]:
import glove
glove_dir = './data/RNN/'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)

# Parse the standard GloVe text format (one "word v1 v2 ... vD" line per entry)
glove_embedding = glove.Glove.load_stanford(glove_100k_50d_path)
#glove_embedding.word_vectors.shape

# Rebuild the vocab list in row order from the index -> word mapping
embedding_to_tensorboard(glove_embedding.word_vectors,
                         [glove_embedding.inverse_dictionary[i]
                          for i in range(len(glove_embedding.inverse_dictionary))],
                         'wiki_embedding')
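As a quick sanity check that the vectors loaded sensibly, glove-python's Glove class (assuming the glove module imported above is the maciejkula glove-python package) offers a nearest-neighbour helper; the query word 'king' is just an illustrative choice:

In [ ]:
# Sanity check: nearest neighbours in the loaded embedding.
# 'king' is an arbitrary example query -- any word in the first-100k vocab works.
glove_embedding.most_similar('king', number=5)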
In [ ]:
import codecs

# word2vec-style text format: first line is "<count> <dim>", then one
# "token v1 v2 ... vD" line per entry
with codecs.open('poc3-spotify/st-ner.model.txt', "r", encoding='utf-8', errors='ignore') as fner:
    ner_len, ner_dim = [int(i) for i in fner.readline().strip().split(' ')]
    print(ner_len, ner_dim)
    # Skip image-filename entries that crept into the model file
    valid_lines = [l for l in fner.readlines() if 'jpg' not in l]
    #valid_lines = fner.readlines()

ner_vec = np.zeros((len(valid_lines), ner_dim))
ner_str_arr = []
for idx, l in enumerate(valid_lines):
    l_str = l.strip().split(' ')
    ner_str_arr.append(l_str[0])
    ner_vec[idx] = np.array([float(x) for x in l_str[1:]])

# Normalise all vector rows to L2=1
ner_vec = ner_vec / np.linalg.norm(ner_vec, axis=1, keepdims=True)

ner_str_to_idx = {s: idx for idx, s in enumerate(ner_str_arr)}
print(ner_vec.shape, len(ner_str_arr))

embedding_to_tensorboard(ner_vec,
                         ner_str_arr,
                         'NER_embedding')
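Since every row of ner_vec was normalised to unit L2 length above, a plain dot product between rows is exactly their cosine similarity. A minimal sketch of a nearest-neighbour lookup over the NER embedding, reusing the arrays built above (the query key is hypothetical; use any token present in ner_str_to_idx):

In [ ]:
def ner_nearest(query, k=5):
    # Rows of ner_vec are unit length, so a dot product is a cosine similarity.
    sims = ner_vec.dot(ner_vec[ner_str_to_idx[query]])
    best = np.argsort(-sims)[:k + 1]  # +1 since the query matches itself
    return [(ner_str_arr[i], sims[i]) for i in best if ner_str_arr[i] != query][:k]

#ner_nearest('some_token', k=5)  # pick a key that exists in ner_str_to_idx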
In [ ]: