In [ ]:
import numpy as np
import os
import pickle
import time
SENTENCE_LENGTH_MAX = 32
EMBEDDING_DIM = 50
In [ ]:
import nltk
nltk.download('punkt')
In [ ]:
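# Load NLTK's pre-trained Punkt sentence splitter and try it on some tricky abbreviations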
sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sentence_splitter.tokenize("This is Mr. Smith's tokenized test. The U.S.A gives us sent two. Is this sent three?")
In [ ]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This is Mr. Smith's tokenized test.")
From the corpus download page: http://wortschatz.uni-leipzig.de/en/download/
Here's the paper that explains how the corpus was constructed:
In [ ]:
corpus_dir = './data/RNN/'
corpus_text_file = os.path.join(corpus_dir, 'en.wikipedia.2010.100K.txt')
In [ ]:
if not os.path.isfile( corpus_text_file ):
    if not os.path.exists(corpus_dir):
        os.makedirs(corpus_dir)

    corpus_text_tar = 'eng_wikipedia_2010_100K.tar.gz'
    download_url = 'http://pcai056.informatik.uni-leipzig.de/downloads/corpora/'+corpus_text_tar

    data_cache = './data/cache'
    if not os.path.exists(data_cache):
        os.makedirs(data_cache)

    # Fall-back url if too slow
    #download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/'+corpus_text_tar

    import shutil, requests

    # Get the download path from the web-service
    #urllib.request.urlretrieve('http://wortschatz.uni-leipzig.de/download/service', corpus_text_tar)
    # download_url = ...

    tarfilepath = os.path.join(data_cache, corpus_text_tar)
    if not os.path.isfile( tarfilepath ):
        response = requests.get(download_url, stream=True)
        with open(tarfilepath, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    if os.path.isfile(tarfilepath):
        import tarfile
        #tarfile.open(tarfilepath, 'r:gz').extractall(corpus_dir)
        tarfile.open(tarfilepath, 'r:gz').extract('eng_wikipedia_2010_100K-sentences.txt', corpus_dir)
        shutil.move(os.path.join(corpus_dir, 'eng_wikipedia_2010_100K-sentences.txt'), corpus_text_file)

    # Get rid of tarfile source (the required text file itself will remain)
    #os.unlink(tarfilepath)

print("Corpus available locally")
In [ ]:
## This is a work-in-progress, since we should really discover 'download_url' from the 'service'
#r=requests.post('http://wortschatz.uni-leipzig.de/download/service', data='file=%s&func="link"' % (corpus_text_tar,))
#r=requests.post('http://wortschatz.uni-leipzig.de/download/service', data=dict(file=corpus_text_tar, func="link") )
#r.text
In [ ]:
def corpus_sentence_tokens(corpus_text_file=corpus_text_file):
    while True:
        with open(corpus_text_file, encoding='utf-8') as f:
            for line in f.readlines():
                n, l = line.split('\t')                  # Strip off the initial line numbers
                for s in sentence_splitter.tokenize(l):  # Split the lines into sentences (~1 each)
                    tree_banked = tokenizer.tokenize(s)
                    if len(tree_banked) < SENTENCE_LENGTH_MAX:
                        yield tree_banked
        print("Corpus : Looping")
corpus_sentence_tokens_gen = corpus_sentence_tokens()
In [ ]:
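# Peek at one tokenized sentence from the generator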
' | '.join(next(corpus_sentence_tokens_gen))
Using the Python package https://github.com/maciejkula/glove-python, and code samples from http://developers.lyst.com/2014/11/11/word-embeddings-for-fashion/
In [ ]:
! pip install glove_python
In [ ]:
import glove
glove_corpus = glove.Corpus()
corpus_sentences = [
    [ w.lower() for w in next(corpus_sentence_tokens_gen) ]  # All lower-case
    for _ in range(0, 100*1000)
]
# Fit the co-occurrence matrix using a sliding window of 10 words.
t0 = time.time()
glove_corpus.fit(corpus_sentences, window=10)
print("Dictionary length=%d" % (len(glove_corpus.dictionary),))
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))
In [ ]:
# Return the index of the word in the dictionary
glove_corpus.dictionary['city']
In [ ]:
word_embedding = glove.Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)
t0 = time.time()
glove_epochs, glove_threads = 20, 4
word_embedding.fit(glove_corpus.matrix, epochs=glove_epochs, no_threads=glove_threads, verbose=True)
print("%d-d word-embedding created in %5.1fsec = %5.1fsec per epoch" % (
EMBEDDING_DIM, (time.time()-t0), (time.time()-t0)/glove_epochs*glove_threads, ))
# Add the word -> id dictionary to the model to allow similarity queries.
word_embedding.add_dictionary(glove_corpus.dictionary)
In [ ]:
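# Uncomment to cache the trained embedding to disk, or to reload it later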
#word_embedding.save("./data/RNN/glove.embedding.50.pkl")
#word_embedding = glove.Glove.load("./data/RNN/glove.embedding.50.pkl")
In [ ]:
# word-similarity test
word_embedding.most_similar('country')
In [ ]:
# word-analogy test
def get_embedding_vec(word):
    idx = word_embedding.dictionary.get(word.lower(), -1)
    if idx < 0:
        #print("Missing word : '%s'" % (word,))
        return np.zeros( (EMBEDDING_DIM, ), dtype='float32')  # UNK
    return word_embedding.word_vectors[idx]

def get_closest_word(vec, number=5):
    dst = (np.dot(word_embedding.word_vectors, vec)
           / np.linalg.norm(word_embedding.word_vectors, axis=1)
           / np.linalg.norm(vec))
    word_ids = np.argsort(-dst)
    return [(word_embedding.inverse_dictionary[x], dst[x]) for x in word_ids[:number]
            if x in word_embedding.inverse_dictionary]
In [ ]:
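# Classic analogy : 'woman' + 'king' - 'man' should land near 'queen'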
analogy_vec = get_embedding_vec('woman') + get_embedding_vec('king') - get_embedding_vec('man')
get_closest_word(analogy_vec)
In [ ]:
def test_analogy(s='one two three four'):
    (a, b, c, d) = s.split(' ')
    analogy_vec = get_embedding_vec(b) - get_embedding_vec(a) + get_embedding_vec(c)
    words = [ w for (w, p) in get_closest_word(analogy_vec) if w not in (a, b, c) ]
    print("'%s' is to '%s' as '%s' is to {%s}" % (a, b, c, ', '.join(words)))
In [ ]:
test_analogy('man woman king queen')
test_analogy('paris france rome italy')
test_analogy('kitten cat puppy dog')
test_analogy('understand understood run ran')
Since the embedding we learnt above is poor (it was trained on a small corpus for only a few epochs), let's load a pre-trained word embedding built from a much larger corpus and trained for much longer. Source of this word embedding (created from a corpus of 6 billion tokens, distributed as 50-d vectors): http://nlp.stanford.edu/projects/glove/
NB: If you don't have the required data, and the RedCatLabs server doesn't give you the download, the loader below downloads an 823Mb file over a fairly slow connection from a server at Stanford (this can take HOURS).
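(As an aside, a GloVe-format text file can also be parsed without glove-python: each line is simply a word followed by its 50 space-separated float components. Below is a minimal sketch of such a loader; load_glove_txt is a hypothetical helper, and it assumes glove_100k_50d_path, defined in the next cell, already points at a downloaded file.)
In [ ]:
# Hypothetical fallback loader : parse a GloVe-format text file directly with NumPy
# (assumes each line is "word v1 v2 ... v50", space-separated)
def load_glove_txt(path):
    words, vecs = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            words.append(parts[0])
            vecs.append(np.asarray(parts[1:], dtype='float32'))
    return words, np.stack(vecs)

#glove_words, glove_vectors = load_glove_txt(glove_100k_50d_path)
#glove_vectors.shape   # expected : (100000, 50)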
In [ ]:
import os, requests, shutil
glove_dir = './data/RNN/'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)
# These are temporary files if we need to download it from the original source (slow)
data_cache = './data/cache'
glove_full_tar = 'glove.6B.zip'
glove_full_50d = 'glove.6B.50d.txt'
#force_download_from_original=False
download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/'+glove_100k_50d
original_url = 'http://nlp.stanford.edu/data/'+glove_full_tar
if not os.path.isfile( glove_100k_50d_path ):
    if not os.path.exists(glove_dir):
        os.makedirs(glove_dir)

    # First, try to download a pre-prepared file directly...
    response = requests.get(download_url, stream=True)
    if response.status_code == requests.codes.ok:
        print("Downloading 42Mb pre-prepared GloVE file from RedCatLabs")
        with open(glove_100k_50d_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    else:
        # But, for some reason, RedCatLabs didn't give us the file directly
        if not os.path.exists(data_cache):
            os.makedirs(data_cache)

        if not os.path.isfile( os.path.join(data_cache, glove_full_50d) ):
            zipfilepath = os.path.join(data_cache, glove_full_tar)
            if not os.path.isfile( zipfilepath ):
                print("Downloading 860Mb GloVE file from Stanford")
                response = requests.get(original_url, stream=True)  # fall back to the original (Stanford) source
                with open(zipfilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
            if os.path.isfile(zipfilepath):
                print("Unpacking 50d GloVE file from zip")
                import zipfile
                zipfile.ZipFile(zipfilepath, 'r').extract(glove_full_50d, data_cache)

        with open(os.path.join(data_cache, glove_full_50d), 'rt') as in_file:
            with open(glove_100k_50d_path, 'wt') as out_file:
                print("Reducing 50d GloVE file to first 100k words")
                for i, l in enumerate(in_file.readlines()):
                    if i >= 100000: break
                    out_file.write(l)

        # Get rid of zipfile source (the required text file itself will remain)
        #os.unlink(zipfilepath)
        #os.unlink(os.path.join(data_cache, glove_full_50d))

print("GloVE available locally")
In [ ]:
# Due to size constraints, only use the first 100k vectors (i.e. 100k most frequently used words)
word_embedding = glove.Glove.load_stanford( glove_100k_50d_path )
word_embedding.word_vectors.shape
Having loaded that, play around with the similarity and analogy tests again...
In [ ]:
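# word-similarity test on the pre-trained embedding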
word_embedding.most_similar('king')
In [ ]:
test_analogy('man woman king queen')
test_analogy('paris france rome italy')
test_analogy('kitten cat puppy dog')
test_analogy('understand understood run ran')
In [ ]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
#N = 10000 # Number of items (vocab size).
#D = 200 # Dimensionality of the embedding.
#embedding_var = tf.Variable(tf.random_normal([N,D]), name='word_embedding')
embedding_var = tf.Variable(word_embedding.word_vectors, dtype='float32',
                            name='word_embedding')
# Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
projector_config = projector.ProjectorConfig()
# You can add multiple embeddings. Here we add only one.
embedding = projector_config.embeddings.add()
embedding.tensor_name = embedding_var.name
# Link this tensor to its metadata file (e.g. labels).
LOG_DIR='../../tensorflow.logdir/'
os.makedirs(LOG_DIR, exist_ok=True)
metadata_file = 'glove_full_50d.words.tsv'
vocab_list = [ word_embedding.inverse_dictionary[i]
               for i in range(len( word_embedding.inverse_dictionary )) ]

with open(os.path.join(LOG_DIR, metadata_file), 'wt') as metadata:
    metadata.writelines("%s\n" % w for w in vocab_list)
embedding.metadata_path = os.path.join(os.getcwd(), LOG_DIR, metadata_file)
# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter(LOG_DIR)
# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
# read this file during startup.
projector.visualize_embeddings(summary_writer, projector_config)
saver = tf.train.Saver([embedding_var])
with tf.Session() as sess:
    # Initialize the model
    sess.run(tf.global_variables_initializer())
    saver.save(sess, os.path.join(LOG_DIR, metadata_file+'.ckpt'))
#print("Look at the embedding in TensorBoard : http://localhost:8081/")
In [ ]:
# Start the tensorboard server on this (colab) machine
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 8081 &'
    .format(LOG_DIR)
)
In [ ]:
# Install 'localtunnel' (a node.js proxy) -- work a little harder to avoid global install
! npm install localtunnel
! ls -l node_modules/localtunnel/bin/client
In [ ]:
# Tunnel port 8081 (TensorBoard assumed running)
get_ipython().system_raw('node_modules/localtunnel/bin/client --port 8081 >> tunnel_url.txt 2>&1 &')
# Check that it's running
! ps fax | grep node | grep 8081
In [ ]:
# Get url - this should be available on the web
# (tunnels into colab via localtunnel to its tensorboard)
! cat tunnel_url.txt
In [ ]: