Copyright 2019 The TensorFlow Hub Authors.
Licensed under the Apache License, Version 2.0 (the "License");
In [1]:
# Copyright 2019 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
This notebook illustrates how to access the Multilingual Universal Sentence Encoder module and use it for sentence similarity across multiple languages. This module is an extension of the original Universal Encoder module.
The notebook is divided into three parts: environment setup and shared helper functions, cross-lingual sentence-similarity visualizations, and building a multilingual semantic-search index over a news corpus.
Research papers that make use of the models explored in this colab should cite:
Yinfei Yang, Daniel Cer, Amin Ahmad, Mandy Guo, Jax Law, Noah Constant, Gustavo Hernandez Abrego, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, and Ray Kurzweil. 2019. Multilingual Universal Sentence Encoder for Semantic Retrieval. arXiv preprint arXiv:1907.04307
In [2]:
%%capture
#@title Setup Environment
# Install the latest Tensorflow version.
!pip install tensorflow_text
!pip install bokeh
!pip install simpleneighbors[annoy]
!pip install tqdm
In [3]:
#@title Setup common imports and functions
import os

import bokeh
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from simpleneighbors import SimpleNeighbors
from tensorflow_text import SentencepieceTokenizer
from tqdm import tqdm
from tqdm import trange
def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):
  """Render an interactive heatmap of pairwise similarity between two sets of embeddings.

  Args:
    embeddings_1: sequence of embedding vectors; rows correspond to labels_1.
    embeddings_2: sequence of embedding vectors; rows correspond to labels_2.
    labels_1: x-axis text labels, one per row of embeddings_1.
    labels_2: y-axis text labels, one per row of embeddings_2.
    plot_title: title shown above the heatmap.
    plot_width: figure width in pixels.
    plot_height: figure height in pixels.
    xaxis_font_size: x-axis tick label font size, e.g. '12pt'.
    yaxis_font_size: y-axis tick label font size, e.g. '12pt'.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019).
  # Clip cosine values to [-1, 1]: floating-point noise can push
  # cosine_similarity fractionally outside arccos's domain and produce NaNs.
  cosine = sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                      embeddings_2)
  sim = 1 - np.arccos(np.clip(cosine, -1.0, 1.0)) / np.pi

  # Flatten the similarity matrix into (label_1, label_2, sim) triples for
  # bokeh's columnar data source.
  embeddings_1_col, embeddings_2_col, sim_col = [], [], []
  for i in range(len(embeddings_1)):
    for j in range(len(embeddings_2)):
      embeddings_1_col.append(labels_1[i])
      embeddings_2_col.append(labels_2[j])
      sim_col.append(sim[i][j])
  df = pd.DataFrame(zip(embeddings_1_col, embeddings_2_col, sim_col),
                    columns=['embeddings_1', 'embeddings_2', 'sim'])

  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  # NOTE: bokeh >= 3.0 removed figure's plot_width/plot_height keyword
  # arguments in favor of width/height. This function keeps its original
  # plot_width/plot_height parameter names for backward compatibility with
  # existing callers and maps them to the new keywords here.
  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            width=plot_width, height=plot_height,
                            tools="save", toolbar_location='below',
                            tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)
This is additional boilerplate code where we import the pre-trained ML model we will use to encode text throughout this notebook.
In [4]:
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
# `module_url` is selectable via the Colab form; both options are v3
# multilingual Universal Sentence Encoder modules from TF Hub.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' #@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']
# Downloads (or loads from cache) the module. tensorflow_text must already be
# imported so the SentencePiece ops the module needs are registered.
model = hub.load(module_url)
def embed_text(input):
  """Embed a batch of texts with the loaded Universal Sentence Encoder module.

  NOTE(review): the parameter shadows the builtin `input`; the name is kept
  so existing keyword callers are not broken.
  """
  embeddings = model(input)
  return embeddings
In [5]:
# Some texts of different lengths in different languages: a single word, a
# short sentence and a longer sentence, with (roughly) the same meaning in
# each of ten languages.
arabic_sentences = [
    'كلب',
    'الجراء لطيفة.',
    'أستمتع بالمشي لمسافات طويلة على طول الشاطئ مع كلبي.',
]
chinese_sentences = [
    '狗',
    '小狗很好。',
    '我喜欢和我的狗一起沿着海滩散步。',
]
english_sentences = [
    'dog',
    'Puppies are nice.',
    'I enjoy taking long walks along the beach with my dog.',
]
french_sentences = [
    'chien',
    'Les chiots sont gentils.',
    "J'aime faire de longues promenades sur la plage avec mon chien.",
]
german_sentences = [
    'Hund',
    'Welpen sind nett.',
    'Ich genieße lange Spaziergänge am Strand entlang mit meinem Hund.',
]
italian_sentences = [
    'cane',
    'I cuccioli sono carini.',
    'Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.',
]
japanese_sentences = [
    '犬',
    '子犬はいいです',
    '私は犬と一緒にビーチを散歩するのが好きです',
]
korean_sentences = [
    '개',
    '강아지가 좋다.',
    '나는 나의 산책을 해변을 따라 길게 산책하는 것을 즐긴다.',
]
russian_sentences = [
    'собака',
    'Милые щенки.',
    'Мне нравится подолгу гулять по пляжу со своей собакой.',
]
spanish_sentences = [
    'perro',
    'Los cachorros son agradables.',
    'Disfruto de dar largos paseos por la playa con mi perro.',
]

# Multilingual example: one passage whose fragments alternate between eight
# languages, paired with its fragment-by-fragment English translation.
multilingual_example = [
    "Willkommen zu einfachen, aber",
    "verrassend krachtige",
    "multilingüe",
    "compréhension du langage naturel",
    "модели.",
    "大家是什么意思",
    "보다 중요한",
    ".اللغة التي يتحدثونها",
]
multilingual_example_in_en = [
    "Welcome to simple yet",
    "surprisingly powerful",
    "multilingual",
    "natural language understanding",
    "models.",
    "What people mean",
    "matters more than",
    "the language they speak.",
]
In [6]:
# Compute embeddings for every monolingual sentence list (same order as the
# original per-language assignments).
(ar_result, en_result, es_result, de_result, fr_result,
 it_result, ja_result, ko_result, ru_result, zh_result) = (
    embed_text(sentences)
    for sentences in (arabic_sentences, english_sentences, spanish_sentences,
                      german_sentences, french_sentences, italian_sentences,
                      japanese_sentences, korean_sentences, russian_sentences,
                      chinese_sentences))

# Embeddings for the mixed-language example and its English translation.
multilingual_result = embed_text(multilingual_example)
multilingual_in_en_result = embed_text(multilingual_example_in_en)
In [7]:
# Mixed-language example: a bright diagonal shows that semantically
# equivalent fragments land close together regardless of language.
visualize_similarity(multilingual_in_en_result, multilingual_result,
                     multilingual_example_in_en, multilingual_example, "Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)")
In [8]:
# Pairwise similarity heatmaps between language pairs. NOTE(review): these
# nine cells are copies of the same call; a loop over (result, sentences,
# title) tuples would be more maintainable, but one cell per plot keeps each
# heatmap attached to its own output in the notebook.
visualize_similarity(en_result, ar_result, english_sentences, arabic_sentences, 'English-Arabic Similarity')
In [9]:
visualize_similarity(en_result, ru_result, english_sentences, russian_sentences, 'English-Russian Similarity')
In [10]:
visualize_similarity(en_result, es_result, english_sentences, spanish_sentences, 'English-Spanish Similarity')
In [11]:
visualize_similarity(en_result, it_result, english_sentences, italian_sentences, 'English-Italian Similarity')
In [12]:
visualize_similarity(it_result, es_result, italian_sentences, spanish_sentences, 'Italian-Spanish Similarity')
In [13]:
visualize_similarity(en_result, zh_result, english_sentences, chinese_sentences, 'English-Chinese Similarity')
In [14]:
visualize_similarity(en_result, ko_result, english_sentences, korean_sentences, 'English-Korean Similarity')
In [15]:
visualize_similarity(zh_result, ko_result, chinese_sentences, korean_sentences, 'Chinese-Korean Similarity')
Whereas in the previous example we visualized a handful of sentences, in this section we will build a semantic-search index over sentences from a news corpus. The index contains sentences in five languages (Arabic, Chinese, English, Russian and Spanish) to demonstrate the multilingual capabilities of the Universal Sentence Encoder.
First, we will download news sentences in multiple languages from the News Commentary Corpus [1]. Without loss of generality, this approach should also work for indexing the rest of the supported languages.
To speed up the demo, we limit to 1000 sentences per language.
In [16]:
# (language code, OPUS zip archive name, extracted sentence file, display name)
corpus_metadata = [
    ('ar', 'ar-en.txt.zip', 'News-Commentary.ar-en.ar', 'Arabic'),
    ('zh', 'en-zh.txt.zip', 'News-Commentary.en-zh.zh', 'Chinese'),
    ('en', 'en-es.txt.zip', 'News-Commentary.en-es.en', 'English'),
    ('ru', 'en-ru.txt.zip', 'News-Commentary.en-ru.ru', 'Russian'),
    ('es', 'en-es.txt.zip', 'News-Commentary.en-es.es', 'Spanish'),
]

language_to_sentences = {}   # code -> first 1,000 sentences (pandas Series)
language_to_news_path = {}   # code -> local path of the full sentence file

for language_code, zip_file, news_file, language_name in corpus_metadata:
  # Download the archive into the Keras cache dir and unpack it in place.
  zip_path = tf.keras.utils.get_file(
      fname=zip_file,
      origin='http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/' + zip_file,
      extract=True)
  # NOTE(review): assumes the archive extracts next to the downloaded zip;
  # newer Keras versions changed get_file's extraction layout — verify.
  news_path = os.path.join(os.path.dirname(zip_path), news_file)
  # One sentence per line; keep only the first 1,000 per language for the demo.
  # NOTE(review): read_csv with the default quoting may merge lines that
  # contain quote characters — confirm against the raw file if counts look off.
  language_to_sentences[language_code] = pd.read_csv(news_path, sep='\t', header=None)[0][:1000]
  language_to_news_path[language_code] = news_path
  print('{:,} {} sentences'.format(len(language_to_sentences[language_code]), language_name))
In [17]:
# Takes about 3 minutes
batch_size = 2048
language_to_embeddings = {}   # code -> list of embedding vectors

for language_code, zip_file, news_file, language_name in corpus_metadata:
  print('\nComputing {} embeddings'.format(language_name))
  # Only embed as many sentences as were kept above. The previous version
  # streamed and embedded the ENTIRE news file per language even though only
  # the first 1,000 sentences are ever added to the indexes below, and its
  # tqdm `total` did not match the number of rows actually processed.
  num_sentences = len(language_to_sentences[language_code])
  with tqdm(total=num_sentences) as pbar:
    # nrows caps the total rows read; chunksize keeps memory bounded while
    # letting embed_text run on large batches.
    for batch in pd.read_csv(language_to_news_path[language_code], sep='\t',
                             header=None, chunksize=batch_size,
                             nrows=num_sentences):
      language_to_embeddings.setdefault(language_code, []).extend(embed_text(batch[0]))
      pbar.update(len(batch))
We use the SimpleNeighbors library---which is a wrapper for the Annoy library---to efficiently look up results from the corpus.
In [18]:
%%time
# Takes about 8 minutes
num_index_trees = 40
language_name_to_index = {}
embedding_dimensions = len(list(language_to_embeddings.values())[0][0])
for language_code, zip_file, news_file, language_name in corpus_metadata:
print('\nAdding {} embeddings to index'.format(language_name))
index = SimpleNeighbors(embedding_dimensions, metric='dot')
for i in trange(len(language_to_sentences[language_code])):
index.add_one(language_to_sentences[language_code][i], language_to_embeddings[language_code][i])
print('Building {} index with {} trees...'.format(language_name, num_index_trees))
index.build(n=num_index_trees)
language_name_to_index[language_name] = index
In [19]:
%%time
# Takes about 13 minutes
num_index_trees = 60
print('Computing mixed-language index')
combined_index = SimpleNeighbors(embedding_dimensions, metric='dot')
for language_code, zip_file, news_file, language_name in corpus_metadata:
print('Adding {} embeddings to mixed-language index'.format(language_name))
for i in trange(len(language_to_sentences[language_code])):
annotated_sentence = '({}) {}'.format(language_name, language_to_sentences[language_code][i])
combined_index.add_one(annotated_sentence, language_to_embeddings[language_code][i])
print('Building mixed-language index with {} trees...'.format(num_index_trees))
combined_index.build(n=num_index_trees)
In this section we will demonstrate semantic retrieval: looking up the nearest corpus sentences to a query, first within a single language's index and then across the mixed-language index.
In this section we show how to retrieve sentences related to a set of sample English sentences. Things to try:
In [20]:
# Query a single-language index; all three values are settable via Colab forms.
sample_query = 'The stock market fell four points.' #@param ["Global warming", "Researchers made a surprising new discovery last week.", "The stock market fell four points.", "Lawmakers will vote on the proposal tomorrow."] {allow-input: true}
index_language = 'English' #@param ["Arabic", "Chinese", "English", "French", "German", "Russian", "Spanish"]
num_results = 10 #@param {type:"slider", min:0, max:100, step:10}

# Embed the query and take the first (only) row of the returned batch.
# NOTE(review): sample_query is a bare string while the model is elsewhere
# called with lists of strings — confirm the hub module accepts scalar input.
query_embedding = embed_text(sample_query)[0]
search_results = language_name_to_index[index_language].nearest(query_embedding, n=num_results)

print('{} sentences similar to: "{}"\n'.format(index_language, sample_query))
# Last expression: the result list is the cell's rich output.
search_results
Out[20]:
In [21]:
# Same query flow as above with a larger result count.
# NOTE: `index_language` intentionally carries over from the previous cell.
sample_query = 'The stock market fell four points.' #@param ["Global warming", "Researchers made a surprising new discovery last week.", "The stock market fell four points.", "Lawmakers will vote on the proposal tomorrow."] {allow-input: true}
num_results = 40 #@param {type:"slider", min:0, max:100, step:10}

query_embedding = embed_text(sample_query)[0]
search_results = language_name_to_index[index_language].nearest(query_embedding, n=num_results)

print('{} sentences similar to: "{}"\n'.format(index_language, sample_query))
search_results
Out[21]:
Try your own queries:
In [22]:
# Search the mixed-language index with a free-form user query.
query = 'The stock market fell four points.' #@param {type:"string"}
num_results = 30 #@param {type:"slider", min:0, max:100, step:10}

# BUG FIX: this cell previously embedded the stale `sample_query` from the
# cells above, silently ignoring the user's `query`.
query_embedding = embed_text(query)[0]
search_results = combined_index.nearest(query_embedding, n=num_results)

# The combined index spans all five languages, so don't label the results
# with `index_language` from the monolingual cells above.
print('Multilingual sentences similar to: "{}"\n'.format(query))
search_results
Out[22]:
Finally, we encourage you to try queries in any of the supported languages: English, Arabic, Chinese, Dutch, French, German, Italian, Japanese, Korean, Polish, Portuguese, Russian, Spanish, Thai and Turkish.
Also, even though we only indexed in a subset of the languages, you can also index content in any of the supported languages.
We offer variations of the Universal Encoder models optimized for various things like memory, latency and/or quality. Please feel free to experiment with them to find a suitable one.
We used Annoy to efficiently look up nearest neighbors. See the tradeoffs section to read about the number of trees (memory-dependent) and number of items to search (latency-dependent)---SimpleNeighbors only allows to control the number of trees, but refactoring the code to use Annoy directly should be simple, we just wanted to keep this code as simple as possible for the general user.
If Annoy does not scale for your application, please also check out FAISS.
All the best building your multilingual semantic applications!
[1] J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)