In [ ]:
import os, requests, shutil
download_dir = './data/RNN/'
data_cache = './data/cache'
def ensure_downloaded_and_prepared(expected_file, original_url, zipsize_check,
vocab_size=100000, embedding_dim=50):
print('"%s" preparation' % (expected_file, ))
final_path = os.path.join(download_dir, expected_file)
    download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/'+final_path
if not os.path.isfile( final_path ):
os.makedirs(download_dir, exist_ok=True)
# First, try to download a pre-prepared file directly...
response = requests.get(download_url, stream=True)
if response.status_code == requests.codes.ok:
print(" Downloading pre-prepared file from RedCatLabs")
with open(final_path, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
else:
# But, for some reason, RedCatLabs didn't give us the file directly
if not os.path.exists(data_cache):
os.makedirs(data_cache, exist_ok=True)
archivefile = original_url[ original_url.rfind('/')+1:]
archivefilepath = os.path.join(data_cache, archivefile)
if not os.path.isfile( archivefilepath ):
print(" Downloading file of size %d from %s" % (zipsize_check, original_url,))
response = requests.get(original_url, stream=True)
with open(archivefilepath, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
                # Check the size of the download against the expected value
                actual_size = os.path.getsize(archivefilepath)
                if actual_size != zipsize_check:
                    print("  Warning : downloaded %d bytes, expected %d" % (actual_size, zipsize_check,))
                print("  Finished Download")
vecfile = archivefile.replace('.zip', '').replace('.gz', '')
vecfilepath = os.path.join(data_cache, vecfile)
if not os.path.isfile( vecfilepath ):
if archivefile.endswith('.zip'):
print(' Unpacking "%s" from .zip' % (vecfile,))
import zipfile
zipfile.ZipFile(archivefilepath, 'r').extract(vecfile, data_cache)
if archivefile.endswith('.gz'):
print(' Unpacking "%s" from .gz' % (vecfile,))
import gzip
with gzip.open(archivefilepath, 'rb') as f_in:
with open(vecfilepath, 'wb') as f_out:
#f_out.write(f_in.read())
shutil.copyfileobj(f_in, f_out)
print(" Finished unpacking")
with open(vecfilepath, 'rt') as in_file:
with open(final_path, 'wt') as out_file:
print(" Reducing vec file to first 100k words, 50 columns")
print(' First line : "%s"' % (in_file.readline().strip(),))
out_file.write("%d %d\n" % (vocab_size, embedding_dim))
for i, l in enumerate(in_file): # .readlines() is not an iterator...
if i>=vocab_size: break
# Parse the line
arr = l.strip().split(' ')
word = arr[0]
nums = arr[1:embedding_dim+1]
out_file.write("%s %s\n" % (word, ' '.join(nums),))
            # Get rid of the archive source (the required text file itself will remain)
#os.unlink(archivefilepath)
#os.unlink(vecfilepath)
print(' File is available locally')
# See : https://fasttext.cc/docs/en/crawl-vectors.html
download_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors'
ensure_downloaded_and_prepared( # English
'wiki-news-300d-1M.vec.50d-100k.txt',
download_base+'/wiki-news-300d-1M.vec.zip',
682631666) # 683MB download
ensure_downloaded_and_prepared( # Chinese (Mandarin)
'cc.zh.300.vec.50d-100k.txt',
download_base+'/word-vectors-v2/cc.zh.300.vec.gz',
1358778961) # 1.36GB download
ensure_downloaded_and_prepared( # Malay
'cc.ms.300.vec.50d-100k.txt',
download_base+'/word-vectors-v2/cc.ms.300.vec.gz',
710958603) # 711MB download
ensure_downloaded_and_prepared( # French
'cc.fr.300.vec.50d-100k.txt',
download_base+'/word-vectors-v2/cc.fr.300.vec.gz',
1287757366) # 1.29GB download
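In [ ]:
# Quick sanity check (an optional addition, not in the original workshop code):
# each prepared file should start with the "vocab_size embedding_dim" header
# line written by ensure_downloaded_and_prepared(), i.e. "100000 50"
with open(os.path.join(download_dir, 'wiki-news-300d-1M.vec.50d-100k.txt'), 'rt') as f:
    print(f.readline().strip())  # expect : 100000 50
    print(f.readline()[:60])     # first word, followed by its 50 values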
In [ ]:
#! ls -l ./data/cache/
#! rm ./data/cache/wiki-news-300d-1M.vec.zip
In [ ]:
# pip install gensim==3.4.0
# NB: this notebook relies on the gensim 3.x API (.vocab, .index2word,
#     init_sims), which changed incompatibly in gensim 4.x
import gensim
gensim.__version__ # '3.4.0'
In [ ]:
en_vecfile = './data/RNN/wiki-news-300d-1M.vec.50d-100k.txt'
#xx_vecfile = './data/RNN/cc.zh.300.vec.50d-100k.txt'
#xx_vecfile = './data/RNN/wiki-news-300d-1M.vec.50d-100k.txt'
xx_vecfile = './data/RNN/cc.fr.300.vec.50d-100k.txt'
In [ ]:
from gensim.models import KeyedVectors
# Create the English-language model from the vectors stored on disk
en_model = KeyedVectors.load_word2vec_format(en_vecfile)
en_model.init_sims(replace=True)
len(en_model.vocab), en_model.vector_size # Vocab size and dim (expect 100k x 50)
In [ ]:
', '.join([ f for f in dir(en_model) if not f.startswith('_') ])
In [ ]:
import random
', '.join([ en_model.index2word[random.randrange(len(en_model.vocab))]  # randrange avoids randint's off-by-one
            for _ in range(30)])
In [ ]:
# Pick a word
find_similar_to = 'dog'
# Finding out similar words
for similar_word in en_model.similar_by_word(find_similar_to, topn=10):
print("Similarity: %.2f, Word: %s" % ( similar_word[1], similar_word[0],))
In [ ]:
def test_analogy(s): # A is to B as C is to 'D'?
(a,b,c,d) = s.split(' ')
print("Trying for '%s'" % (d,))
for similar_word in en_model.most_similar(
positive=[b,c], negative=[a], topn=3):
print(" Similarity: %.2f, Word: %s" % ( similar_word[1], similar_word[0],))
test_analogy('man woman king queen')
test_analogy('paris france london england')
test_analogy('kitten cat puppy dog')
test_analogy('look looked run ran')
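In [ ]:
# Under the hood, most_similar() is (roughly) vector arithmetic : the analogy
# 'man:woman :: king:?' becomes woman + king - man, followed by a nearest-
# neighbour search.  A hand-rolled sketch of the same idea (gensim's version
# also excludes the query words from the results, which this does not):
vec = ( en_model.get_vector('woman')
      + en_model.get_vector('king')
      - en_model.get_vector('man') )
en_model.similar_by_vector(vec, topn=5)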
In [ ]:
# Let's load in another language
In [ ]:
xx_model = KeyedVectors.load_word2vec_format(xx_vecfile)
xx_model.init_sims(replace=True)
In [ ]:
# Pick some pairs of translations...
translations = (
'house:屋 door:门 wheel:轮 money:钱 book:书 ear:耳 mouth:嘴 '
+'electronic:电子 car:汽车 key:键 village:村 student:学生 '
+'sky:天空 mountain:山 tree:树 river:河 beach:海滩 rain:雨 '
+'bird:鸟 fish:鱼 butterfly:蝴蝶 cow:牛 rat:鼠 strawberry:草莓 honey:蜜 '
+'jump:跳 speak:说话 count:计数 explain:说明 climb:爬 '
+'tall:高 heavy:重 red:红 gold:金 ancient:古 rapid:快速 '
+'seven:七 thousand:千 circle:圈 perpendicular:垂直 fraction:分数 '
+'hero:英雄 sword:剑 awkward:尴尬 night:晚 arrival:到达 '
#+'aggressive:侵略性 discount:折扣 apartment:公寓 computer:电脑 '
).strip().split()
len(translations)
In [ ]:
translations = [ '%s:%s' % (w,w) for w in (
'house door wheel money book ear mouth '
+'electronic car key village student '
+'sky mountain tree river beach rain '
+'bird fish butterfly cow rat strawberry honey '
+'jump speak count explain climb '
+'tall heavy red gold ancient rapid '
+'seven thousand circle perpendicular fraction '
+'hero sword awkward night arrival '
+'').strip().split() ]
len(translations)
In [ ]:
translations = (
'house:maison door:porte wheel:roue '
+'money:argent book:livre ear:oreille mouth:bouche '
+'electronic:electronic car:voiture key:clé '
+'village:village student:étudiant '
+'sky:ciel mountain:montagne tree:arbre '
+'river:rivière beach:plage rain:pluie '
+'bird:oiseau fish:poisson butterfly:papillon cow:vache '
+'rat:rat strawberry:fraise honey:miel '
+'jump:sauter speak:parler count:compter explain:expliquer climb:monter '
+'tall:grand heavy:lourd red:rouge gold:or ancient:ancien rapid:rapide '
+'seven:sept thousand:mille circle:cercle '
+'perpendicular:perpendiculaire fraction:fraction '
+'hero:héros sword:épée awkward:gênant night:nuit arrival:arrivée '
+'aggressive:agressif discount:remise apartment:appartement computer:ordinateur '
).strip().split()
len(translations)
In [ ]:
import numpy as np
np.set_printoptions(precision=3)
en_arr, xx_arr = [], []
for en_word,xx_word in [pair.split(':') for pair in translations]:
    if en_word not in en_model.vocab:
        print("Failed to find %s (~%s:%s)" % (en_word, en_word, xx_word))
        continue
    if xx_word not in xx_model.vocab:
        print("Failed to find %s (~%s:%s)" % (xx_word, en_word, xx_word))
        continue
en_arr.append( en_model.get_vector(en_word))
xx_arr.append( xx_model.get_vector(xx_word))
len(en_arr), en_arr[0]
In [ ]:
en_known = np.array( en_arr ).T
xx_known = np.array( xx_arr ).T
In [ ]:
en_known_inv = np.linalg.pinv(en_known)
en_known.shape, en_known_inv.shape
In [ ]:
np.dot(en_known_inv, en_known)[:3,:3] # Looks v. close to I(n,n), where n = number of word pairs found
#np.dot(en_known, en_known_inv)[:3,:3] # A rank-n projector in the 50-d space, so only roughly I(50,50)
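In [ ]:
# Why the pseudo-inverse?  A = xx_known . pinv(en_known) is the least-squares
# solution of A . en_known ~= xx_known : the 50x50 linear map that best sends
# the known English vectors onto their translations.  A rough fit check
# (an addition to the original; a small relative residual means the linear
# map fits the training pairs well):
A_fit = np.dot(xx_known, en_known_inv)
np.linalg.norm(np.dot(A_fit, en_known) - xx_known) / np.linalg.norm(xx_known)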
In [ ]:
# 'butterfly' is one of the training pairs, so expressing its vector in terms
# of the known English vectors should give a (nearly) one-hot result
en_vec = en_model.get_vector('butterfly').T
np.dot(en_known_inv, en_vec)>0.8
In [ ]:
#xx_vec = np.dot(xx_known, np.dot(en_known_inv, en_vec))
xx_vec = np.dot( np.dot(xx_known, en_known_inv), en_vec)
xx_vec
In [ ]:
for xx_word in xx_model.similar_by_vector(xx_vec, topn=3):
print(" Similarity: %.2f, Word: %s" % ( xx_word[1], xx_word[0],))
In [ ]:
#np.dot(xx_known, en_known_inv)[:3,:3]
In [ ]:
# Translate en->xx :  A.en = xx  =>  A = xx.pinv(en)
en_trans = A = np.dot(xx_known, np.linalg.pinv(en_known))
# Translate xx->en :  B.xx = en  =>  B = en.pinv(xx)
xx_trans = B = np.dot(en_known, np.linalg.pinv(xx_known))
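In [ ]:
# Optional sanity check (not in the original) : translating en->xx->en should
# roughly recover the original vector, at least for words in the training set
v = en_model.get_vector('tree')
round_trip = np.dot(xx_trans, np.dot(en_trans, v))
np.dot(v, round_trip) / np.linalg.norm(round_trip)  # ~1.0 means good recovery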
In [ ]:
#en_trans[:3,:3]
In [ ]:
# Let's attempt a translation of some unknown words
In [ ]:
en_test = 'butterfly cat chair bee twenty entrance haircut'
for en_word in en_test.split(' '):
print("Translate : '%s'" % (en_word,))
en_vec = en_model.get_vector(en_word).T
xx_vec = np.dot(en_trans, en_vec)
#print(en_vec.shape, en_trans.shape)
#print(xx_vec); break
for xx_word in xx_model.similar_by_vector(xx_vec, topn=3):
print(" Similarity: %.2f, Word: %s" % ( xx_word[1], xx_word[0],))
xx_vec_sim = xx_model.get_vector(xx_word[0]).T
en_vec_sim = np.dot(xx_trans, xx_vec_sim)
for en_word_2 in en_model.similar_by_vector(en_vec_sim, topn=3):
print(" Similarity: %.2f, Word: %s" % ( en_word_2[1], en_word_2[0],))
# ideal(?) answers for the Chinese model ('butterfly' is already in the training set) : '猫 椅子 蜜蜂 二十 入口 理发'
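In [ ]:
# A rough quality measure for the mapping (a sketch, not part of the original
# notebook) : for each training pair, translate the English vector and count
# how often the expected target word appears in the top-3 nearest neighbours
hits, total = 0, 0
for en_word, xx_word in [pair.split(':') for pair in translations]:
    if en_word not in en_model.vocab or xx_word not in xx_model.vocab:
        continue
    xx_guess = np.dot(en_trans, en_model.get_vector(en_word))
    top3 = [w for w, _ in xx_model.similar_by_vector(xx_guess, topn=3)]
    hits += 1 if xx_word in top3 else 0
    total += 1
print("Top-3 recovery on training pairs : %d/%d" % (hits, total))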