In [ ]:
import os, requests, shutil

download_dir = './data/RNN/'
data_cache = './data/cache'

def ensure_downloaded_and_prepared(expected_file, original_url, zipsize_check, 
                                   vocab_size=100000, embedding_dim=50):
    print('"%s" preparation' % (expected_file, ))
    final_path = os.path.join(download_dir, expected_file)

    download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/' + final_path
    
    if not os.path.isfile( final_path ):
        os.makedirs(download_dir, exist_ok=True)

        # First, try to download a pre-prepared file directly...
        response = requests.get(download_url, stream=True)
        if response.status_code == requests.codes.ok:
            print("  Downloading pre-prepared file from RedCatLabs")
            with open(final_path, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        else:
            # But, for some reason, RedCatLabs didn't give us the file directly
            os.makedirs(data_cache, exist_ok=True)

            archivefile = original_url[ original_url.rfind('/')+1:]
            archivefilepath = os.path.join(data_cache, archivefile)
            
            if not os.path.isfile( archivefilepath ):
                print("  Downloading file of size %d from %s" % (zipsize_check, original_url,))
                response = requests.get(original_url, stream=True)
                with open(archivefilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                # Check the size of the download against what we expected
                if os.path.getsize(archivefilepath) != zipsize_check:
                    print("  WARNING : downloaded %d bytes, expected %d - file may be corrupt" % (
                        os.path.getsize(archivefilepath), zipsize_check,))
                print("  Finished Download")
                
            vecfile = archivefile.replace('.zip', '').replace('.gz', '')
            vecfilepath = os.path.join(data_cache, vecfile)
            if not os.path.isfile( vecfilepath ):
                if archivefile.endswith('.zip'):
                    print('  Unpacking "%s" from .zip' % (vecfile,))
                    import zipfile
                    zipfile.ZipFile(archivefilepath, 'r').extract(vecfile, data_cache)
                if archivefile.endswith('.gz'):
                    print('  Unpacking "%s" from .gz' % (vecfile,))
                    import gzip
                    with gzip.open(archivefilepath, 'rb') as f_in:
                        with open(vecfilepath, 'wb') as f_out:
                            #f_out.write(f_in.read())
                            shutil.copyfileobj(f_in, f_out)
                print("  Finished unpacking")

            with open(vecfilepath, 'rt', encoding='utf-8') as in_file:
                with open(final_path, 'wt', encoding='utf-8') as out_file:
                    print("  Reducing vec file to first %d words, %d columns" % (vocab_size, embedding_dim,))
                    print('  First line : "%s"' % (in_file.readline().strip(),))
                    out_file.write("%d %d\n" % (vocab_size, embedding_dim))
                    for i, l in enumerate(in_file): # .readlines() is not an iterator...
                        if i>=vocab_size: break
                        # Parse the line
                        arr = l.strip().split(' ')
                        word = arr[0]
                        nums = arr[1:embedding_dim+1]
                        out_file.write("%s %s\n" % (word, ' '.join(nums),))

            # Optional clean-up : uncomment to delete the archive and full-size vec file
            # (the reduced text file itself will remain)
            #os.unlink(archivefilepath)
            #os.unlink(vecfilepath)

    print('  File is available locally')

    
# See : https://fasttext.cc/docs/en/crawl-vectors.html
download_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors'

ensure_downloaded_and_prepared(  # English
    'wiki-news-300d-1M.vec.50d-100k.txt', 
    download_base+'/wiki-news-300d-1M.vec.zip',
     682631666)  # 683MB download

ensure_downloaded_and_prepared(  # Chinese (Mandarin)
    'cc.zh.300.vec.50d-100k.txt', 
    download_base+'/word-vectors-v2/cc.zh.300.vec.gz',
    1358778961)  # 1.36GB download

ensure_downloaded_and_prepared(  # Malay
    'cc.ms.300.vec.50d-100k.txt',
    download_base+'/word-vectors-v2/cc.ms.300.vec.gz',
     710958603) # 711MB download

ensure_downloaded_and_prepared(  # French
    'cc.fr.300.vec.50d-100k.txt',
    download_base+'/word-vectors-v2/cc.fr.300.vec.gz',
    1287757366) # 1.29GB download
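
In [ ]:
# Optional sanity check (a minimal sketch) : every prepared file should start
# with the "100000 50" header that ensure_downloaded_and_prepared() writes
for fname in ['wiki-news-300d-1M.vec.50d-100k.txt', 'cc.zh.300.vec.50d-100k.txt',
              'cc.ms.300.vec.50d-100k.txt', 'cc.fr.300.vec.50d-100k.txt']:
    with open(os.path.join(download_dir, fname), 'rt', encoding='utf-8') as f:
        print('%-40s : %s' % (fname, f.readline().strip()))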

In [ ]:
#! ls -l ./data/cache/
#! rm ./data/cache/wiki-news-300d-1M.vec.zip

In [ ]:


In [ ]:
# pip install gensim==3.4.0
# NB: this notebook uses the gensim 3.x API (.vocab, .index2word, .init_sims);
#     gensim 4.x renamed these, so the version pin matters
import gensim
gensim.__version__  # '3.4.0'

In [ ]:
en_vecfile = './data/RNN/wiki-news-300d-1M.vec.50d-100k.txt'
# Choose the 'xx' language - make sure it matches the translations cell you run below
#xx_vecfile = './data/RNN/cc.zh.300.vec.50d-100k.txt'
#xx_vecfile = './data/RNN/wiki-news-300d-1M.vec.50d-100k.txt'
xx_vecfile = './data/RNN/cc.fr.300.vec.50d-100k.txt'

In [ ]:
from gensim.models import KeyedVectors

# Create the English-language model from the vectors stored on disk
en_model = KeyedVectors.load_word2vec_format(en_vecfile)
en_model.init_sims(replace=True)

len(en_model.vocab), en_model.vector_size # Vocab size and dim (expect 100k x 50)
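
In [ ]:
# Quick check : init_sims(replace=True) L2-normalised the vectors in-place,
# so each should have unit norm (cosine similarity is then a plain dot product)
import numpy as np
np.linalg.norm(en_model.get_vector('dog'))  # expect ~1.0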

In [ ]:
', '.join([ f for f in dir(en_model) if not f.startswith('_') ])

In [ ]:


In [ ]:
import random
# Sample 30 random words from the vocabulary
', '.join([ en_model.index2word[random.randrange(len(en_model.vocab))]
           for _ in range(30)])

In [ ]:
# Pick a word 
find_similar_to = 'dog'

# Finding out similar words
for similar_word in en_model.similar_by_word(find_similar_to, topn=10):
    print("Similarity: %.2f, Word: %s" % ( similar_word[1], similar_word[0],))

In [ ]:


In [ ]:
def test_analogy(s):  # A is to B as C is to 'D'?
    (a,b,c,d) = s.split(' ')
    print("Trying for '%s'" % (d,))
    for similar_word in en_model.most_similar(
            positive=[b,c], negative=[a], topn=3):
        print("  Similarity: %.2f, Word: %s" % ( similar_word[1], similar_word[0],))

test_analogy('man woman king queen')
test_analogy('paris france london england')
test_analogy('kitten cat puppy dog')
test_analogy('look looked run ran')
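
In [ ]:
# Under the hood, the analogy is (approximately) plain vector arithmetic :
# woman + king - man should land near queen.  NB: similar_by_vector() does not
# exclude the query words, so 'king' itself may rank first
v = (en_model.get_vector('woman') + en_model.get_vector('king')
     - en_model.get_vector('man'))
en_model.similar_by_vector(v, topn=5)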

In [ ]:


In [ ]:
# Let's load in another language

In [ ]:
xx_model = KeyedVectors.load_word2vec_format(xx_vecfile)
xx_model.init_sims(replace=True)

In [ ]:
# Pick some pairs of translations... (English:Chinese, for the cc.zh vectors)
translations = ( 
     'house:屋 door:门 wheel:轮 money:钱 book:书 ear:耳 mouth:嘴 '
    +'electronic:电子 car:汽车 key:键 village:村 student:学生 '
    +'sky:天空 mountain:山 tree:树 river:河 beach:海滩 rain:雨 '
    +'bird:鸟 fish:鱼 butterfly:蝴蝶 cow:牛 rat:鼠 strawberry:草莓 honey:蜜 '
    +'jump:跳 speak:说话 count:计数 explain:说明 climb:爬 '
    +'tall:高 heavy:重 red:红 gold:金 ancient:古 rapid:快速 '
    +'seven:七 thousand:千 circle:圈 perpendicular:垂直 fraction:分数 '
    +'hero:英雄 sword:剑 awkward:尴尬 night:晚 arrival:到达 '
    #+'aggressive:侵略性 discount:折扣 apartment:公寓 computer:电脑 '
).strip().split()

len(translations)

In [ ]:
# Sanity-check alternative : map English onto itself (the map should be near-identity)
translations = [ '%s:%s' % (w,w) for w in (
     'house door wheel money book ear mouth '
    +'electronic car key village student '
    +'sky mountain tree river beach rain '
    +'bird fish butterfly cow rat strawberry honey '
    +'jump speak count explain climb '
    +'tall heavy red gold ancient rapid '
    +'seven thousand circle perpendicular fraction '
    +'hero sword awkward night arrival '
    +'').strip().split() ]

len(translations)

In [ ]:
# English:French pairs - matching the xx_vecfile selected above
translations = ( 
     'house:maison door:porte wheel:roue '
    +'money:argent book:livre ear:oreille mouth:bouche '
    +'electronic:électronique car:voiture key:clé '
    +'village:village student:étudiant '
    +'sky:ciel mountain:montagne tree:arbre '
    +'river:rivière beach:plage rain:pluie '
    +'bird:oiseau fish:poisson butterfly:papillon cow:vache '
    +'rat:rat strawberry:fraise honey:miel '
    +'jump:sauter speak:parler count:compter explain:expliquer climb:monter '
    +'tall:grand heavy:lourd red:rouge gold:or ancient:ancien rapid:rapide '
    +'seven:sept thousand:mille circle:cercle '
    +'perpendicular:perpendiculaire fraction:fraction '
    +'hero:héros sword:épée awkward:gênant night:nuit arrival:arrivée '
    +'aggressive:agressif discount:remise apartment:appartement computer:ordinateur '
).strip().split()

len(translations)

In [ ]:
import numpy as np
np.set_printoptions(precision=3)

en_arr, xx_arr = [], []
for en_word,xx_word in [pair.split(':') for pair in translations]:
    if en_word not in en_model.vocab:
        print("Failed to find %s (~%s:%s)" % (en_word,en_word,xx_word))
        continue
    if xx_word not in xx_model.vocab:
        print("Failed to find %s (~%s:%s)" % (xx_word,en_word,xx_word))
        continue
    en_arr.append( en_model.get_vector(en_word))
    xx_arr.append( xx_model.get_vector(xx_word))

len(en_arr), en_arr[0]

In [ ]:
en_known = np.array( en_arr ).T
xx_known = np.array( xx_arr ).T

In [ ]:
en_known_inv = np.linalg.pinv(en_known)
en_known.shape, en_known_inv.shape

In [ ]:
np.dot(en_known_inv, en_known)[:3,:3] # Looks v. close to I(N,N), N = number of found pairs
#np.dot(en_known, en_known_inv)[:3,:3] # Only roughly similar to I(50,50)
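
In [ ]:
# The pseudo-inverse is doing least-squares for us : XX.pinv(EN) (used below)
# is exactly the minimiser of ||A.EN - XX||, as a quick lstsq cross-check shows
A_pinv  = np.dot(xx_known, np.linalg.pinv(en_known))
A_lstsq = np.linalg.lstsq(en_known.T, xx_known.T, rcond=None)[0].T
np.abs(A_pinv - A_lstsq).max()  # expect a tiny number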

In [ ]:
en_vec = en_model.get_vector('butterfly')
# Express 'butterfly' as coefficients over the known-word basis :
# the one large coefficient should sit at butterfly's own column
np.dot(en_known_inv, en_vec)>0.8

In [ ]:
#xx_vec = np.dot(xx_known, np.dot(en_known_inv, en_vec))
xx_vec = np.dot( np.dot(xx_known, en_known_inv), en_vec)
xx_vec

In [ ]:
for xx_word in xx_model.similar_by_vector(xx_vec, topn=3):
    print("  Similarity: %.2f, Word: %s" % ( xx_word[1], xx_word[0],))

In [ ]:
#np.dot(xx_known, en_known_inv)[:3,:3]

In [ ]:


In [ ]:
# Translate en->xx      A.EN = XX  =>  A = XX.pinv(EN)
en_trans = A = np.dot(xx_known, np.linalg.pinv(en_known))

# Translate xx->en      B.XX = EN  =>  B = EN.pinv(XX)
xx_trans = B = np.dot(en_known, np.linalg.pinv(xx_known))
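
In [ ]:
# Rough sanity check : how often does the learned map send a known English word
# to exactly its paired translation?  (precision@1 over the training pairs)
hits, total = 0, 0
for en_word, xx_word in [pair.split(':') for pair in translations]:
    if en_word not in en_model.vocab or xx_word not in xx_model.vocab:
        continue
    guess = xx_model.similar_by_vector(
        np.dot(en_trans, en_model.get_vector(en_word)), topn=1)[0][0]
    hits += (guess == xx_word)
    total += 1
print("%d / %d training pairs recovered exactly" % (hits, total))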

In [ ]:
#en_trans[:3,:3]

In [ ]:
# Let's attempt a translation of some unknown words

In [ ]:
en_test = 'butterfly cat chair bee twenty entrance haircut'
for en_word in en_test.split(' '):
    print("Translate : '%s'" % (en_word,))
    en_vec = en_model.get_vector(en_word).T
    xx_vec = np.dot(en_trans, en_vec)
    #print(en_vec.shape, en_trans.shape)
    #print(xx_vec); break
    
    for xx_word in xx_model.similar_by_vector(xx_vec, topn=3):
        print("  Similarity: %.2f, Word: %s" % ( xx_word[1], xx_word[0],))
        
        xx_vec_sim = xx_model.get_vector(xx_word[0]).T
        en_vec_sim = np.dot(xx_trans, xx_vec_sim)
        
        for en_word_2 in en_model.similar_by_vector(en_vec_sim, topn=3):
            print("    Similarity: %.2f, Word: %s" % ( en_word_2[1], en_word_2[0],))
        
# ideal(?) answers for the Chinese run : '猫 椅子 蜜蜂 二十 入口 理发'
