In [8]:
# Imports: stdlib first, then third-party, then project-local.
import math
import os
import subprocess

import numpy as np
from tensorflow.keras.datasets import imdb

import util

# Vocabulary cap: keep only the NUM_WORDS most frequent words; rarer words
# are mapped to the out-of-vocabulary index by the loader.
NUM_WORDS = 1000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS)

In [5]:
# First 20 word indices of the first training review (reviews are lists of ints).
X_train[0][:20]


Out[5]:
[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25]

In [4]:
# Decode the integer sequence back to text via the project-local util helper.
# NOTE(review): out-of-vocabulary indices render as placeholder characters.
util.decode(X_train[0])


Out[4]:
" this film was just brilliant casting � � story direction � really � the part they played and you could just imagine being there robert � is an amazing actor and now the same being director � father came from the same � � as myself so i loved the fact there was a real � with this film the � � throughout the film were great it was just brilliant so much that i � the film as soon as it was released for � and would recommend it to everyone to watch and the � � was amazing really � at the end it was so sad and you know what they say if you � at a film it must have been good and this definitely was also � to the two little � that played the � of � and paul they were just brilliant children are often left out of the � � i think because the stars that play them all � up are such a big � for the whole film but these children are amazing and should be � for what they have done don't you think the whole story was so � because it was true and was � life after all that was � with us all"

In [6]:
# Load pretrained 100-d GloVe word vectors into a word -> np.ndarray dict.
if not os.path.exists("glove.6B.100d.txt"):
    print("Downloading glove embeddings...")
    # NOTE(review): assumes `curl` and `unzip` are on PATH; the zip also
    # contains the 50/200/300-d variants, which are unpacked but unused.
    subprocess.check_output(
        "curl -OL http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip", shell=True)
embeddings_index = dict()
print("Loading glove embeddings...")
# Each line is: <word> <100 space-separated floats>. Force UTF-8 so the
# load does not depend on the platform's default encoding; the context
# manager guarantees the file is closed even if parsing raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


Downloading glove embeddings...

In [17]:
def cosine_sim(v1, v2):
    """Compute cosine similarity of v1 to v2: (v1 . v2) / (||v1|| * ||v2||).

    Parameters
    ----------
    v1, v2 : array-like of equal length (lists or 1-D numpy arrays).

    Returns
    -------
    float in [-1, 1].

    Raises
    ------
    ZeroDivisionError if either vector has zero magnitude (matches the
    behavior of the original scalar-loop implementation).
    """
    # Vectorize with numpy instead of a Python element loop; accumulate in
    # float64 for precision regardless of the inputs' dtype.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)
    denom = math.sqrt(float(np.dot(a, a)) * float(np.dot(b, b)))
    return float(np.dot(a, b)) / denom
# Pull a few GloVe vectors to compare semantically related vs unrelated words.
film = embeddings_index["film"]
movie = embeddings_index["movie"]
book = embeddings_index["book"]
car = embeddings_index["car"]
truck = embeddings_index["truck"]
plane = embeddings_index["plane"]
# Unrelated pair — expect a low similarity score.
cosine_sim(film, truck)


Out[17]:
0.21388251764217375

In [19]:
# Raw 100-dimensional GloVe vector for "book" (dtype float32).
embeddings_index["book"]


Out[19]:
array([-1.9744e-01,  4.4831e-01,  1.3689e-01, -1.5595e-01,  9.3600e-01,
        7.2986e-01,  3.4099e-01, -3.3896e-01, -8.9569e-02, -4.7706e-01,
        3.5112e-01, -4.2198e-01, -1.2221e-01, -6.3375e-02, -4.5820e-01,
        7.8723e-01,  9.4045e-01,  8.1101e-02, -2.3224e-01,  4.0778e-01,
        3.3258e-01, -4.4458e-01, -4.7117e-01,  1.4852e-01,  9.6308e-01,
       -6.5267e-02, -5.3661e-02, -6.7474e-01, -4.2364e-01,  9.4392e-02,
       -3.8668e-01,  1.8237e-01, -1.2846e-01, -2.1952e-01, -5.8993e-01,
        7.3602e-01, -2.4009e-01,  3.2392e-01, -2.4663e-01, -4.0684e-01,
       -5.2468e-01,  4.6174e-01, -1.4936e-01, -1.1999e-01, -1.3990e-01,
       -4.4944e-01, -2.6565e-01, -7.0061e-01,  3.0188e-01, -1.1209e-01,
        6.6323e-01,  3.9698e-01,  6.9158e-01,  8.3442e-01, -5.2717e-01,
       -2.5314e+00,  1.3281e-01,  3.0253e-01,  1.1062e+00,  7.2221e-03,
        2.6031e-01,  1.1584e+00, -7.9330e-02, -7.6659e-01,  1.2623e+00,
       -6.2071e-01,  5.9821e-01,  7.3539e-01,  3.8573e-01, -4.0293e-01,
       -3.1440e-02,  7.7863e-01,  3.1525e-01,  1.9003e-01, -6.5821e-01,
        4.0548e-01,  5.3596e-03,  5.5274e-02, -1.2238e+00, -4.8912e-02,
       -3.0511e-01,  4.4473e-01, -3.3826e-01, -2.2133e-01, -1.3214e+00,
       -6.4761e-01, -4.4021e-01, -1.4910e+00, -2.2495e-02,  6.0346e-02,
        1.4833e-01,  4.4162e-01,  7.9787e-01, -2.8076e-01, -2.9400e-02,
       -1.5656e-01, -1.2650e-01, -5.6968e-01,  1.5374e-03,  6.6600e-01],
      dtype=float32)

In [ ]: