Word Embeddings



In [ ]:
# Basic libraries import
import numpy as np
import pandas as pd
import seaborn as sns
import collections
import itertools
import random

import sys
import os
from os.path import join
from pathlib import Path

# Plotting
%matplotlib notebook
%matplotlib inline

sns.set_context("notebook", font_scale=1.5)

In [ ]:
# Root directory of the local datasets, located under the current user's home.
data_folder = os.path.join(str(Path.home()), "Documents/datasets/")

Data


In [ ]:
# Toy corpus: six short English sentences used by the cells below.
sentences = [
    "A brown fox jumped on the lazy dog",
    "A brown fox jumped on the brown duck",
    "A brown fox jumped on the lazy elephant",
    "An elephant is eating green grass near the alpaca",
    "A green alpaca tried to jump over an elephant",
    "May you rest in a deep and dreamless slumber",
]

In [ ]:
# Naive tokenization: split every sentence on whitespace.
tokenized_sentences = [sentence.split() for sentence in sentences]

# Vocabulary: count every token, then order words from most to least frequent.
counter = collections.Counter(itertools.chain.from_iterable(tokenized_sentences))
vocab = counter.most_common()
# index -> word (list position is the index) and the inverse word -> index map.
index_to_word = [token for token, _count in vocab]
word_to_index = {token: position for position, token in enumerate(index_to_word)}

Word2Vec


In [ ]:
import gensim, logging

Train Word2Vec


In [ ]:
# Word2Vec hyper-parameters (passed to the trainer in the next cell)
size = 200            # word vector dimensionality (size of the NN layers)
window = 10           # context window size
min_count = 1         # minimum count a word needs in order to be considered
downsampling = 0.001  # downsample setting for frequent words
workers = 4           # number of parallel threads (only has an effect with Cython installed)

In [ ]:
# Fit a Word2Vec model on the whitespace-tokenized toy corpus, using the
# hyper-parameters defined in the parameters cell.
print("Training model...")
model = gensim.models.Word2Vec(
    sentences=[sentence.split() for sentence in sentences],
    size=size,
    window=window,
    min_count=min_count,
    sample=downsampling,
    workers=workers,
)

In [ ]:
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
# NOTE(review): with replace=True the raw vectors are presumably replaced by
# their normalized versions, so training cannot be resumed afterwards —
# confirm against the gensim version in use.
model.init_sims(replace=True)

In [ ]:
# save model
# The file name encodes a label for the training corpus plus the main
# hyper-parameters, so differently configured models do not overwrite each other.
# `corpus_name` is defined here because nothing earlier in the notebook sets it;
# change the label when training on a different corpus.
corpus_name = "toy_sentences"
model_name = "w2v_{}_size{}_mincount{}_window{}".format(corpus_name, size, min_count, window)
# Written relative to the current working directory.
model.save(model_name)

Test Word2Vec Model


In [ ]:
# load model
# Reads back the file written by model.save(model_name), replacing the
# in-memory `model` with the version loaded from disk.
model = gensim.models.Word2Vec.load(model_name)

In [ ]:
# Sanity-check queries against the trained model.
# Every query word must be in the model's vocabulary, which here is only the
# toy corpus, so the examples use words taken from `sentences`.
# Results are printed explicitly because a cell only displays its last expression.

# Analogy-style query: closest word to (elephant + fox - dog).
print(model.most_similar(positive=['elephant', 'fox'], negative=['dog'], topn=1))
# Word that fits least with the others in the list.
print(model.doesnt_match("fox dog duck grass".split()))
# Similarity score between two words.
print(model.similarity('fox', 'dog'))

In [ ]:
# sentence to tensor
# Index the model with a list of words and show the shape of the result.
# NOTE(review): presumably one row per word and `size` columns, i.e. (2, 200) — confirm.
model[['brown', 'fox']].shape

In [ ]:
import nltk
import itertools
import collections
# Token frequency distribution over the whole corpus, computed with NLTK.
nltk.FreqDist(itertools.chain.from_iterable(sentence.split() for sentence in sentences))

In [ ]:
# Same token counts as above, using only the standard library.
collections.Counter(token for sentence in sentences for token in sentence.split())

GloVe


In [ ]:
# load GloVe embeddings
# Each line of the GloVe text file is "<word> <v1> <v2> ... <vN>"; the result is
# a dict mapping every word to its float32 vector.
EMBEDDING_DIM = 100
embeddings = {}
# Derive the file name from EMBEDDING_DIM so the two cannot drift apart.
glove_path = join(data_folder, "glove", "glove.6B.{}d.txt".format(EMBEDDING_DIM))
# Explicit encoding: without it open() uses the platform default, which may not
# be able to decode the non-ASCII tokens in the file.
with open(glove_path, encoding="utf-8") as glove:
    for line in glove:
        values = line.strip().split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings[word] = vector

In [ ]:
# Spot check: display the vector stored for one word.
embeddings['objected']

In [ ]:
# Print the position of 'objected' in the iteration order of `embeddings`.
for position, token in enumerate(embeddings):
    if token == 'objected':
        print(position)

In [ ]:
# create embedding matrix
# Row i receives the GloVe vector of the word whose index is i; words with no
# GloVe entry keep an all-zero row.
# NOTE(review): the matrix has one more row than the vocabulary while
# word_to_index starts at 0, so the last row is never filled — confirm whether
# an extra padding/unknown row is intended.
embeddings_matrix = np.zeros((len(word_to_index)+1, EMBEDDING_DIM))
for word, i in word_to_index.items():
    vector = embeddings.get(word)
    if vector is None:
        # The glove.6B vocabulary is uncased, so capitalized tokens such as
        # "A" or "May" are only found under their lowercase form.
        vector = embeddings.get(word.lower())
    if vector is not None:
        embeddings_matrix[i] = vector

In [ ]: