In [ ]:

    
# Basic libraries import
import numpy as np
import pandas as pd
import seaborn as sns
import collections
import itertools
import random

import sys
import os
from os.path import join
from pathlib import Path

# Plotting
%matplotlib notebook
%matplotlib inline

sns.set_context("notebook", font_scale=1.5)



In [ ]:

    
# Root folder for the datasets: "Documents/datasets/" under the current user's
# home directory, kept as a plain string (with its trailing slash).
data_folder = join(os.fspath(Path.home()), "Documents/datasets/")

Intro

This notebook explores word embeddings.

It includes experiments with Word2Vec using Gensim and an exploration of GloVe pretrained embeddings.

Resources

Data



In [ ]:

    
# Toy corpus: six short English sentences used as raw input by the cells below.
sentences = [
    "A brown fox jumped on the lazy dog",
    "A brown fox jumped on the brown duck",
    "A brown fox jumped on the lazy elephant",
    "An elephant is eating green grass near the alpaca",
    "A green alpaca tried to jump over an elephant",
    "May you rest in a deep and dreamless slumber",
]



In [ ]:

    
# Naive tokenization: split every sentence on whitespace (no lowercasing,
# no punctuation handling).
tokenized_sentences = [sentence.split() for sentence in sentences]

# Vocabulary: count every token across all sentences, then order the words by
# decreasing frequency as returned by Counter.most_common().
counter = collections.Counter(itertools.chain.from_iterable(tokenized_sentences))
vocab = counter.most_common()

# Two-way mapping between words and their position in the frequency ranking
# (index 0 is the first entry of `vocab`).
index_to_word = [token for token, _count in vocab]
word_to_index = {token: position for position, token in enumerate(index_to_word)}

Word2Vec



In [ ]:

    
import gensim, logging

Train Word2Vec



In [ ]:

    
# Word2Vec hyper-parameters, passed to the model constructor in the training cell.

# Size of the NN layers, i.e. the dimensionality of the word vectors.
size = 200
# Minimum number of occurrences a word needs in order to be considered.
min_count = 1
# Number of threads to run in parallel (only has an effect if Cython is installed).
workers = 4
# Context window size.
window = 10
# Downsampling setting for frequent words.
downsampling = 1e-3



In [ ]:

    
# Train a Word2Vec model on the toy corpus using the hyper-parameters defined in
# the parameters cell.
# The already tokenized sentences are reused instead of splitting `sentences`
# again, so the model is trained on exactly the tokens the vocabulary
# (`counter`, `word_to_index`) was built from.
print("Training model...")
model = gensim.models.Word2Vec(
    tokenized_sentences,
    workers=workers,
    size=size,
    min_count=min_count,
    window=window,
    sample=downsampling,
)



In [ ]:

    
# Finalize the trained model: if you don't plan to train it any further, calling
# init_sims(replace=True) will make the model much more memory-efficient.
# NOTE(review): presumably the raw vectors are replaced by normalized ones, so
# training should not be resumed afterwards — confirm against the documentation
# of the installed gensim version.
model.init_sims(replace=True)



In [ ]:

    
# save model
# Label identifying the training corpus; it becomes part of the file name below.
# It has to be defined before use, otherwise building `model_name` raises a
# NameError on a fresh kernel.
corpus_name = "toy_sentences"

# The file name encodes the corpus label and the main hyper-parameters, so models
# trained with different settings do not overwrite each other.
model_name = "w2v_{}_size{}_mincount{}_window{}".format(corpus_name, size, min_count, window)

# The path is relative, so the model is written relative to the current working directory.
model.save(model_name)

Test Word2Vec Model



In [ ]:

    
# load model
# Read the model back from the path stored in `model_name` and rebind `model`
# to the loaded object.
model = gensim.models.Word2Vec.load(model_name)



In [ ]:

    
# Three standard similarity queries against the model. Only the value of the last
# expression is displayed as the cell output.
# NOTE(review): none of these query words occur in the toy `sentences` corpus
# defined earlier in this notebook, so a model trained only on it will presumably
# fail the lookups — confirm, or load a model trained on a larger corpus.

# Analogy query: words closest to 'woman' + 'king' - 'man' (top result only).
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# Odd-one-out query over a whitespace-separated list of words.
model.doesnt_match("breakfast cereal dinner lunch".split())
# Similarity score between two words.
model.similarity('woman', 'man')



In [ ]:

    
# sentence to tensor
# Index the model with a list of words and display the shape of the result.
# NOTE(review): presumably one row per word and `size` columns, i.e. (2, 200) — confirm.
model[['brown', 'fox']].shape



In [ ]:

    
import nltk
import itertools
import collections
# Token frequency distribution over the whole corpus, built with NLTK from the
# whitespace-split tokens of every sentence.
nltk.FreqDist(token for sentence in sentences for token in sentence.split())



In [ ]:

    
# Same token counts using only the standard library: a Counter over the
# whitespace-split tokens of every sentence.
collections.Counter(token for sentence in sentences for token in sentence.split())

GloVe



In [ ]:

    
# load GloVe embeddings
# Dimensionality of the vectors in the file read below (the "100d" variant).
EMBEDDING_DIM = 100

# Maps each word to its vector (a float32 numpy array).
embeddings = {}

# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, and reading the UTF-8 GloVe file can fail or yield mangled
# words on systems whose default is not UTF-8.
with open(join(data_folder, "glove", "glove.6B.100d.txt"), encoding="utf-8") as glove:
    for line in glove:
        # Each line holds a word followed by its vector components,
        # separated by whitespace.
        values = line.strip().split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings[word] = vector



In [ ]:

    
# Display the vector stored in `embeddings` for the word 'objected'
# (raises KeyError if the word is not a key of the dictionary).
embeddings['objected']



In [ ]:

    
# Print the position at which the key 'objected' appears when iterating over
# the `embeddings` dictionary.
for position, token in enumerate(embeddings):
    if token == 'objected':
        print(position)



In [ ]:

    
# create embedding matrix
# Row i holds the GloVe vector of the vocabulary word whose index is i. The matrix
# has one more row than there are words, so the last row is never written, and
# rows of words that have no GloVe vector stay all-zero.
embeddings_matrix = np.zeros((len(word_to_index) + 1, EMBEDDING_DIM))
for word, i in word_to_index.items():
    # Try the token exactly as it appears first. The glove.6B vocabulary is
    # distributed uncased (see the GloVe project page), so capitalised tokens such
    # as "A", "An" or "May" only match after lowercasing; fall back to that instead
    # of leaving their rows at zero.
    word_vector = embeddings.get(word)
    if word_vector is None:
        word_vector = embeddings.get(word.lower())
    if word_vector is not None:
        embeddings_matrix[i] = word_vector



In [ ]: