Use gensim's Word2Vec to train a word embedding model on the text of NIPS papers.


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import gensim as gen
import gensim.models.word2vec as w2v

import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
try:
    import cPickle as pickle 
except:
    import pickle

import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import csv
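
Before loading the full corpus, here is a minimal sketch of the gensim Word2Vec API used below, run on a tiny made-up corpus (illustrative only; the real training happens further down).

In [ ]:
# Minimal sketch on a made-up corpus; min_count=1 keeps every toy word in the vocabulary.
# Note: in gensim >= 4.0 the 'size' argument is named 'vector_size'.
toy_sentences = [
    ['deep', 'neural', 'networks', 'learn', 'representations'],
    ['kernel', 'methods', 'use', 'a', 'feature', 'map'],
    ['neural', 'networks', 'and', 'kernel', 'methods'],
]
toy_model = w2v.Word2Vec(toy_sentences, size=10, window=2, min_count=1, workers=1)
toy_model.wv.similarity('neural', 'kernel')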

In [ ]:
# load the pickle containing the document-term matrix
# and the abstracts
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d_wabs.p'%(fyear, tyear)

with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)

In [ ]:
info.keys()
list_abs = info['abstracts']

In [ ]:
list_abs[:2]

In [ ]:
# make each abstract a list of words
list_list_abs = [ab.split(' ') for ab in list_abs if ab is not None]
print(list_list_abs[20])

In [ ]:
def paper_dataframe(fpath):
    rows = []
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Each row is ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
        next(reader)  # skip the header row
        for row in reader:
            rows.append(tuple(row))
    data = pd.DataFrame(rows, columns=['Id', 'Title', 'EventType',
                                       'PdfName', 'Abstract', 'PaperText'])
    return data
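
As a side note, the same dataframe can most likely be built directly with pandas' CSV reader (a sketch; this assumes the file's header row matches the column names above):

In [ ]:
# Sketch of an equivalent one-liner using pandas' CSV reader
# (assumes the header row provides the same column names as paper_dataframe)
pd.read_csv('Papers1988_2015.csv', quotechar='"').head(2)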

In [ ]:
# quick test of the non-word-character substitution used in tokenize_simple below
text = r',sdf,.-23\][](s)'
re.sub(r'([^\w])+', ' ', text, flags=re.DOTALL)

In [ ]:
def tokenize_simple(text):
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text, flags=re.DOTALL)
    # replace non-word characters (punctuation, symbols) with a space
    text = re.sub(r'[^\w]+', ' ', text, flags=re.DOTALL)
    # naive tokenization: split on spaces, lower-case, drop single-character tokens
    tokens = [w.lower().strip() for w in text.split(' ') if len(w) > 1]
    return tokens
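
A quick check of tokenize_simple on a made-up string (added as an illustration):

In [ ]:
# Punctuation is dropped, tokens are lower-cased, and single-character tokens are discarded.
tokenize_simple('Kernel two-sample tests (e.g., MMD) are non-parametric!')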

In [ ]:
dframe = paper_dataframe('Papers1988_2015.csv')
n_docs = dframe.shape[0]
tok_papers = []
tok_abstracts = []
for i in range(n_docs):
    paper = dframe['PaperText'][i]
    paper_tokens = tokenize_simple(paper)
    tok_papers.append(paper_tokens)
    
    ab = list_abs[i]
    if ab is None:
        ab_tokens = []
    else:
        ab_tokens = tokenize_simple(ab)
    tok_abstracts.append(ab_tokens)
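
A quick sanity check on the tokenized output (added as a check, not part of the original pipeline):

In [ ]:
# Number of tokenized papers/abstracts and a few tokens from the first paper
print(len(tok_papers), len(tok_abstracts))
print(tok_papers[0][:10])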

Train a word2vec model


In [ ]:
# size = the embedding (latent) dimension; renamed to vector_size in gensim >= 4.0
# sentences = an iterable where each item is a list of words
size = 50
window = 5
dest_fname = 'w2v_size%d_win%d.p'%(size, window)
model = w2v.Word2Vec(tok_papers, size=size, window=window, min_count=5, workers=4)
model.save(dest_fname)
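
The saved model can be reloaded later with gensim's load (a quick round-trip check):

In [ ]:
# Reload the saved model to verify the file round-trips correctly
model = w2v.Word2Vec.load(dest_fname)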

In [ ]:
model.wv.similarity('neural', 'deep')

In [ ]:
model.wv.similarity('neural', 'kernel')

In [ ]:
model.wv.doesnt_match('supervised unsupervised neuron reinforcement'.split())

In [ ]:
model.wv.doesnt_match('kernel gretton hsic mmd'.split())

In [ ]:
model.wv['kernel']

In [ ]:
'kernel' in model.wv
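
Nearest neighbours in the embedding space give another qualitative check (an added example):

In [ ]:
# Top-10 words most similar to 'kernel' by cosine similarity
model.wv.most_similar('kernel', topn=10)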

Create a representation of each paper

The representation of a paper is simply the stack of embedding vectors of the words taken from its abstract and title.


In [ ]:
titles = info['titles']
# each element is the representation of the paper. 
# This is a matrix with each row corresponding to the embedding
# of a word in the abstract and the title.
paper_reps = []
for i in range(n_docs):
    title_tokens = tokenize_simple(titles[i])
    rep_words = tok_abstracts[i] + title_tokens
    
    # embed each word in rep_words (if in the vocabulary)
    rep = []
    for w in rep_words:
        # only embed words that are in the vocabulary
        if w in model.wv:
            embed = model.wv[w]
            rep.append(embed)
    mat = np.vstack(rep)
    paper_reps.append(mat)
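
One simple way to use these per-paper matrices (a sketch only; not used later in this notebook) is to mean-pool the word vectors into a single vector per paper and compare papers by cosine similarity:

In [ ]:
# Sketch (not part of the original pipeline): mean-pool each paper's word embeddings
# and compute the cosine similarity between the first two papers.
mean_reps = np.vstack([m.mean(axis=0) for m in paper_reps])
v0, v1 = mean_reps[0], mean_reps[1]
print(np.dot(v0, v1) / (np.linalg.norm(v0) * np.linalg.norm(v1)))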

In [ ]:
len(paper_reps)

In [ ]:
# save the pickle with the paper representations
dt_dest = 'DT_%d_%d_wembed.p'%(fyear, tyear)
info['paper_reps'] = paper_reps
with open(dt_dest, 'wb') as f:
    pickle.dump(info, f)

Load the saved pickle and check


In [ ]:
with open('DT_%d_%d_wembed.p'%(fyear, tyear), 'rb') as f:
    info = pickle.load(f)

In [ ]:
info.keys()

In [ ]:
DT = info['DT']
abstracts = info['abstracts']
paper_reps = info['paper_reps']
titles = info['titles']
words = info['words']

Filter words by document frequency (DF)


In [ ]:
# document frequency of each word
n_docs = DT.shape[0]
DF = np.array( (DT > 0).sum(0) )[0]
df_lb = 7
df_ub = int(0.15*n_docs)

print('n = #docs: %d'%n_docs)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'% 
      (df_lb, df_ub, np.sum( df_I) ) )
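
A histogram of the document frequencies can help justify the chosen bounds (an added visual check; matplotlib is imported above):

In [ ]:
# Added visual check: distribution of document frequencies with the chosen bounds marked
plt.hist(DF, bins=100)
plt.axvline(df_lb, color='r')
plt.axvline(df_ub, color='r')
plt.yscale('log')
plt.xlabel('document frequency')
plt.ylabel('#words')
plt.show()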

In [ ]:
df_words = np.array(words)[df_I]
print(df_words.tolist())

In [ ]:
# filter out words
fDT = DT[:, df_I]
fwords = np.array(words)[df_I].tolist()

info['DT'] = fDT
info['words'] = fwords

In [ ]:
dffiltered_fname = 'DT_%d_%d_wem_df%d_%d.p'%(fyear, tyear, df_lb, df_ub)
with open(dffiltered_fname, 'wb') as f:
    pickle.dump(info, f)
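
A quick check that the filtered pickle reloads with the expected shapes (added check):

In [ ]:
# The document-term matrix should now have one column per kept word
with open(dffiltered_fname, 'rb') as f:
    check = pickle.load(f)
print(check['DT'].shape, len(check['words']))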
