Use Word2Vec in gensim to train a word embedding model on the text of the NIPS papers.
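As a quick orientation, here is a minimal sketch of the gensim Word2Vec interface on a toy corpus. The toy sentences and parameter values are made up for illustration only; the actual training on the paper texts happens further below. This assumes a gensim version that accepts the size argument (as used later in this notebook); newer gensim releases call it vector_size.
In [ ]:
# minimal sketch on a made-up two-sentence corpus
import gensim.models.word2vec as w2v
toy_sentences = [['kernel', 'methods', 'are', 'great'],
                 ['deep', 'neural', 'networks', 'are', 'great']]
toy_model = w2v.Word2Vec(toy_sentences, size=10, window=2, min_count=1, workers=1)
# cosine similarity between two word vectors (the value is arbitrary for a toy corpus)
toy_model.wv.similarity('kernel', 'deep')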
In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import gensim as gen
import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    import pickle
import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import csv
In [ ]:
# load the pickle containing the document-term matrix and the abstracts.
# Later we add the paper representations and dump everything to a new file.
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d_wabs.p'%(fyear, tyear)
with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)
In [ ]:
info.keys()
list_abs = info['abstracts']
In [ ]:
list_abs[:2]
In [ ]:
# make each (non-missing) abstract a list of words
list_list_abs = [ab.split(' ') for ab in list_abs if ab is not None]
print(list_list_abs[20])
In [ ]:
def paper_dataframe(fpath):
rows = []
with open(fpath, 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip the header row:
        # ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
        next(reader)
for row in reader:
rows.append(tuple(row))
data = pd.DataFrame(rows, columns=['Id', 'Title', 'EventType',
'PdfName', 'Abstract', 'PaperText'])
return data
In [ ]:
# quick check of the regex used in tokenize_simple below
text = ',sdf,.-23\][](s)'
re.sub(r'([^\w])+', ' ', text, flags=re.DOTALL)
In [ ]:
def tokenize_simple(text):
    # collapse all whitespace into single spaces
    text = re.sub(r'\s+', ' ', text, flags=re.DOTALL)
    # replace non-alphanumeric (non-word) characters with spaces
    text = re.sub(r'[^\w]+', ' ', text, flags=re.DOTALL)
    # naive whitespace tokenization; lowercase and drop single-character tokens
    tokens = [w.lower().strip() for w in text.split(' ') if len(w) > 1]
    return tokens
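As a quick sanity check of the tokenizer, here is a made-up input string together with the output the regexes above should produce.
In [ ]:
# punctuation is stripped, whitespace collapsed, and single-character tokens dropped;
# expected output: ['deep', 'kernel', 'methods', '2015']
tokenize_simple('Deep Kernel-Methods,\n 2015!')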
In [ ]:
dframe = paper_dataframe('Papers1988_2015.csv')
n_docs = dframe.shape[0]
tok_papers = []
tok_abstracts = []
for i in xrange(n_docs):
paper = dframe['PaperText'][i]
paper_tokens = tokenize_simple(paper)
tok_papers.append(paper_tokens)
ab = list_abs[i]
if ab is None:
ab_tokens = []
else:
ab_tokens = tokenize_simple(ab)
tok_abstracts.append(ab_tokens)
In [ ]:
# size = dimensionality of the word vectors (the latent dimension)
# sentences = an iterable where each item is a list of word tokens
size = 50
window = 5
dest_fname = 'w2v_size%d_win%d.p'%(size, window)
# min_count=5 drops words with total frequency below 5; workers=4 training threads
model = w2v.Word2Vec(tok_papers, size=size, window=window, min_count=5, workers=4)
model.save(dest_fname)
In [ ]:
model.wv.similarity('neural', 'deep')
In [ ]:
model.wv.similarity('neural', 'kernel')
In [ ]:
model.wv.doesnt_match('supervised unsupervised neuron reinforcement'.split())
In [ ]:
model.wv.doesnt_match('kernel gretton hsic mmd'.split())
In [ ]:
model.wv['kernel']
In [ ]:
'kernel' in model.wv
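We can also list the nearest neighbours of a word in the embedding space with most_similar (part of gensim's KeyedVectors API). The exact neighbours and scores depend on this particular training run.
In [ ]:
# top-5 words closest to 'kernel' by cosine similarity
model.wv.most_similar('kernel', topn=5)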
In [ ]:
titles = info['titles']
# Each element is the representation of one paper: a matrix whose rows are the
# embeddings of the words in that paper's abstract and title.
paper_reps = []
for i in xrange(n_docs):
    title_tokens = tokenize_simple(titles[i])
    rep_words = tok_abstracts[i] + title_tokens
    # embed each word in rep_words that is in the vocabulary
    rep = []
    for w in rep_words:
        if w in model.wv:
            embed = model.wv[w]
            rep.append(embed)
    # guard against papers with no in-vocabulary words
    mat = np.vstack(rep) if rep else np.zeros((0, size))
    paper_reps.append(mat)
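If a single fixed-length vector per paper is ever needed (e.g. for clustering or visualisation), one simple option is to average the rows of each representation matrix. This is only an illustrative sketch and is not used in the rest of the notebook.
In [ ]:
# mean word vector of each paper; avg_reps has one row per paper and `size` columns.
# Papers with no in-vocabulary words get a zero vector.
avg_reps = np.vstack([m.mean(axis=0) if m.shape[0] > 0 else np.zeros(size)
                      for m in paper_reps])
avg_reps.shape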
In [ ]:
len(paper_reps)
In [ ]:
# save the pickle with the paper representations added
dt_dest = 'DT_%d_%d_wembed.p'%(fyear, tyear)
info['paper_reps'] = paper_reps
with open(dt_dest, 'wb') as f:
    pickle.dump(info, f)
In [ ]:
with open('DT_%d_%d_wembed.p'%(fyear, tyear), 'rb') as f:
    info = pickle.load(f)
In [ ]:
info.keys()
In [ ]:
DT = info['DT']
abstracts = info['abstracts']
paper_reps = info['paper_reps']
titles = info['titles']
words = info['words']
In [ ]:
# document frequency of each word
n_docs = DT.shape[0]
# DF[j] = number of documents in which word j occurs
DF = np.array( (DT > 0).sum(0) )[0]
df_lb = 7
df_ub = int(0.15*n_docs)
print('n = #docs: %d'%n_docs)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'%
(df_lb, df_ub, np.sum( df_I) ) )
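To make the document-frequency computation concrete, here is the same logic on a tiny made-up matrix (3 documents, 4 words), assuming DT is a scipy sparse matrix as the [0] indexing above suggests.
In [ ]:
# toy 3x4 sparse document-term matrix; the document frequency of the
# four words should come out as [2, 1, 3, 0]
toy_DT = sp.csr_matrix(np.array([[1, 0, 2, 0],
                                 [3, 0, 1, 0],
                                 [0, 5, 1, 0]]))
np.array((toy_DT > 0).sum(0))[0]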
In [ ]:
# words that survive the document-frequency filter
df_words = np.array(words)[df_I]
print(df_words.tolist())
In [ ]:
# keep only the words within the document-frequency bounds
fDT = DT[:, df_I]
fwords = np.array(words)[df_I].tolist()
info['DT'] = fDT
info['words'] = fwords
In [ ]:
dffiltered_fname = 'DT_%d_%d_wem_df%d_%d.p'%(fyear, tyear, df_lb, df_ub)
with open(dffiltered_fname, 'wb') as f:
    pickle.dump(info, f)