Jupyter notebook for creating a Word2vec model from a Wikipedia dump. This model file can then be read into gensim's Word2Vec class. Feel free to edit this script as you see fit.
https://dumps.wikimedia.org/<locale>wiki/latest/<locale>wiki-latest-pages-articles.xml.bz2
E.g. https://dumps.wikimedia.org/itwiki/latest/itwiki-latest-pages-articles.xml.bz2
In [1]:
# Path to the downloaded Wikipedia dump (bz2-compressed XML; see the
# dumps.wikimedia.org link above for where to obtain it).
WIKIPEDIA_DUMP_PATH = './data/wiki-corpuses/enwiki-latest-pages-articles.xml.bz2'
# Choose a path that the word2vec model should be saved to
# (during training), and read from afterwards.
WIKIPEDIA_W2V_PATH = './data/enwiki.model'
Here is where we train the word2vec model on the given Wikipedia dump. Specifically, we extract plain article text from the dump, write it to a temporary file (one article per line), and then train word2vec on that file.
NB: 1 Wikipedia article is fed into word2vec as a single sentence.
In [2]:
import sys
import os
import tempfile
import multiprocessing
import logging
from gensim.corpora import WikiCorpus
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
In [3]:
def write_wiki_corpus(wiki, output_file):
    """Write a WikiCorpus as plain text to file.

    Args:
        wiki: object with a ``get_texts()`` iterator yielding one article
            at a time as a list of byte-string tokens (e.g. a gensim
            WikiCorpus).
        output_file: writable binary-mode file object.
    """
    i = 0
    for text in wiki.get_texts():
        # One article per line; word2vec later treats each line as a sentence.
        # BUG FIX: write to the `output_file` parameter instead of the
        # `text_output_file` global leaked from the calling cell.
        output_file.write(b' '.join(text) + b'\n')
        i = i + 1
        if (i % 10000 == 0):
            print('\rSaved %d articles' % i, end='', flush=True)
    print('\rFinished saving %d articles' % i, end='', flush=True)
def build_trained_model(text_file):
    """Train and return a Word2Vec model from a line-per-article text file.

    Args:
        text_file: path or file object accepted by gensim's LineSentence
            (each line is treated as one sentence).

    Returns:
        A trained gensim Word2Vec model with training state discarded.
    """
    corpus = LineSentence(text_file)
    w2v_model = Word2Vec(
        corpus,
        size=400,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    # Drop state needed only for further training to reduce RAM usage.
    w2v_model.init_sims(replace=True)
    return w2v_model
In [4]:
logging_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=logging_format, level=logging.INFO)

with tempfile.NamedTemporaryFile(suffix='.txt') as text_output_file:
    # Create wiki corpus, and save text to temp file
    wiki_corpus = WikiCorpus(WIKIPEDIA_DUMP_PATH, lemmatize=False, dictionary={})
    write_wiki_corpus(wiki_corpus, text_output_file)
    # Free the corpus — kernel memory persists across cells.
    del wiki_corpus

    # BUG FIX: flush buffered writes and rewind the file. Without this,
    # LineSentence iterates the file object from its current position (EOF)
    # and trains on an empty corpus.
    text_output_file.flush()
    text_output_file.seek(0)

    # Train model on wiki corpus
    model = build_trained_model(text_output_file)
    model.save(WIKIPEDIA_W2V_PATH)
In [5]:
import random
In [6]:
%time
model = Word2Vec.load(WIKIPEDIA_W2V_PATH)
In [7]:
# Materialize the model's vocabulary (iterating the mapping yields its keys).
vocab = list(model.vocab)
print('Vocabulary sample:', vocab[:5])
In [8]:
# Pick an arbitrary vocabulary word and show its nearest neighbors in
# embedding space (last expression renders as the cell output).
word = random.choice(vocab)
print('Similar words to:', word)
model.most_similar(word)
Out[8]:
In [9]:
# Cosine similarity between two randomly sampled vocabulary words.
word1, word2 = random.choice(vocab), random.choice(vocab)
score = model.similarity(word1, word2)
print('similarity(%s, %s) = %f' % (word1, word2, score))