In [ ]:
import re # Regular Expressions
import pandas as pd # DataFrames & Manipulation
from gensim.models.word2vec import Word2Vec
In [ ]:
train_input = "../data/recipes.tsv.bz2"
# preserve empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)
print "loaded %d documents." % len(train)
In [ ]:
train[['title', 'instructions']].head()
In [ ]:
def normalize(text):
    # lowercase and pad punctuation with spaces so that a later split() isolates it as separate tokens
    norm_text = text.lower()
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text
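A quick sanity check of the tokenizer: the call below uses a made-up example string to show how punctuation gets padded with spaces, so a plain split() later separates it from the words.
In [ ]:
# hypothetical example string, only to illustrate the normalization
normalize('Preheat the oven to 200 degrees. Then bake the dough (about 20 minutes)!')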
In [ ]:
sentences = [normalize(text).split() for text in train['instructions']]
print "%d documents in corpus" % len(sentences)
See http://radimrehurek.com/gensim/models/word2vec.html for the full documentation:
class gensim.models.word2vec.Word2Vec(
-> sentences=None, # iterable of sentences (list of words)
-> size=100, # feature vector dimension
alpha=0.025, # initial learning rate (drops to min_alpha during training)
-> window=5, # maximum distance between current and predicted word
-> min_count=5, # ignore words with lower total frequency
max_vocab_size=None, # limit RAM to most frequent words (1M words ~ 1GB)
sample=0.001, # threshold for random downsampling of high frequency words
seed=1, # for random number generator
-> workers=3, # number of worker threads
min_alpha=0.0001, # used for linear learning-rate decay
-> sg=0, # training algorithm - (sg=0) CBOW, (sg=1) skip-gram
hs=0, # use hierarchical softmax (if 1), or negative sampling (default)
negative=5, # number of noise words used for negative sampling
cbow_mean=1, # use the sum (0) or the mean (1, default) of the context word vectors
hashfxn=<built-in function hash>,
iter=5, # number of iterations (epochs) over the corpus
null_word=0,
trim_rule=None, # custom vocabulary filtering
sorted_vocab=1, # sort vocab by descending word frequency
batch_words=10000 # size of batches (in words) passed to worker threads
)
Define model training parameters
In [ ]:
num_features = 100 # Word vector dimensionality
min_word_count = 10 # Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
In [ ]:
# Import the built-in logging module and configure it so that Word2Vec creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
print "Training CBOW model..."
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling)
In [ ]:
# Precompute the L2-normalized vectors and discard the raw ones;
# this makes the model much more memory-efficient, but no further training is possible.
model.init_sims(replace=True)
model_name = "model-w2v_cbow_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)
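The persisted model can be restored later with Word2Vec.load; a minimal sketch, assuming the file written above is still on disk and the same gensim version is used:
In [ ]:
# reload the trained CBOW model from disk
model = Word2Vec.load(model_name)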
In [ ]:
print "%d words in vocabulary." % len(model.wv.vocab)
vocab = [(k, v.count) for k, v in model.wv.vocab.items()]
pd.DataFrame.from_records(vocab, columns=['word', 'count']).sort_values('count', ascending=False).reset_index(drop=True)
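Individual word vectors live on the wv attribute; a short sketch, assuming "pasta" survived the min_count filter:
In [ ]:
vec = model.wv['pasta']  # 100-dimensional vector, unit-normalized after init_sims(replace=True)
print vec.shape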
In [ ]:
model.most_similar("pasta", topn=20)
In [ ]:
model.most_similar("ofen")
In [ ]:
print "Training skip-gram model..."
model2 = Word2Vec(
    sentences,
    sg=1,   # skip-gram instead of CBOW
    hs=1,   # hierarchical softmax instead of negative sampling
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling)
In [ ]:
# Precompute the L2-normalized vectors and discard the raw ones;
# this makes the model much more memory-efficient, but no further training is possible.
model2.init_sims(replace=True)
model_name = "recipes_skip-gram_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model2.save(model_name)
In [ ]:
model2.most_similar("pasta")
In [ ]:
model2.most_similar("ofen")