This notebook trains word2vec vectors on the preprocessed review corpus with gensim, then runs a few similarity queries against the resulting vectors.
About word2vec: it learns a dense vector for each word from the words that surround it in the corpus, so words used in similar contexts end up close together in the vector space.


In [5]:
import logging
import os.path
import sys
import multiprocessing
#import gensim.models.word2vec

from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence

In [6]:
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))



    # training corpus: one sentence per line, tokens separated by whitespace (LineSentence format)
    file_path = "./dummy/processed_reviews.txt"

    # output paths for the full model and the plain-text word vectors
    model_path = "./dummy/model.model"
    vector_path = "./dummy/vector.txt"

    # train CBOW vectors: 200 dimensions, 10-word window, drop words seen fewer than 5 times
    count = multiprocessing.cpu_count()
    model = Word2Vec(LineSentence(file_path), size=200, window=10, min_count=5, workers=count)

    model.save(model_path)
    model.save_word2vec_format(vector_path, binary=False)


INFO:__main__.py:running /Library/Python/2.7/site-packages/ipykernel/__main__.py -f /Users/JaySurplus/Library/Jupyter/runtime/kernel-d7b1cb20-e96a-46c3-9208-ab5b53cd15a5.json
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 38253 word types from a corpus of 1424772 raw words and 10000 sentences
INFO:gensim.models.word2vec:min_count=5 retains 8640 unique words (drops 29613)
INFO:gensim.models.word2vec:min_count leaves 1382119 word corpus (97% of original 1424772)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 38253 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 53 most-common words
INFO:gensim.models.word2vec:downsampling leaves estimated 994317 word corpus (71.9% of prior 1382119)
INFO:gensim.models.word2vec:estimated required memory for 8640 words and 200 dimensions: 18144000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.models.word2vec:training model with 4 workers on 8640 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5
INFO:gensim.models.word2vec:expecting 10000 sentences, matching count from corpus used for vocabulary survey
INFO:gensim.models.word2vec:PROGRESS: at 11.04% examples, 558845 words/s, in_qsize 6, out_qsize 1
INFO:gensim.models.word2vec:PROGRESS: at 23.60% examples, 584881 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 35.88% examples, 588593 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 48.14% examples, 595044 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 60.77% examples, 599463 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 72.63% examples, 598650 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 85.02% examples, 599283 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:PROGRESS: at 95.40% examples, 588637 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:training on 7123860 raw words (4971002 effective words) took 8.6s, 580363 effective words/s
INFO:gensim.utils:saving Word2Vec object under ./dummy/model.model, separately None
INFO:gensim.utils:not storing attribute syn0norm
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.models.word2vec:storing 8640x200 projection weights into ./dummy/vector.txt
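
The plain-text vectors written above are query-only, while model.model keeps the full training state and can be reloaded later for inspection or further training. A minimal sketch of reloading it, assuming the same (older) gensim API used throughout this notebook:

In [ ]:
# reload the full Word2Vec model saved above
reloaded = Word2Vec.load("./dummy/model.model")

# vocabulary size and the shape of one 200-dimensional word vector
print(len(reloaded.vocab))        # reloaded.wv.vocab in gensim >= 1.0
print(reloaded['king'].shape)     # reloaded.wv['king'] in gensim >= 1.0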

In [10]:
"""
Testing word2vec vector

"""
model = Word2Vec.load_word2vec_format('./dummy/vector.txt', binary=False)

model.most_similar('king')


INFO:gensim.models.word2vec:loading projection weights from ./dummy/vector.txt
INFO:gensim.models.word2vec:loaded (8640, 200) matrix from ./dummy/vector.txt
INFO:gensim.models.word2vec:precomputing L2-norms of word weight vectors
Out[10]:
[(u'queen', 0.8998591899871826),
 (u'deluxe', 0.8928799629211426),
 (u'double', 0.8741847276687622),
 (u'pullout', 0.8364671468734741),
 (u'suite', 0.8344507217407227),
 (u'junior', 0.826301097869873),
 (u'twin', 0.8189259171485901),
 (u'upgraded', 0.7928529977798462),
 (u'studio', 0.7852340340614319),
 (u'cityview', 0.7741087079048157)]
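
Beyond ranked neighbour lists, individual word pairs can be scored directly with cosine similarity. A small sketch against the vectors loaded above; the exact scores depend on this particular training run:

In [ ]:
# cosine similarity between two word vectors (higher = more related in this corpus)
print(model.similarity('king', 'queen'))
print(model.similarity('king', 'pool'))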

In [15]:
# analogy-style query: 'pool' + 'gym' - 'swim'
model.most_similar(positive=['pool', 'gym'], negative=['swim'])


Out[15]:
[(u'equipment', 0.6470780372619629),
 (u'fitness', 0.6273806095123291),
 (u'workout', 0.6220968961715698),
 (u'weights', 0.6202228665351868),
 (u'adequate', 0.6176875829696655),
 (u'facility', 0.6097884774208069),
 (u'outdoor', 0.5975462198257446),
 (u'quite', 0.5851249098777771),
 (u'lounge', 0.5847529768943787),
 (u'basic', 0.5780242681503296)]
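
A related sanity check is the odd-one-out query, which returns the word whose vector is furthest from the mean of the group. A sketch using words that all appear in the queries and outputs above:

In [ ]:
# odd-one-out: which word fits the group least?
print(model.doesnt_match(['gym', 'fitness', 'workout', 'king']))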

In [17]:
# nearest neighbours of the averaged 'spa', 'internet', 'pool', 'gym' vectors
model.most_similar(['spa', 'internet', 'pool', 'gym'])


Out[17]:
[(u'cab', 0.8215907216072083),
 (u'taxi', 0.8048601150512695),
 (u'catch', 0.7325971126556396),
 (u'van', 0.723381757736206),
 (u'ride', 0.7149031162261963),
 (u'take', 0.6885027885437012),
 (u'shuttle', 0.6788942217826843),
 (u'drop', 0.6787338256835938),
 (u'bus', 0.6777344942092896),
 (u'drive', 0.6763275265693665)]
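
Averaging 'spa', 'internet', 'pool' and 'gym' evidently lands near transport words (cab, taxi, shuttle) rather than other amenities in this corpus. Comparing two explicit word groups can be a more direct check; a sketch, again assuming the older gensim API:

In [ ]:
# cosine similarity between the averaged vectors of two word sets
print(model.n_similarity(['spa', 'pool', 'gym'], ['fitness', 'workout']))
print(model.n_similarity(['spa', 'pool', 'gym'], ['taxi', 'shuttle']))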

In [ ]: