In [ ]:
!pip install gensim

In [2]:
from gensim import corpora, similarities
from gensim.models import Word2Vec

import os
import multiprocessing

input_filename = '/Users/swkim/Data/namuwiki180326/_namuwiki_20180326_mini_pos_tagged_corpus.txt'
model_path = '/Users/swkim/Data/namuwiki180326_word2vec_model/_model'

class SentenceReader(object):
    def __init__(self, input_filename):
        self.input_filename = input_filename
 
    def __iter__(self):
        for line in open(input_filename):
            yield line.split(' ')

sentences_vocab = SentenceReader(input_filename) # a memory-friendly iterator
sentences_train = SentenceReader(input_filename) # a memory-friendly iterator

config = {
    'min_count': 10,  # 등장 횟수가 10 이하인 단어는 무시
    'size': 300,  # 300차원짜리 벡터스페이스에 embedding
    'sg': 1,  # 0이면 CBOW, 1이면 skip-gram을 사용한다
    'batch_words': 10000,  # 사전을 구축할때 한번에 읽을 단어 수
    'iter': 10,  # 보통 딥러닝에서 말하는 epoch과 비슷한, 반복 횟수
    'workers': multiprocessing.cpu_count(),
}
word2vec_model = Word2Vec(**config)

In [3]:
token_count = sum([len(sentence) for sentence in sentences_vocab])
print(token_count)


10101243

In [4]:
word2vec_model.build_vocab(sentences_vocab)

In [6]:
word2vec_model.train(sentences_train, total_examples = token_count, epochs=word2vec_model.iter)


/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).
  """Entry point for launching an IPython kernel.
Out[6]:
(83179847, 101012430)

In [12]:
word2vec_model.wv['컴퓨터/Noun']


Out[12]:
array([ 1.96633279e-01,  2.48613656e-01, -2.45394304e-01, -2.56317645e-01,
        4.74541277e-01,  3.01026702e-02,  4.87546116e-01,  7.33239502e-02,
       -5.87272644e-01, -3.77035379e-01,  4.59482223e-02,  1.76913977e-01,
        2.15558521e-02, -1.87501684e-01, -4.71341431e-01, -1.91498622e-01,
       -9.00997370e-02, -1.88989416e-01, -1.22444250e-01, -2.40806062e-02,
        2.56040782e-01, -6.00680634e-02, -2.22483724e-01,  1.05650060e-01,
       -1.80254221e-01, -4.28380400e-01,  7.13798404e-02, -6.05015159e-01,
        3.07356715e-01,  5.84335029e-02,  4.71923389e-02, -5.38159132e-01,
        2.19250053e-01, -1.10728338e-01,  8.30922648e-02, -1.11769311e-01,
        3.73280764e-01, -2.50851899e-01,  5.41667081e-02,  1.86729014e-01,
       -2.61175662e-01,  1.41330943e-01, -3.25342603e-02, -4.20781486e-02,
       -2.76073247e-01, -8.15403908e-02, -2.20733300e-01, -1.49960816e-01,
       -4.13712293e-01,  3.02146107e-01,  1.40500247e-01, -1.36454061e-01,
       -1.67279676e-01,  8.24759007e-02, -1.81093216e-01,  1.67482048e-01,
        3.71372193e-01, -6.92298189e-02,  4.33557713e-03,  4.13014181e-02,
        3.36684957e-02, -2.16791719e-01, -3.47655296e-01,  3.78722399e-01,
        1.77493393e-01, -3.38427901e-01,  1.82618111e-01, -1.19372025e-01,
        5.97676868e-03, -4.19446826e-01, -7.68101960e-02,  2.87068158e-01,
       -4.22270626e-01,  3.82094592e-01,  6.70285001e-02,  4.30327147e-01,
        7.27928206e-02, -2.10601345e-01,  3.35319936e-01, -2.45708749e-01,
       -9.91251841e-02, -3.38260867e-02, -1.84034090e-02,  1.32642850e-01,
       -1.10084444e-01, -9.27782953e-02,  5.45493960e-02, -2.70131350e-01,
       -4.16198462e-01, -1.82379633e-01,  7.91987851e-02,  1.18133046e-01,
        3.53333503e-01,  9.15836841e-02,  4.72103775e-01, -4.69782293e-01,
       -1.80226505e-01, -9.16002095e-02,  1.26059237e-03,  2.95199484e-01,
        5.74038446e-01,  3.34445298e-01,  4.32193018e-02,  1.99793473e-01,
       -5.89150004e-02, -8.25238749e-02,  2.00352892e-01, -2.33546928e-01,
       -5.03906459e-02, -3.92877340e-01, -2.16950789e-01, -3.21619242e-01,
        2.18994707e-01, -2.41792366e-01,  3.52628171e-01, -1.59246586e-02,
        8.18535686e-03,  1.21118724e-01, -8.60626251e-02, -2.81175494e-01,
        2.84218431e-01,  9.50501636e-02, -2.75884241e-01, -2.23205220e-02,
       -2.77463067e-02, -8.85836110e-02, -3.30693871e-01, -1.55010924e-01,
       -3.03453449e-02,  3.00415736e-02,  2.45037049e-01,  1.85975015e-01,
       -5.89360073e-02, -4.83605154e-02, -6.54176772e-02,  4.60285872e-01,
       -7.40539432e-02,  2.33733088e-01, -2.98346346e-03,  2.86837846e-01,
        5.14750183e-02,  4.19609845e-01, -1.81906924e-01, -2.39353940e-01,
       -3.11446697e-01,  1.92439452e-01,  2.63882875e-01,  2.13398367e-01,
       -3.73577893e-01, -9.06985924e-02,  3.62905413e-01,  1.40135437e-01,
        1.70850068e-01,  1.41871035e-01,  7.44267926e-02,  2.47524500e-01,
        2.46565223e-01,  1.03810810e-01, -5.03858598e-03, -3.92798215e-01,
       -2.60481417e-01,  9.41614285e-02, -2.93596625e-01,  1.76164612e-01,
        3.89468610e-01,  2.43579179e-01,  5.85463405e-01,  5.17072007e-02,
        3.81490886e-01,  3.00789654e-01, -9.10783745e-03,  2.62502849e-01,
        5.57327628e-01,  3.03610470e-02,  4.35912341e-01,  1.30580515e-01,
        6.99765384e-02, -3.49960715e-01, -1.95075229e-01, -1.83452323e-01,
        1.08182997e-01,  1.05008475e-01,  6.05515480e-01, -2.35972449e-01,
        7.45770633e-02,  4.81134630e-04, -9.37304925e-03,  9.12125185e-02,
       -9.92347524e-02, -1.61523432e-01,  4.54760678e-02,  1.01462506e-01,
       -5.61557934e-02, -2.53882378e-01,  1.95822269e-02, -2.37109408e-01,
        1.45643026e-01,  5.32866679e-02, -3.51703823e-01,  4.16706242e-02,
        8.01131427e-02, -2.48289049e-01, -1.50226116e-01, -8.80241275e-01,
        2.11828455e-01, -1.55376598e-01,  1.92538515e-01, -2.05793127e-01,
        4.83399093e-01,  1.59928188e-01, -1.48797423e-01,  1.18253894e-01,
       -2.34961495e-01,  2.12588231e-03,  2.89692193e-01, -3.10200483e-01,
        5.18356683e-04,  1.15621579e-03,  7.68214315e-02, -9.09952521e-02,
       -7.53916800e-02, -3.11792959e-02, -1.48613583e-02,  3.93877067e-02,
       -2.90110886e-01, -8.33157673e-02, -5.92106581e-02,  2.05625132e-01,
       -1.66439608e-01,  2.86595672e-01, -4.79720652e-01,  1.93150304e-02,
       -3.80717590e-02,  1.84071839e-01, -3.91827792e-01, -6.31878078e-02,
        3.26163918e-01,  2.77901888e-01, -4.69847649e-01,  1.02160731e-03,
        3.12599689e-01, -6.33758068e-01,  1.79474391e-02, -3.17587346e-01,
        4.21647668e-01,  3.34556997e-02,  1.08245775e-01,  1.72977388e-01,
        3.69339027e-02,  2.57461984e-02, -1.35849416e-01,  3.71918559e-01,
       -4.59911823e-01, -1.34757221e-01, -5.01818359e-01, -3.74632806e-01,
       -7.98048526e-02, -2.18060523e-01, -1.65924683e-01, -4.35505845e-02,
        1.12242073e-01,  8.89115557e-02,  2.33236015e-01, -1.92003727e-01,
        5.55859730e-02, -6.59231767e-02,  1.12817056e-01,  1.27397731e-01,
       -1.97972476e-01,  3.30644935e-01,  3.61190975e-01, -5.59928231e-02,
       -2.47071199e-02,  1.15120992e-01,  4.03024144e-02, -6.60821795e-02,
        4.67173070e-01,  1.21420905e-01, -1.26042245e-02,  1.38601542e-01,
        1.15941726e-02,  2.56831795e-01,  1.08997837e-01, -1.01703638e-02,
       -7.08038034e-03,  1.69635117e-02,  3.44201252e-02,  2.19572261e-01,
        1.87270656e-01, -3.15116584e-01,  2.20931247e-01,  2.72078812e-01,
       -1.23649642e-01, -7.26786107e-02,  5.12815341e-02, -5.02058677e-02,
       -3.37342143e-01,  4.77617055e-01,  1.69076055e-01, -1.55936494e-01],
      dtype=float32)

In [11]:
word2vec_model.most_similar(['컴퓨터/Noun'])


/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.
Out[11]:
[('인터프리터/Noun', 0.4992220997810364),
 ('데스크톱/Noun', 0.48961344361305237),
 ('사운드카드/Noun', 0.47900864481925964),
 ('임베디드/Noun', 0.4784030020236969),
 ('운영체제/Noun', 0.47274693846702576),
 ('인코더/Noun', 0.4715920090675354),
 ('인터페이스/Noun', 0.46802210807800293),
 ('주변기기/Noun', 0.46601858735084534),
 ('컴퓨팅/Noun', 0.4629734456539154),
 ('데이터베이스/Noun', 0.4629448354244232)]

In [13]:
word2vec_model.most_similar(positive = ['서울/Noun', '미국/Noun'], negative=['한국/Noun'])


/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.
Out[13]:
[('런던/Noun', 0.42090463638305664),
 ('독일/Noun', 0.3893357217311859),
 ('국회의사당/Noun', 0.3868353068828583),
 ('컬럼비아/Noun', 0.38636940717697144),
 ('로스앤젤레스/Noun', 0.3854248523712158),
 ('교향악단/Noun', 0.3837340772151947),
 ('브뤼셀/Noun', 0.38252902030944824),
 ('레닌그라드/Noun', 0.38170433044433594),
 ('로스엔젤레스/Noun', 0.3792198598384857),
 ('지방법원/Noun', 0.3791108727455139)]

In [14]:
word2vec_model.save(model_path)

In [15]:
word2vec_model = Word2Vec.load(model_path)

In [ ]: