Word2Vec Word Embeddings


In [1]:
import numpy as np
from gensim.models import Word2Vec
import re
import warnings
# Suppress every warning raised from this point on. Because this filter is
# installed after gensim has already been imported above, warnings emitted
# while importing gensim are still shown.
warnings.filterwarnings('ignore')
import logging
# Route INFO-level log records to the notebook output with a timestamp and
# level prefix, so library progress messages are visible while cells run.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


c:\python\python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [2]:
def load_doc(filename, encoding=None):
    """Read a text file and return its contents as a list of lines.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding, which is what this
            function used before the parameter existed.

    Returns:
        list of str: one entry per line, with line terminators removed.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read().splitlines()

In [4]:
# Load the corpus: each line of the dictionary file is treated as one
# training "sentence".
words = load_doc('Data/Dictionary.txt')

# Lower-case every line and split it on whitespace. str.split() with no
# separator collapses runs of whitespace and ignores leading/trailing blanks,
# so no empty-string tokens are produced (splitting on a single ' ' would
# emit '' for every doubled space, and min_count=1 would keep it as a word).
# Lines that contain no tokens at all are skipped.
sentences = []
for line in words:
    tokens = line.lower().split()
    if tokens:
        sentences.append(tokens)

In [6]:
# Train the embedding model on the tokenised sentences.
#   size=200     -> each word is represented by a 200-dimensional vector
#   min_count=1  -> keep every word, even those that occur only once
#   workers=4    -> train with four worker threads
#   hs=1, negative=0 -> use hierarchical softmax, with negative sampling off
model = Word2Vec(
    sentences,
    size=200,
    min_count=1,
    workers=4,
    hs=1,
    negative=0
)


2018-05-14 21:15:01,925 : INFO : collecting all words and their counts
2018-05-14 21:15:01,929 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-14 21:15:01,968 : INFO : PROGRESS: at sentence #10000, processed 40981 words, keeping 1593 word types
2018-05-14 21:15:01,993 : INFO : PROGRESS: at sentence #20000, processed 83622 words, keeping 2111 word types
2018-05-14 21:15:02,032 : INFO : PROGRESS: at sentence #30000, processed 122578 words, keeping 2532 word types
2018-05-14 21:15:02,055 : INFO : PROGRESS: at sentence #40000, processed 154083 words, keeping 3260 word types
2018-05-14 21:15:02,085 : INFO : PROGRESS: at sentence #50000, processed 193580 words, keeping 3748 word types
2018-05-14 21:15:02,107 : INFO : PROGRESS: at sentence #60000, processed 229436 words, keeping 4073 word types
2018-05-14 21:15:02,138 : INFO : PROGRESS: at sentence #70000, processed 265030 words, keeping 4403 word types
2018-05-14 21:15:02,164 : INFO : PROGRESS: at sentence #80000, processed 301292 words, keeping 4593 word types
2018-05-14 21:15:02,187 : INFO : PROGRESS: at sentence #90000, processed 340756 words, keeping 4729 word types
2018-05-14 21:15:02,208 : INFO : PROGRESS: at sentence #100000, processed 375406 words, keeping 5221 word types
2018-05-14 21:15:02,234 : INFO : PROGRESS: at sentence #110000, processed 411580 words, keeping 5532 word types
2018-05-14 21:15:02,255 : INFO : PROGRESS: at sentence #120000, processed 443530 words, keeping 5942 word types
2018-05-14 21:15:02,278 : INFO : PROGRESS: at sentence #130000, processed 477970 words, keeping 6134 word types
2018-05-14 21:15:02,300 : INFO : PROGRESS: at sentence #140000, processed 513180 words, keeping 6317 word types
2018-05-14 21:15:02,323 : INFO : PROGRESS: at sentence #150000, processed 548566 words, keeping 6689 word types
2018-05-14 21:15:02,346 : INFO : PROGRESS: at sentence #160000, processed 587481 words, keeping 6824 word types
2018-05-14 21:15:02,369 : INFO : PROGRESS: at sentence #170000, processed 631491 words, keeping 6863 word types
2018-05-14 21:15:02,394 : INFO : PROGRESS: at sentence #180000, processed 668700 words, keeping 7030 word types
2018-05-14 21:15:02,421 : INFO : PROGRESS: at sentence #190000, processed 720606 words, keeping 7321 word types
2018-05-14 21:15:02,451 : INFO : PROGRESS: at sentence #200000, processed 766708 words, keeping 7569 word types
2018-05-14 21:15:02,475 : INFO : PROGRESS: at sentence #210000, processed 808825 words, keeping 7623 word types
2018-05-14 21:15:02,499 : INFO : PROGRESS: at sentence #220000, processed 855081 words, keeping 7629 word types
2018-05-14 21:15:02,529 : INFO : PROGRESS: at sentence #230000, processed 904848 words, keeping 7636 word types
2018-05-14 21:15:02,553 : INFO : PROGRESS: at sentence #240000, processed 952591 words, keeping 7638 word types
2018-05-14 21:15:02,583 : INFO : PROGRESS: at sentence #250000, processed 1008880 words, keeping 7639 word types
2018-05-14 21:15:02,609 : INFO : PROGRESS: at sentence #260000, processed 1058423 words, keeping 7644 word types
2018-05-14 21:15:02,633 : INFO : PROGRESS: at sentence #270000, processed 1107054 words, keeping 7656 word types
2018-05-14 21:15:02,659 : INFO : PROGRESS: at sentence #280000, processed 1155437 words, keeping 7670 word types
2018-05-14 21:15:02,684 : INFO : PROGRESS: at sentence #290000, processed 1205420 words, keeping 7677 word types
2018-05-14 21:15:02,708 : INFO : PROGRESS: at sentence #300000, processed 1252794 words, keeping 7703 word types
2018-05-14 21:15:02,732 : INFO : PROGRESS: at sentence #310000, processed 1297656 words, keeping 7721 word types
2018-05-14 21:15:02,755 : INFO : PROGRESS: at sentence #320000, processed 1340172 words, keeping 7737 word types
2018-05-14 21:15:02,779 : INFO : PROGRESS: at sentence #330000, processed 1380553 words, keeping 7790 word types
2018-05-14 21:15:02,804 : INFO : PROGRESS: at sentence #340000, processed 1423571 words, keeping 7848 word types
2018-05-14 21:15:02,828 : INFO : PROGRESS: at sentence #350000, processed 1470314 words, keeping 7848 word types
2018-05-14 21:15:02,862 : INFO : PROGRESS: at sentence #360000, processed 1519796 words, keeping 7848 word types
2018-05-14 21:15:02,901 : INFO : PROGRESS: at sentence #370000, processed 1568132 words, keeping 7848 word types
2018-05-14 21:15:02,928 : INFO : PROGRESS: at sentence #380000, processed 1624318 words, keeping 7848 word types
2018-05-14 21:15:02,959 : INFO : PROGRESS: at sentence #390000, processed 1673946 words, keeping 7848 word types
2018-05-14 21:15:03,001 : INFO : PROGRESS: at sentence #400000, processed 1722607 words, keeping 7848 word types
2018-05-14 21:15:03,024 : INFO : PROGRESS: at sentence #410000, processed 1771017 words, keeping 7848 word types
2018-05-14 21:15:03,053 : INFO : PROGRESS: at sentence #420000, processed 1820984 words, keeping 7848 word types
2018-05-14 21:15:03,083 : INFO : PROGRESS: at sentence #430000, processed 1868346 words, keeping 7848 word types
2018-05-14 21:15:03,108 : INFO : PROGRESS: at sentence #440000, processed 1912274 words, keeping 7903 word types
2018-05-14 21:15:03,137 : INFO : PROGRESS: at sentence #450000, processed 1959736 words, keeping 7954 word types
2018-05-14 21:15:03,163 : INFO : PROGRESS: at sentence #460000, processed 2001762 words, keeping 8005 word types
2018-05-14 21:15:03,186 : INFO : PROGRESS: at sentence #470000, processed 2038156 words, keeping 8223 word types
2018-05-14 21:15:03,210 : INFO : PROGRESS: at sentence #480000, processed 2074788 words, keeping 8332 word types
2018-05-14 21:15:03,232 : INFO : PROGRESS: at sentence #490000, processed 2112442 words, keeping 8419 word types
2018-05-14 21:15:03,253 : INFO : PROGRESS: at sentence #500000, processed 2146867 words, keeping 8549 word types
2018-05-14 21:15:03,281 : INFO : PROGRESS: at sentence #510000, processed 2184770 words, keeping 8636 word types
2018-05-14 21:15:03,310 : INFO : PROGRESS: at sentence #520000, processed 2222901 words, keeping 8700 word types
2018-05-14 21:15:03,341 : INFO : PROGRESS: at sentence #530000, processed 2259286 words, keeping 8803 word types
2018-05-14 21:15:03,363 : INFO : PROGRESS: at sentence #540000, processed 2298540 words, keeping 8859 word types
2018-05-14 21:15:03,388 : INFO : PROGRESS: at sentence #550000, processed 2339902 words, keeping 8909 word types
2018-05-14 21:15:03,411 : INFO : PROGRESS: at sentence #560000, processed 2377208 words, keeping 8976 word types
2018-05-14 21:15:03,445 : INFO : PROGRESS: at sentence #570000, processed 2412528 words, keeping 9031 word types
2018-05-14 21:15:03,466 : INFO : PROGRESS: at sentence #580000, processed 2449012 words, keeping 9110 word types
2018-05-14 21:15:03,481 : INFO : collected 9209 word types from a corpus of 2473995 raw words and 587715 sentences
2018-05-14 21:15:03,484 : INFO : Loading a fresh vocabulary
2018-05-14 21:15:03,530 : INFO : min_count=1 retains 9209 unique words (100% of original 9209, drops 0)
2018-05-14 21:15:03,532 : INFO : min_count=1 leaves 2473995 word corpus (100% of original 2473995, drops 0)
2018-05-14 21:15:03,611 : INFO : deleting the raw counts dictionary of 9209 items
2018-05-14 21:15:03,613 : INFO : sample=0.001 downsamples 68 most-common words
2018-05-14 21:15:03,615 : INFO : downsampling leaves estimated 1848026 word corpus (74.7% of prior 2473995)
2018-05-14 21:15:03,616 : INFO : estimated required memory for 9209 words and 200 dimensions: 21180700 bytes
2018-05-14 21:15:03,634 : INFO : constructing a huffman tree from 9209 words
2018-05-14 21:15:04,062 : INFO : built huffman tree with maximum node depth 21
2018-05-14 21:15:04,074 : INFO : resetting layer weights
2018-05-14 21:15:04,394 : INFO : training model with 4 workers on 9209 vocabulary and 200 features, using sg=0 hs=1 sample=0.001 negative=0 window=5
2018-05-14 21:15:05,414 : INFO : PROGRESS: at 5.01% examples, 470221 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:06,406 : INFO : PROGRESS: at 11.54% examples, 528939 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:07,411 : INFO : PROGRESS: at 18.04% examples, 551292 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:08,428 : INFO : PROGRESS: at 25.66% examples, 591634 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:09,444 : INFO : PROGRESS: at 33.01% examples, 603716 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:10,430 : INFO : PROGRESS: at 40.01% examples, 612224 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:11,447 : INFO : PROGRESS: at 46.93% examples, 619173 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:12,473 : INFO : PROGRESS: at 53.46% examples, 610852 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:13,487 : INFO : PROGRESS: at 60.15% examples, 612157 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:14,496 : INFO : PROGRESS: at 67.28% examples, 618161 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:15,521 : INFO : PROGRESS: at 74.33% examples, 616200 words/s, in_qsize 7, out_qsize 2
2018-05-14 21:15:16,515 : INFO : PROGRESS: at 81.02% examples, 618989 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:17,531 : INFO : PROGRESS: at 88.67% examples, 625091 words/s, in_qsize 8, out_qsize 0
2018-05-14 21:15:18,565 : INFO : PROGRESS: at 95.55% examples, 623180 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:19,121 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-05-14 21:15:19,138 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-14 21:15:19,157 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-14 21:15:19,170 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-14 21:15:19,171 : INFO : training on 12369975 raw words (9241135 effective words) took 14.8s, 625718 effective words/s

In [7]:
# Print the model's one-line text summary.
print(model)


Word2Vec(vocab=9209, size=200, alpha=0.025)

In [8]:
# Collect the keys of the trained model's vocabulary mapping.
# Note: this rebinds `words`, which earlier held the raw lines of the
# dictionary file.
words = list(model.wv.vocab)

In [9]:
# Show the learned vector for the token 'htn'. The lookup goes through the
# model's word-vector store (model.wv), the same object used for the
# vocabulary above, instead of indexing the model object directly, which
# gensim has deprecated.
print(model.wv['htn'])


[-0.16159943  0.14019014 -0.07255719 -0.20575488  0.72903645 -0.55635703
  0.85412264 -0.34359896 -0.31746644 -0.32048941 -0.65795773  0.61979234
  0.23127751  0.10298863  0.06665805 -0.22100185 -0.07442544  0.58151126
  0.1511952   0.03764169  0.33917949 -0.60502768  0.13392024  0.05951749
  0.08938409 -0.37683186  0.13353322 -0.07203946  0.28057733 -0.19731362
 -0.03616512  0.07342356 -0.00133    -0.38165814  0.16000949 -0.18942365
  0.53532767 -0.37107676  0.16013144  0.15260385 -0.05059931 -0.21138492
  0.49394286 -0.31780955  0.44223544  0.08498331  0.14112577  0.54882473
 -0.50177288  0.04734901 -0.82816863  0.10204273  0.19656208  0.26885754
  0.04904519 -0.28119352  0.39181671  0.26403868  0.1726854   0.04740996
  0.21523838  0.74642664  0.02258014 -0.38103712  0.30467874  0.4978264
 -0.38591063 -0.11380886 -0.24905485  0.18675399 -0.29905692  0.10811967
  0.1104599   0.04441717  0.48090035 -0.03979092 -0.13645086  0.1396915
 -0.43074805  0.12610394  0.36874434 -0.15483695  0.07143651 -0.05659245
 -0.12279928 -0.29733855 -0.19125427  0.21444722  0.06005018  0.21220337
  0.26229322 -0.35601294  0.04038214  0.29133272  0.40025511 -0.60056967
  0.0598099  -0.48129115  0.0462813   0.05842364 -0.44989526 -0.02950737
 -0.29943925  0.13010964  0.07699747 -0.24883518 -0.49564022  0.21402156
  0.07652833  0.09473322  0.0692455  -0.02028495  0.11401746 -0.32651556
 -0.31693348 -0.46751866  0.05676005 -0.46189111  0.19786322  0.04002017
 -0.21727712 -0.1099363  -0.0609546   0.17475837 -0.08879087  0.46824574
 -0.11908066 -0.1372623  -0.1877066  -0.42981881  0.42479086 -0.10933876
  0.00823757 -0.08627943 -0.29718593  0.13904281 -0.1377075   0.03910933
  0.23667112 -0.32014027  0.57764959 -0.36799043  0.04143166  0.17843759
 -0.02670532 -0.0751866  -0.13058931 -0.14003611 -0.05510529  0.14592604
  0.05041722 -0.15116628  0.55263227  0.0047082  -0.07633208 -0.37795362
  0.11542226 -0.14117117 -0.35518777  0.26740149 -0.35222843  0.05663204
 -0.30115712  0.18761489 -0.45409983  0.00721784 -0.3082661   0.10816953
 -0.50989181  0.03989371  0.34952426  0.11479736 -0.43714347 -0.4310317
  0.23571451  0.39162406 -0.01750166  0.12602578  0.03189791 -0.43512312
 -0.0752854   0.01362873 -0.23373926  0.02388891 -0.26917976  0.02396742
 -0.04581464  0.02265125 -0.4767209   0.01242359 -0.42058998  0.50935495
 -0.08304451 -0.54694545 -0.50837678 -0.06508687 -0.15124534 -0.4037146
 -0.02363339 -0.1337547 ]

In [10]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): save() appears to write gensim's own serialisation rather than
# the word2vec C binary format, despite the .bin suffix. Confirm before
# handing the file to other tools.
model.save('Data/model.bin')


2018-05-14 21:16:21,561 : INFO : saving Word2Vec object under Data/model.bin, separately None
2018-05-14 21:16:21,565 : INFO : not storing attribute syn0norm
2018-05-14 21:16:21,568 : INFO : not storing attribute cum_table
2018-05-14 21:16:21,984 : INFO : saved Data/model.bin

In [11]:
# Reload the model from the exact path it was saved to. The directory name
# must match in case ('Data', not 'data'), otherwise the load only works on
# case-insensitive file systems such as Windows.
model = Word2Vec.load('Data/model.bin')


2018-05-14 21:17:42,693 : INFO : loading Word2Vec object from data/model.bin
2018-05-14 21:17:42,989 : INFO : loading wv recursively from data/model.bin.wv.* with mmap=None
2018-05-14 21:17:42,989 : INFO : setting ignored attribute syn0norm to None
2018-05-14 21:17:42,989 : INFO : setting ignored attribute cum_table to None
2018-05-14 21:17:42,989 : INFO : loaded data/model.bin