In [1]:
import numpy as np
from gensim.models import Word2Vec
import re
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
c:\python\python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
In [2]:
def load_doc(filename):
file = open(filename, 'r')
text = file.read().splitlines()
file.close()
return text
In [4]:
words = load_doc('Data/Dictionary.txt')
sentences = []
words = np.array(words)
for word in words:
spl = re.split(' ',word.strip().lower())
sentences.append([i for i in spl])
In [6]:
model = Word2Vec(sentences,size=200,min_count=1,workers=4,hs=1,negative=0)
2018-05-14 21:15:01,925 : INFO : collecting all words and their counts
2018-05-14 21:15:01,929 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-14 21:15:01,968 : INFO : PROGRESS: at sentence #10000, processed 40981 words, keeping 1593 word types
2018-05-14 21:15:01,993 : INFO : PROGRESS: at sentence #20000, processed 83622 words, keeping 2111 word types
2018-05-14 21:15:02,032 : INFO : PROGRESS: at sentence #30000, processed 122578 words, keeping 2532 word types
2018-05-14 21:15:02,055 : INFO : PROGRESS: at sentence #40000, processed 154083 words, keeping 3260 word types
2018-05-14 21:15:02,085 : INFO : PROGRESS: at sentence #50000, processed 193580 words, keeping 3748 word types
2018-05-14 21:15:02,107 : INFO : PROGRESS: at sentence #60000, processed 229436 words, keeping 4073 word types
2018-05-14 21:15:02,138 : INFO : PROGRESS: at sentence #70000, processed 265030 words, keeping 4403 word types
2018-05-14 21:15:02,164 : INFO : PROGRESS: at sentence #80000, processed 301292 words, keeping 4593 word types
2018-05-14 21:15:02,187 : INFO : PROGRESS: at sentence #90000, processed 340756 words, keeping 4729 word types
2018-05-14 21:15:02,208 : INFO : PROGRESS: at sentence #100000, processed 375406 words, keeping 5221 word types
2018-05-14 21:15:02,234 : INFO : PROGRESS: at sentence #110000, processed 411580 words, keeping 5532 word types
2018-05-14 21:15:02,255 : INFO : PROGRESS: at sentence #120000, processed 443530 words, keeping 5942 word types
2018-05-14 21:15:02,278 : INFO : PROGRESS: at sentence #130000, processed 477970 words, keeping 6134 word types
2018-05-14 21:15:02,300 : INFO : PROGRESS: at sentence #140000, processed 513180 words, keeping 6317 word types
2018-05-14 21:15:02,323 : INFO : PROGRESS: at sentence #150000, processed 548566 words, keeping 6689 word types
2018-05-14 21:15:02,346 : INFO : PROGRESS: at sentence #160000, processed 587481 words, keeping 6824 word types
2018-05-14 21:15:02,369 : INFO : PROGRESS: at sentence #170000, processed 631491 words, keeping 6863 word types
2018-05-14 21:15:02,394 : INFO : PROGRESS: at sentence #180000, processed 668700 words, keeping 7030 word types
2018-05-14 21:15:02,421 : INFO : PROGRESS: at sentence #190000, processed 720606 words, keeping 7321 word types
2018-05-14 21:15:02,451 : INFO : PROGRESS: at sentence #200000, processed 766708 words, keeping 7569 word types
2018-05-14 21:15:02,475 : INFO : PROGRESS: at sentence #210000, processed 808825 words, keeping 7623 word types
2018-05-14 21:15:02,499 : INFO : PROGRESS: at sentence #220000, processed 855081 words, keeping 7629 word types
2018-05-14 21:15:02,529 : INFO : PROGRESS: at sentence #230000, processed 904848 words, keeping 7636 word types
2018-05-14 21:15:02,553 : INFO : PROGRESS: at sentence #240000, processed 952591 words, keeping 7638 word types
2018-05-14 21:15:02,583 : INFO : PROGRESS: at sentence #250000, processed 1008880 words, keeping 7639 word types
2018-05-14 21:15:02,609 : INFO : PROGRESS: at sentence #260000, processed 1058423 words, keeping 7644 word types
2018-05-14 21:15:02,633 : INFO : PROGRESS: at sentence #270000, processed 1107054 words, keeping 7656 word types
2018-05-14 21:15:02,659 : INFO : PROGRESS: at sentence #280000, processed 1155437 words, keeping 7670 word types
2018-05-14 21:15:02,684 : INFO : PROGRESS: at sentence #290000, processed 1205420 words, keeping 7677 word types
2018-05-14 21:15:02,708 : INFO : PROGRESS: at sentence #300000, processed 1252794 words, keeping 7703 word types
2018-05-14 21:15:02,732 : INFO : PROGRESS: at sentence #310000, processed 1297656 words, keeping 7721 word types
2018-05-14 21:15:02,755 : INFO : PROGRESS: at sentence #320000, processed 1340172 words, keeping 7737 word types
2018-05-14 21:15:02,779 : INFO : PROGRESS: at sentence #330000, processed 1380553 words, keeping 7790 word types
2018-05-14 21:15:02,804 : INFO : PROGRESS: at sentence #340000, processed 1423571 words, keeping 7848 word types
2018-05-14 21:15:02,828 : INFO : PROGRESS: at sentence #350000, processed 1470314 words, keeping 7848 word types
2018-05-14 21:15:02,862 : INFO : PROGRESS: at sentence #360000, processed 1519796 words, keeping 7848 word types
2018-05-14 21:15:02,901 : INFO : PROGRESS: at sentence #370000, processed 1568132 words, keeping 7848 word types
2018-05-14 21:15:02,928 : INFO : PROGRESS: at sentence #380000, processed 1624318 words, keeping 7848 word types
2018-05-14 21:15:02,959 : INFO : PROGRESS: at sentence #390000, processed 1673946 words, keeping 7848 word types
2018-05-14 21:15:03,001 : INFO : PROGRESS: at sentence #400000, processed 1722607 words, keeping 7848 word types
2018-05-14 21:15:03,024 : INFO : PROGRESS: at sentence #410000, processed 1771017 words, keeping 7848 word types
2018-05-14 21:15:03,053 : INFO : PROGRESS: at sentence #420000, processed 1820984 words, keeping 7848 word types
2018-05-14 21:15:03,083 : INFO : PROGRESS: at sentence #430000, processed 1868346 words, keeping 7848 word types
2018-05-14 21:15:03,108 : INFO : PROGRESS: at sentence #440000, processed 1912274 words, keeping 7903 word types
2018-05-14 21:15:03,137 : INFO : PROGRESS: at sentence #450000, processed 1959736 words, keeping 7954 word types
2018-05-14 21:15:03,163 : INFO : PROGRESS: at sentence #460000, processed 2001762 words, keeping 8005 word types
2018-05-14 21:15:03,186 : INFO : PROGRESS: at sentence #470000, processed 2038156 words, keeping 8223 word types
2018-05-14 21:15:03,210 : INFO : PROGRESS: at sentence #480000, processed 2074788 words, keeping 8332 word types
2018-05-14 21:15:03,232 : INFO : PROGRESS: at sentence #490000, processed 2112442 words, keeping 8419 word types
2018-05-14 21:15:03,253 : INFO : PROGRESS: at sentence #500000, processed 2146867 words, keeping 8549 word types
2018-05-14 21:15:03,281 : INFO : PROGRESS: at sentence #510000, processed 2184770 words, keeping 8636 word types
2018-05-14 21:15:03,310 : INFO : PROGRESS: at sentence #520000, processed 2222901 words, keeping 8700 word types
2018-05-14 21:15:03,341 : INFO : PROGRESS: at sentence #530000, processed 2259286 words, keeping 8803 word types
2018-05-14 21:15:03,363 : INFO : PROGRESS: at sentence #540000, processed 2298540 words, keeping 8859 word types
2018-05-14 21:15:03,388 : INFO : PROGRESS: at sentence #550000, processed 2339902 words, keeping 8909 word types
2018-05-14 21:15:03,411 : INFO : PROGRESS: at sentence #560000, processed 2377208 words, keeping 8976 word types
2018-05-14 21:15:03,445 : INFO : PROGRESS: at sentence #570000, processed 2412528 words, keeping 9031 word types
2018-05-14 21:15:03,466 : INFO : PROGRESS: at sentence #580000, processed 2449012 words, keeping 9110 word types
2018-05-14 21:15:03,481 : INFO : collected 9209 word types from a corpus of 2473995 raw words and 587715 sentences
2018-05-14 21:15:03,484 : INFO : Loading a fresh vocabulary
2018-05-14 21:15:03,530 : INFO : min_count=1 retains 9209 unique words (100% of original 9209, drops 0)
2018-05-14 21:15:03,532 : INFO : min_count=1 leaves 2473995 word corpus (100% of original 2473995, drops 0)
2018-05-14 21:15:03,611 : INFO : deleting the raw counts dictionary of 9209 items
2018-05-14 21:15:03,613 : INFO : sample=0.001 downsamples 68 most-common words
2018-05-14 21:15:03,615 : INFO : downsampling leaves estimated 1848026 word corpus (74.7% of prior 2473995)
2018-05-14 21:15:03,616 : INFO : estimated required memory for 9209 words and 200 dimensions: 21180700 bytes
2018-05-14 21:15:03,634 : INFO : constructing a huffman tree from 9209 words
2018-05-14 21:15:04,062 : INFO : built huffman tree with maximum node depth 21
2018-05-14 21:15:04,074 : INFO : resetting layer weights
2018-05-14 21:15:04,394 : INFO : training model with 4 workers on 9209 vocabulary and 200 features, using sg=0 hs=1 sample=0.001 negative=0 window=5
2018-05-14 21:15:05,414 : INFO : PROGRESS: at 5.01% examples, 470221 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:06,406 : INFO : PROGRESS: at 11.54% examples, 528939 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:07,411 : INFO : PROGRESS: at 18.04% examples, 551292 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:08,428 : INFO : PROGRESS: at 25.66% examples, 591634 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:09,444 : INFO : PROGRESS: at 33.01% examples, 603716 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:10,430 : INFO : PROGRESS: at 40.01% examples, 612224 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:11,447 : INFO : PROGRESS: at 46.93% examples, 619173 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:12,473 : INFO : PROGRESS: at 53.46% examples, 610852 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:13,487 : INFO : PROGRESS: at 60.15% examples, 612157 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:14,496 : INFO : PROGRESS: at 67.28% examples, 618161 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:15,521 : INFO : PROGRESS: at 74.33% examples, 616200 words/s, in_qsize 7, out_qsize 2
2018-05-14 21:15:16,515 : INFO : PROGRESS: at 81.02% examples, 618989 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:17,531 : INFO : PROGRESS: at 88.67% examples, 625091 words/s, in_qsize 8, out_qsize 0
2018-05-14 21:15:18,565 : INFO : PROGRESS: at 95.55% examples, 623180 words/s, in_qsize 7, out_qsize 0
2018-05-14 21:15:19,121 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-05-14 21:15:19,138 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-14 21:15:19,157 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-14 21:15:19,170 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-14 21:15:19,171 : INFO : training on 12369975 raw words (9241135 effective words) took 14.8s, 625718 effective words/s
In [7]:
print(model)
Word2Vec(vocab=9209, size=200, alpha=0.025)
In [8]:
words = list(model.wv.vocab)
In [9]:
print(model['htn'])
[-0.16159943 0.14019014 -0.07255719 -0.20575488 0.72903645 -0.55635703
0.85412264 -0.34359896 -0.31746644 -0.32048941 -0.65795773 0.61979234
0.23127751 0.10298863 0.06665805 -0.22100185 -0.07442544 0.58151126
0.1511952 0.03764169 0.33917949 -0.60502768 0.13392024 0.05951749
0.08938409 -0.37683186 0.13353322 -0.07203946 0.28057733 -0.19731362
-0.03616512 0.07342356 -0.00133 -0.38165814 0.16000949 -0.18942365
0.53532767 -0.37107676 0.16013144 0.15260385 -0.05059931 -0.21138492
0.49394286 -0.31780955 0.44223544 0.08498331 0.14112577 0.54882473
-0.50177288 0.04734901 -0.82816863 0.10204273 0.19656208 0.26885754
0.04904519 -0.28119352 0.39181671 0.26403868 0.1726854 0.04740996
0.21523838 0.74642664 0.02258014 -0.38103712 0.30467874 0.4978264
-0.38591063 -0.11380886 -0.24905485 0.18675399 -0.29905692 0.10811967
0.1104599 0.04441717 0.48090035 -0.03979092 -0.13645086 0.1396915
-0.43074805 0.12610394 0.36874434 -0.15483695 0.07143651 -0.05659245
-0.12279928 -0.29733855 -0.19125427 0.21444722 0.06005018 0.21220337
0.26229322 -0.35601294 0.04038214 0.29133272 0.40025511 -0.60056967
0.0598099 -0.48129115 0.0462813 0.05842364 -0.44989526 -0.02950737
-0.29943925 0.13010964 0.07699747 -0.24883518 -0.49564022 0.21402156
0.07652833 0.09473322 0.0692455 -0.02028495 0.11401746 -0.32651556
-0.31693348 -0.46751866 0.05676005 -0.46189111 0.19786322 0.04002017
-0.21727712 -0.1099363 -0.0609546 0.17475837 -0.08879087 0.46824574
-0.11908066 -0.1372623 -0.1877066 -0.42981881 0.42479086 -0.10933876
0.00823757 -0.08627943 -0.29718593 0.13904281 -0.1377075 0.03910933
0.23667112 -0.32014027 0.57764959 -0.36799043 0.04143166 0.17843759
-0.02670532 -0.0751866 -0.13058931 -0.14003611 -0.05510529 0.14592604
0.05041722 -0.15116628 0.55263227 0.0047082 -0.07633208 -0.37795362
0.11542226 -0.14117117 -0.35518777 0.26740149 -0.35222843 0.05663204
-0.30115712 0.18761489 -0.45409983 0.00721784 -0.3082661 0.10816953
-0.50989181 0.03989371 0.34952426 0.11479736 -0.43714347 -0.4310317
0.23571451 0.39162406 -0.01750166 0.12602578 0.03189791 -0.43512312
-0.0752854 0.01362873 -0.23373926 0.02388891 -0.26917976 0.02396742
-0.04581464 0.02265125 -0.4767209 0.01242359 -0.42058998 0.50935495
-0.08304451 -0.54694545 -0.50837678 -0.06508687 -0.15124534 -0.4037146
-0.02363339 -0.1337547 ]
In [10]:
model.save('Data/model.bin')
2018-05-14 21:16:21,561 : INFO : saving Word2Vec object under Data/model.bin, separately None
2018-05-14 21:16:21,565 : INFO : not storing attribute syn0norm
2018-05-14 21:16:21,568 : INFO : not storing attribute cum_table
2018-05-14 21:16:21,984 : INFO : saved Data/model.bin
In [11]:
model = Word2Vec.load('data/model.bin')
2018-05-14 21:17:42,693 : INFO : loading Word2Vec object from data/model.bin
2018-05-14 21:17:42,989 : INFO : loading wv recursively from data/model.bin.wv.* with mmap=None
2018-05-14 21:17:42,989 : INFO : setting ignored attribute syn0norm to None
2018-05-14 21:17:42,989 : INFO : setting ignored attribute cum_table to None
2018-05-14 21:17:42,989 : INFO : loaded data/model.bin
Content source: gedman4b/MachineLearning
Similar notebooks: