In [1]:
# until ttk is installed, add parent dir to path
import sys
sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# load the corpus
corpus = load_headline_corpus(verbose=True)

print('Headlines:', len(corpus.sents()))


Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 9.96 s
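
Word2Vec expects an iterable of tokenised sentences, i.e. lists of string tokens. A quick sanity check on the shape of the data; this sketch assumes corpus.sents() returns an indexable sequence of token lists.

In [ ]:
# peek at the first few tokenised headlines (assumes an indexable sequence)
for sent in corpus.sents()[:3]:
    print(sent)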

In [4]:
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = gensim.models.Word2Vec(corpus.sents(), min_count=10, workers=4)
print('Done training Word2Vec')


S:\Anaconda3\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Using TensorFlow backend.
2017-09-16 13:38:31,640 : INFO : collecting all words and their counts
2017-09-16 13:38:31,641 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-16 13:38:31,658 : INFO : PROGRESS: at sentence #10000, processed 120930 words, keeping 16346 word types
2017-09-16 13:38:31,678 : INFO : PROGRESS: at sentence #20000, processed 244972 words, keeping 23140 word types
2017-09-16 13:38:31,697 : INFO : PROGRESS: at sentence #30000, processed 369583 words, keeping 27819 word types
2017-09-16 13:38:31,716 : INFO : PROGRESS: at sentence #40000, processed 489975 words, keeping 31387 word types
2017-09-16 13:38:31,734 : INFO : PROGRESS: at sentence #50000, processed 611707 words, keeping 34592 word types
2017-09-16 13:38:31,754 : INFO : PROGRESS: at sentence #60000, processed 735270 words, keeping 37551 word types
2017-09-16 13:38:31,777 : INFO : PROGRESS: at sentence #70000, processed 857896 words, keeping 40223 word types
2017-09-16 13:38:31,797 : INFO : PROGRESS: at sentence #80000, processed 980891 words, keeping 42695 word types
2017-09-16 13:38:31,818 : INFO : PROGRESS: at sentence #90000, processed 1105387 words, keeping 44940 word types
2017-09-16 13:38:31,837 : INFO : PROGRESS: at sentence #100000, processed 1230761 words, keeping 47094 word types
2017-09-16 13:38:31,858 : INFO : PROGRESS: at sentence #110000, processed 1353936 words, keeping 49133 word types
2017-09-16 13:38:31,877 : INFO : PROGRESS: at sentence #120000, processed 1478469 words, keeping 51067 word types
2017-09-16 13:38:31,897 : INFO : PROGRESS: at sentence #130000, processed 1601591 words, keeping 52758 word types
2017-09-16 13:38:31,917 : INFO : PROGRESS: at sentence #140000, processed 1726149 words, keeping 54412 word types
2017-09-16 13:38:31,936 : INFO : PROGRESS: at sentence #150000, processed 1848459 words, keeping 56043 word types
2017-09-16 13:38:31,956 : INFO : PROGRESS: at sentence #160000, processed 1971437 words, keeping 57739 word types
2017-09-16 13:38:31,976 : INFO : PROGRESS: at sentence #170000, processed 2095738 words, keeping 59227 word types
2017-09-16 13:38:31,996 : INFO : PROGRESS: at sentence #180000, processed 2218582 words, keeping 60698 word types
2017-09-16 13:38:32,016 : INFO : PROGRESS: at sentence #190000, processed 2343161 words, keeping 62129 word types
2017-09-16 13:38:32,019 : INFO : collected 62172 word types from a corpus of 2348615 raw words and 190447 sentences
2017-09-16 13:38:32,020 : INFO : Loading a fresh vocabulary
2017-09-16 13:38:32,053 : INFO : min_count=10 retains 15826 unique words (25% of original 62172, drops 46346)
2017-09-16 13:38:32,054 : INFO : min_count=10 leaves 2228384 word corpus (94% of original 2348615, drops 120231)
2017-09-16 13:38:32,086 : INFO : deleting the raw counts dictionary of 62172 items
2017-09-16 13:38:32,088 : INFO : sample=0.001 downsamples 27 most-common words
2017-09-16 13:38:32,089 : INFO : downsampling leaves estimated 1788600 word corpus (80.3% of prior 2228384)
2017-09-16 13:38:32,090 : INFO : estimated required memory for 15826 words and 100 dimensions: 20573800 bytes
2017-09-16 13:38:32,124 : INFO : resetting layer weights
2017-09-16 13:38:32,268 : INFO : training model with 4 workers on 15826 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-09-16 13:38:33,282 : INFO : PROGRESS: at 22.73% examples, 2024628 words/s, in_qsize 7, out_qsize 0
2017-09-16 13:38:34,282 : INFO : PROGRESS: at 45.66% examples, 2036380 words/s, in_qsize 7, out_qsize 0
2017-09-16 13:38:35,285 : INFO : PROGRESS: at 68.73% examples, 2043583 words/s, in_qsize 7, out_qsize 0
2017-09-16 13:38:36,286 : INFO : PROGRESS: at 91.76% examples, 2047611 words/s, in_qsize 7, out_qsize 0
2017-09-16 13:38:36,622 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-09-16 13:38:36,625 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-09-16 13:38:36,627 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-09-16 13:38:36,632 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-09-16 13:38:36,633 : INFO : training on 11743075 raw words (8942538 effective words) took 4.4s, 2053886 effective words/s
Done training Word2Vec
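
The call above leans on gensim defaults for everything except min_count and workers. For the gensim version used here, an equivalent explicit call matching the logged configuration (100 dimensions, CBOW, negative sampling) would look like the sketch below; note that size was later renamed vector_size in gensim 4.0.

In [ ]:
# equivalent explicit call; a sketch only, matching the logged settings
model = gensim.models.Word2Vec(
    corpus.sents(),
    size=100,      # embedding dimensionality (default)
    window=5,      # context window (default)
    min_count=10,  # drop words seen fewer than 10 times
    sg=0,          # CBOW; sg=1 would select skip-gram
    negative=5,    # negative sampling (default)
    sample=0.001,  # frequent-word downsampling threshold (default)
    workers=4,
)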

In [24]:
# look up the raw 100-dimensional embedding for a word; model[word] is a
# shortcut for model.wv[word] (deprecated in later gensim versions), so
# both lookups return the same vector
v = model['Trump']
print(v)
wv = model.wv['Trump']
print(wv)


[ 0.29885244  0.59210825  0.6835134  -1.97689641 -1.77425933  2.72494745
 -1.80706406  3.09676409 -1.43100131  1.43180478 -2.84082317 -0.17713302
  0.39028874 -0.97122753  0.77371252 -0.41681597 -2.804039    0.4942773
  0.8730706  -0.52051955 -1.22874975  0.48163256  1.6346823   0.34572822
  2.15361166  0.63823193  0.4967922  -1.57182133  3.57459664  1.59961689
 -1.87780499 -0.8334884  -1.15448999  0.17362782  0.05077303 -1.9494617
  0.01808122 -0.24651465 -0.60728067 -0.39412042 -1.72290623 -0.5231334
  1.05408156  1.16079271  2.74360561 -0.71948671  1.4831742  -0.95651716
 -1.4159652   2.21552706 -0.65985197 -1.49993432 -1.14925528 -3.10891652
 -0.55597746  0.50553614 -0.08452856  1.08416414 -2.69906807  1.86237717
  1.05386806  1.24952865 -0.43346357 -0.29969639 -2.7266655  -0.08025009
  0.0321678  -1.85476863  0.49063626  0.36337584  1.47239769  4.60377884
  0.19552235  0.60449493 -1.64198732 -0.33059791  1.43331826 -1.43779182
 -0.52706558  1.33855557 -1.07231784 -3.47415829  1.48423481  0.74174565
  0.15316947 -0.53314239 -2.77853847 -0.06810275  0.7514838  -0.52830392
  0.38485211  0.72767335  0.0467535  -1.96421862  0.84556675 -1.90041423
  0.56232399  1.26560628 -0.29252657 -1.44901657]
[ 0.29885244  0.59210825  0.6835134  -1.97689641 -1.77425933  2.72494745
 -1.80706406  3.09676409 -1.43100131  1.43180478 -2.84082317 -0.17713302
  0.39028874 -0.97122753  0.77371252 -0.41681597 -2.804039    0.4942773
  0.8730706  -0.52051955 -1.22874975  0.48163256  1.6346823   0.34572822
  2.15361166  0.63823193  0.4967922  -1.57182133  3.57459664  1.59961689
 -1.87780499 -0.8334884  -1.15448999  0.17362782  0.05077303 -1.9494617
  0.01808122 -0.24651465 -0.60728067 -0.39412042 -1.72290623 -0.5231334
  1.05408156  1.16079271  2.74360561 -0.71948671  1.4831742  -0.95651716
 -1.4159652   2.21552706 -0.65985197 -1.49993432 -1.14925528 -3.10891652
 -0.55597746  0.50553614 -0.08452856  1.08416414 -2.69906807  1.86237717
  1.05386806  1.24952865 -0.43346357 -0.29969639 -2.7266655  -0.08025009
  0.0321678  -1.85476863  0.49063626  0.36337584  1.47239769  4.60377884
  0.19552235  0.60449493 -1.64198732 -0.33059791  1.43331826 -1.43779182
 -0.52706558  1.33855557 -1.07231784 -3.47415829  1.48423481  0.74174565
  0.15316947 -0.53314239 -2.77853847 -0.06810275  0.7514838  -0.52830392
  0.38485211  0.72767335  0.0467535  -1.96421862  0.84556675 -1.90041423
  0.56232399  1.26560628 -0.29252657 -1.44901657]
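
most_similar ranks candidates by cosine similarity between unit-normalised vectors. A minimal sketch recomputing the similarity of two vocabulary words by hand; the two printed values should agree.

In [ ]:
# cosine similarity by hand vs. gensim's built-in
import numpy as np

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(model.wv['Trump'], model.wv['Obama']))
print(model.wv.similarity('Trump', 'Obama'))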

In [16]:
# classic analogy query: king - man + woman ~ ?
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])


Out[16]:
[('prince', 0.8552632331848145),
 ('crown', 0.7827849388122559),
 ('royal', 0.7457411289215088),
 ('prime', 0.7392249703407288),
 ('interior', 0.7353373169898987),
 ('Mohammed', 0.727878749370575),
 ('Orban', 0.7038736343383789),
 ('newspaper', 0.69437175989151),
 ('bin', 0.6856281757354736),
 ('model', 0.6794185638427734)]
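
The analogy query above is plain vector arithmetic: king - man + woman, followed by a nearest-neighbour search. A hand-rolled version of the same query; results can differ slightly from the built-in because gensim unit-normalises each input vector before combining them.

In [ ]:
# king - man + woman, then nearest neighbours of the resulting raw vector
query = model.wv['king'] - model.wv['man'] + model.wv['woman']
print(model.wv.most_similar(positive=[query], topn=3))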

In [23]:
# which word doesn't belong with the others?
model.wv.doesnt_match("Trump Sessions Obama fire".split())


Out[23]:
'fire'
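
doesnt_match picks the word whose vector is furthest from the mean of the group. A sketch of the underlying computation on unit-normalised vectors:

In [ ]:
# reimplement doesnt_match: furthest word from the (normalised) group mean
import numpy as np

words = 'Trump Sessions Obama fire'.split()
vecs = np.array([model.wv[w] / np.linalg.norm(model.wv[w]) for w in words])
mean = vecs.mean(axis=0)
mean /= np.linalg.norm(mean)
sims = vecs.dot(mean)
print(words[int(np.argmin(sims))])  # expect 'fire'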

In [26]:
# round trip: look up a word's vector, then find the nearest word to that
# raw vector; the word itself comes back with similarity 1.0
wv = model.wv['Trump']
inverse = model.wv.most_similar(positive=[wv], topn=1)
print(inverse)


[('Trump', 1.0)]
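
Retraining on every run is wasteful; the trained model can be persisted and reloaded in a later session. A minimal sketch; the filename is an arbitrary choice.

In [ ]:
# save the trained model and reload it
model.save('headline_w2v.model')
reloaded = gensim.models.Word2Vec.load('headline_w2v.model')
print(reloaded.wv.most_similar('Trump', topn=3))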

In [ ]: