In [3]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.datasets import imdb
from keras import backend as K

from theano import function


Using Theano backend.

In [6]:
print("Loading data...")
max_features = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words = max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')


Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences

In [8]:
print("Pad sequences(samples x time)")
maxlen = 500
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


Pad sequences (samples x time)
X_train shape: (25000, 500)
X_test shape: (25000, 500)

In [9]:
print("Build model..")
model = Sequential()
model.add(Embedding(max_features, 128, input_length = maxlen))


Build model..

In [10]:
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
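A quick sanity check of the architecture before compiling; model.summary() prints the layer output shapes and parameter counts:

model.summary()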

In [13]:
model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics=["accuracy"])
print("Train..")
batch_size = 30
# fit() returns a History object with the per-epoch loss/accuracy used for the plots below
score = model.fit(X_train, y_train, batch_size = batch_size,
          nb_epoch = 4, validation_data = (X_test, y_test))


Train..
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 1224s - loss: 0.3778 - acc: 0.8353 - val_loss: 0.3857 - val_acc: 0.8345
Epoch 2/4
25000/25000 [==============================] - 1157s - loss: 0.3204 - acc: 0.8697 - val_loss: 0.5001 - val_acc: 0.7509
Epoch 3/4
25000/25000 [==============================] - 2144s - loss: 0.2639 - acc: 0.8935 - val_loss: 0.3063 - val_acc: 0.8739
Epoch 4/4
25000/25000 [==============================] - 2257s - loss: 0.2117 - acc: 0.9189 - val_loss: 0.3256 - val_acc: 0.8778
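After training, the held-out performance can also be read off directly; a minimal sketch using model.evaluate, which returns [loss, accuracy] given the metrics above:

loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print("Test loss: %.4f, test accuracy: %.4f" % (loss, acc))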

In [16]:
import matplotlib.pyplot as plt

# summarize history for accuracy
plt.plot(score.history['acc'])
plt.plot(score.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(score.history['loss'])
plt.plot(score.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()



In [17]:
X_train[0]


Out[17]:
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    1,   14,   22,   16,
         43,  530,  973, 1622, 1385,   65,  458, 4468,   66, 3941,    4,
        173,   36,  256,    5,   25,  100,   43,  838,  112,   50,  670,
          2,    9,   35,  480,  284,    5,  150,    4,  172,  112,  167,
          2,  336,  385,   39,    4,  172, 4536, 1111,   17,  546,   38,
         13,  447,    4,  192,   50,   16,    6,  147, 2025,   19,   14,
         22,    4, 1920, 4613,  469,    4,   22,   71,   87,   12,   16,
         43,  530,   38,   76,   15,   13, 1247,    4,   22,   17,  515,
         17,   12,   16,  626,   18,    2,    5,   62,  386,   12,    8,
        316,    8,  106,    5,    4, 2223,    2,   16,  480,   66, 3785,
         33,    4,  130,   12,   16,   38,  619,    5,   25,  124,   51,
         36,  135,   48,   25, 1415,   33,    6,   22,   12,  215,   28,
         77,   52,    5,   14,  407,   16,   82,    2,    8,    4,  107,
        117,    2,   15,  256,    4,    2,    7, 3766,    5,  723,   36,
         71,   43,  530,  476,   26,  400,  317,   46,    7,    4,    2,
       1029,   13,  104,   88,    4,  381,   15,  297,   98,   32, 2071,
         56,   26,  141,    6,  194,    2,   18,    4,  226,   22,   21,
        134,  476,   26,  480,    5,  144,   30,    2,   18,   51,   36,
         28,  224,   92,   25,  104,    4,  226,   65,   16,   38, 1334,
         88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,   15,
         16,    2,   19,  178,   32], dtype=int32)

In [18]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}


Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json

In [25]:
type(index_word.keys()[0])


Out[25]:
int

In [32]:
index_word[0] = '0'
' '.join(index_word[w] for w in X_train[0])


Out[32]:
u"0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s and with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over and for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at and but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with and film want an"

In [91]:
X_train_words = []
for sentence in X_train:
    # Note: `w` is an integer index, so the comparison against the string "0" is always
    # True and the padding tokens are kept (see Out[95] below); compare against the
    # integer 0 to drop them, as in the sketch just below.
    X_train_words += [[index_word[w] for w in sentence if w != "0"]]
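A sketch of the same loop comparing against the integer padding index 0, which actually drops the leading padding (under a separate name so the cells below still match the original output):

X_train_words_nopad = [[index_word[w] for w in sentence if w != 0]
                       for sentence in X_train]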

In [67]:
import gensim
import logging

In [92]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
w2v_model = gensim.models.Word2Vec(X_train_words, min_count=1)


2017-05-10 14:28:45,839 : INFO : collecting all words and their counts
2017-05-10 14:28:45,846 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-10 14:28:46,440 : INFO : PROGRESS: at sentence #10000, processed 5000000 words, keeping 4998 word types
2017-05-10 14:28:46,979 : INFO : PROGRESS: at sentence #20000, processed 10000000 words, keeping 4998 word types
2017-05-10 14:28:47,230 : INFO : collected 4998 word types from a corpus of 12500000 raw words and 25000 sentences
2017-05-10 14:28:47,231 : INFO : Loading a fresh vocabulary
2017-05-10 14:28:47,328 : INFO : min_count=1 retains 4998 unique words (100% of original 4998, drops 0)
2017-05-10 14:28:47,329 : INFO : min_count=1 leaves 12500000 word corpus (100% of original 12500000, drops 0)
2017-05-10 14:28:47,352 : INFO : deleting the raw counts dictionary of 4998 items
2017-05-10 14:28:47,353 : INFO : sample=0.001 downsamples 21 most-common words
2017-05-10 14:28:47,355 : INFO : downsampling leaves estimated 4531025 word corpus (36.2% of prior 12500000)
2017-05-10 14:28:47,357 : INFO : estimated required memory for 4998 words and 100 dimensions: 6497400 bytes
2017-05-10 14:28:47,383 : INFO : resetting layer weights
2017-05-10 14:28:47,482 : INFO : training model with 3 workers on 4998 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-05-10 14:28:48,489 : INFO : PROGRESS: at 3.94% examples, 903660 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:49,489 : INFO : PROGRESS: at 8.10% examples, 917340 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:28:50,491 : INFO : PROGRESS: at 12.38% examples, 934189 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:51,495 : INFO : PROGRESS: at 16.99% examples, 958488 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:52,504 : INFO : PROGRESS: at 21.39% examples, 967189 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:53,505 : INFO : PROGRESS: at 25.12% examples, 947237 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:54,509 : INFO : PROGRESS: at 28.21% examples, 910461 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:55,511 : INFO : PROGRESS: at 31.41% examples, 886526 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:28:56,513 : INFO : PROGRESS: at 35.98% examples, 902297 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:28:57,515 : INFO : PROGRESS: at 40.88% examples, 923592 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:58,520 : INFO : PROGRESS: at 45.39% examples, 932692 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:59,522 : INFO : PROGRESS: at 50.13% examples, 942927 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:00,526 : INFO : PROGRESS: at 54.67% examples, 949447 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:29:01,528 : INFO : PROGRESS: at 59.01% examples, 951779 words/s, in_qsize 4, out_qsize 0
2017-05-10 14:29:02,534 : INFO : PROGRESS: at 62.69% examples, 944540 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:03,535 : INFO : PROGRESS: at 66.53% examples, 939537 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:29:04,536 : INFO : PROGRESS: at 70.02% examples, 929941 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:29:05,537 : INFO : PROGRESS: at 73.34% examples, 920449 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:06,539 : INFO : PROGRESS: at 76.90% examples, 914117 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:29:07,539 : INFO : PROGRESS: at 80.38% examples, 908463 words/s, in_qsize 4, out_qsize 0
2017-05-10 14:29:08,545 : INFO : PROGRESS: at 83.55% examples, 899567 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:29:09,547 : INFO : PROGRESS: at 87.31% examples, 896817 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:10,552 : INFO : PROGRESS: at 91.09% examples, 894464 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:11,554 : INFO : PROGRESS: at 95.46% examples, 898365 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:12,558 : INFO : PROGRESS: at 98.78% examples, 892542 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:12,965 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-10 14:29:12,967 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-05-10 14:29:12,973 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-05-10 14:29:12,974 : INFO : training on 62500000 raw words (22656609 effective words) took 25.5s, 888977 effective words/s
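A quick qualitative check of the freshly trained vectors (assuming 'wonderful', 'good' and 'great' fall inside the 5,000-word vocabulary):

print(w2v_model.wv.most_similar('wonderful', topn=5))
print(w2v_model.wv.similarity('good', 'great'))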

In [95]:
X_train_words[0]
# w2v_model.wv[u'wonderful']


Out[95]:
['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 u'the',
 u'as',
 u'you',
 u'with',
 u'out',
 u'themselves',
 u'powerful',
 u'lets',
 u'loves',
 u'their',
 u'becomes',
 u'reaching',
 u'had',
 u'journalist',
 u'of',
 u'lot',
 u'from',
 u'anyone',
 u'to',
 u'have',
 u'after',
 u'out',
 u'atmosphere',
 u'never',
 u'more',
 u'room',
 u'and',
 u'it',
 u'so',
 u'heart',
 u'shows',
 u'to',
 u'years',
 u'of',
 u'every',
 u'never',
 u'going',
 u'and',
 u'help',
 u'moments',
 u'or',
 u'of',
 u'every',
 u'chest',
 u'visual',
 u'movie',
 u'except',
 u'her',
 u'was',
 u'several',
 u'of',
 u'enough',
 u'more',
 u'with',
 u'is',
 u'now',
 u'current',
 u'film',
 u'as',
 u'you',
 u'of',
 u'mine',
 u'potentially',
 u'unfortunately',
 u'of',
 u'you',
 u'than',
 u'him',
 u'that',
 u'with',
 u'out',
 u'themselves',
 u'her',
 u'get',
 u'for',
 u'was',
 u'camp',
 u'of',
 u'you',
 u'movie',
 u'sometimes',
 u'movie',
 u'that',
 u'with',
 u'scary',
 u'but',
 u'and',
 u'to',
 u'story',
 u'wonderful',
 u'that',
 u'in',
 u'seeing',
 u'in',
 u'character',
 u'to',
 u'of',
 u'70s',
 u'and',
 u'with',
 u'heart',
 u'had',
 u'shadows',
 u'they',
 u'of',
 u'here',
 u'that',
 u'with',
 u'her',
 u'serious',
 u'to',
 u'have',
 u'does',
 u'when',
 u'from',
 u'why',
 u'what',
 u'have',
 u'critics',
 u'they',
 u'is',
 u'you',
 u'that',
 u"isn't",
 u'one',
 u'will',
 u'very',
 u'to',
 u'as',
 u'itself',
 u'with',
 u'other',
 u'and',
 u'in',
 u'of',
 u'seen',
 u'over',
 u'and',
 u'for',
 u'anyone',
 u'of',
 u'and',
 u'br',
 u"show's",
 u'to',
 u'whether',
 u'from',
 u'than',
 u'out',
 u'themselves',
 u'history',
 u'he',
 u'name',
 u'half',
 u'some',
 u'br',
 u'of',
 u'and',
 u'odd',
 u'was',
 u'two',
 u'most',
 u'of',
 u'mean',
 u'for',
 u'1',
 u'any',
 u'an',
 u'boat',
 u'she',
 u'he',
 u'should',
 u'is',
 u'thought',
 u'and',
 u'but',
 u'of',
 u'script',
 u'you',
 u'not',
 u'while',
 u'history',
 u'he',
 u'heart',
 u'to',
 u'real',
 u'at',
 u'and',
 u'but',
 u'when',
 u'from',
 u'one',
 u'bit',
 u'then',
 u'have',
 u'two',
 u'of',
 u'script',
 u'their',
 u'with',
 u'her',
 u'nobody',
 u'most',
 u'that',
 u'with',
 u"wasn't",
 u'to',
 u'with',
 u'armed',
 u'acting',
 u'watch',
 u'an',
 u'for',
 u'with',
 u'and',
 u'film',
 u'want',
 u'an']

In [ ]:


In [147]:
w2v_model.wv['sandler']
# [index_word[w] for w in X_train[0]]
# w2v_model.wv.most_similar(positive=['bad', 'well'], negative=['good'])
# print(w2v_model.wv.similarity('dinosaur', 'spielberg'))
# print(w2v_model.wv.similarity('sandler', 'woman'))


Out[147]:
array([-0.12254356,  0.08464921, -0.97466284, -0.23125856, -0.64915526,
       -0.18388164,  0.76747668, -0.0233916 , -0.11822106,  0.12072534,
        0.01562079,  0.15966068,  0.09456155,  0.06940489,  1.25917947,
        0.53069818,  0.31309906,  0.27209523,  0.27739969, -0.20734745,
        0.34070483,  0.22484052,  0.17897399, -0.41470134, -0.22246778,
       -0.58976907,  0.71105635, -0.30407917, -0.02301157, -0.51524049,
        0.32956386,  0.10615972, -0.67365152,  0.58188295,  0.6263693 ,
       -0.64749116, -0.04960984,  0.82754505,  0.15768802,  0.09536082,
        0.38240585, -0.00774292,  0.04048406, -0.87557566, -0.65339088,
       -0.53546149,  0.2809363 , -0.3177368 , -0.44026917, -0.28955364,
       -0.18334401,  0.41002616, -0.13841389,  0.19244394,  0.38882449,
        0.3402794 , -0.19592963, -0.16570446,  0.23080724, -0.11118862,
        0.42556122,  0.54166591,  0.30829766, -0.59903324, -0.75490725,
        0.21917841, -0.28062344, -0.2228691 ,  0.72345096, -0.27047646,
        0.37278536,  0.19560704, -0.24535486, -0.24909811,  0.01014762,
        0.18995444, -0.83859807,  0.08452499, -0.34518746,  0.25360781,
       -0.70085961,  0.09347685,  0.449334  ,  0.20431519,  0.05476317,
       -0.0355494 , -0.17515995,  0.16061474, -0.37417161, -0.06960552,
       -0.16299857, -0.39852008,  0.35810471,  0.14529118, -0.26855153,
       -0.07384432, -0.29711565,  0.44796848,  0.05807808,  0.37246943], dtype=float32)

In [47]:
from tempfile import mkstemp

In [48]:
fs, temp_path = mkstemp('word2vec_model_may9')
w2v_model.save(temp_path)


2017-05-09 02:44:32,610 : INFO : saving Word2Vec object under /var/folders/ns/srq5jt196rndzc9kh00zm1gc0000gp/T/tmpdSFF6Pword2vec_model_may9, separately None
2017-05-09 02:44:32,612 : INFO : not storing attribute syn0norm
2017-05-09 02:44:32,613 : INFO : not storing attribute cum_table
2017-05-09 02:44:32,619 : INFO : saved /var/folders/ns/srq5jt196rndzc9kh00zm1gc0000gp/T/tmpdSFF6Pword2vec_model_may9

In [51]:
# Every section comes back empty: the analogy test set (capitals, currencies, family
# terms, ...) has almost no overlap with this 5,000-word movie-review vocabulary.
w2v_model.accuracy('questions-words.txt')


Out[51]:
[{'correct': [], 'incorrect': [], 'section': u'capital-common-countries'},
 {'correct': [], 'incorrect': [], 'section': u'capital-world'},
 {'correct': [], 'incorrect': [], 'section': u'currency'},
 {'correct': [], 'incorrect': [], 'section': u'city-in-state'},
 {'correct': [], 'incorrect': [], 'section': u'family'},
 {'correct': [], 'incorrect': [], 'section': u'gram1-adjective-to-adverb'},
 {'correct': [], 'incorrect': [], 'section': u'gram2-opposite'},
 {'correct': [], 'incorrect': [], 'section': u'gram3-comparative'},
 {'correct': [], 'incorrect': [], 'section': u'gram4-superlative'},
 {'correct': [], 'incorrect': [], 'section': u'gram5-present-participle'},
 {'correct': [], 'incorrect': [], 'section': u'gram6-nationality-adjective'},
 {'correct': [], 'incorrect': [], 'section': u'gram7-past-tense'},
 {'correct': [], 'incorrect': [], 'section': u'gram8-plural'},
 {'correct': [], 'incorrect': [], 'section': u'gram9-plural-verbs'},
 {'correct': [], 'incorrect': [], 'section': 'total'}]

In [70]:
fs, temp_path = mkstemp('word2vec_model_may9')
# This fails: save() wrote the model to the mkstemp path, not to the bare filename,
# and `w2v.model` is a typo for `w2v_model` -- see the corrected load after the traceback.
w2v.model = gensim.models.Word2Vec.load('word2vec_model_may9')


2017-05-10 11:22:33,773 : INFO : loading Word2Vec object from word2vec_model_may9
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-70-363e0eba1453> in <module>()
      1 fs, temp_path = mkstemp('word2vec_model_may9')
----> 2 w2v.model = gensim.models.Word2Vec.load('word2vec_model_may9')

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/models/word2vec.pyc in load(cls, *args, **kwargs)
   1408     @classmethod
   1409     def load(cls, *args, **kwargs):
-> 1410         model = super(Word2Vec, cls).load(*args, **kwargs)
   1411         # update older models
   1412         if hasattr(model, 'table'):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
    269         compress, subname = SaveLoad._adapt_by_suffix(fname)
    270 
--> 271         obj = unpickle(fname)
    272         obj._load_specials(fname, mmap, compress, subname)
    273         logger.info("loaded %s", fname)

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
    928 def unpickle(fname):
    929     """Load pickled object from `fname`"""
--> 930     with smart_open(fname, 'rb') as f:
    931         # Because of loading from S3 load can't be used (missing readline in smart_open)
    932         if sys.version_info > (3, 0):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
    138             # local files -- both read & write supported
    139             # compression, if any, is determined by the filename extension (.gz, .bz2)
--> 140             return file_smart_open(parsed_uri.uri_path, mode)
    141         elif parsed_uri.scheme in ("s3", "s3n", "s3u"):
    142             kwargs = {}

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in file_smart_open(fname, mode)
    642 
    643     """
--> 644     return compression_wrapper(open(fname, mode), fname, mode)
    645 
    646 

IOError: [Errno 2] No such file or directory: 'word2vec_model_may9'
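The fix is to load from the path the model was actually saved to, and to assign to w2v_model rather than the nonexistent w2v.model:

w2v_model = gensim.models.Word2Vec.load(temp_path)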

In [52]:
from gensim.corpora import WikiCorpus

In [53]:
import pandas as pd

In [55]:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [58]:
# Verify the number of reviews that were read (100,000 in total)
print (train["review"].size, test["review"].size, unlabeled_train["review"].size)


25000 25000 50000

In [59]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [60]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)
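A quick check of the helper on the first training review, with stop-word removal switched on:

print(review_to_wordlist(train["review"][0], remove_stopwords=True)[:10])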

In [150]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
# nltk.download()   

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.decode('utf-8').strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences

In [151]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: "." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.happierabroad.com"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
Parsing sentences from unlabeled set
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.archive.org/details/LovefromaStranger"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.loosechangeguide.com/LooseChangeGuide.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.msnbc.msn.com/id/4972055/site/newsweek/"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: ".." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.youtube.com/watch?v=a0KSqelmgN8"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://jake-weird.blogspot.com/2007/08/beneath.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [154]:
# len(sentences)
wv_model = gensim.models.Word2Vec(sentences, min_count=1)


2017-05-10 21:09:38,806 : INFO : collecting all words and their counts
2017-05-10 21:09:38,810 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-10 21:09:39,060 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-05-10 21:09:39,218 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2017-05-10 21:09:39,338 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-05-10 21:09:39,477 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-05-10 21:09:39,620 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-05-10 21:09:39,752 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-05-10 21:09:39,890 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-05-10 21:09:40,176 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-05-10 21:09:40,318 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-05-10 21:09:40,498 : INFO : PROGRESS: at sentence #100000, processed 2226967 words, keeping 50207 word types
2017-05-10 21:09:40,666 : INFO : PROGRESS: at sentence #110000, processed 2446581 words, keeping 52081 word types
2017-05-10 21:09:40,829 : INFO : PROGRESS: at sentence #120000, processed 2668776 words, keeping 54119 word types
2017-05-10 21:09:40,982 : INFO : PROGRESS: at sentence #130000, processed 2894304 words, keeping 55847 word types
2017-05-10 21:09:41,131 : INFO : PROGRESS: at sentence #140000, processed 3107006 words, keeping 57346 word types
2017-05-10 21:09:41,275 : INFO : PROGRESS: at sentence #150000, processed 3332628 words, keeping 59055 word types
2017-05-10 21:09:41,409 : INFO : PROGRESS: at sentence #160000, processed 3555316 words, keeping 60617 word types
2017-05-10 21:09:41,551 : INFO : PROGRESS: at sentence #170000, processed 3778656 words, keeping 62077 word types
2017-05-10 21:09:41,678 : INFO : PROGRESS: at sentence #180000, processed 3999237 words, keeping 63496 word types
2017-05-10 21:09:41,844 : INFO : PROGRESS: at sentence #190000, processed 4224450 words, keeping 64794 word types
2017-05-10 21:09:42,005 : INFO : PROGRESS: at sentence #200000, processed 4448604 words, keeping 66087 word types
2017-05-10 21:09:42,140 : INFO : PROGRESS: at sentence #210000, processed 4669968 words, keeping 67390 word types
2017-05-10 21:09:42,303 : INFO : PROGRESS: at sentence #220000, processed 4894969 words, keeping 68697 word types
2017-05-10 21:09:42,526 : INFO : PROGRESS: at sentence #230000, processed 5117546 words, keeping 69958 word types
2017-05-10 21:09:42,819 : INFO : PROGRESS: at sentence #240000, processed 5345051 words, keeping 71167 word types
2017-05-10 21:09:43,038 : INFO : PROGRESS: at sentence #250000, processed 5559166 words, keeping 72351 word types
2017-05-10 21:09:43,170 : INFO : PROGRESS: at sentence #260000, processed 5779147 words, keeping 73478 word types
2017-05-10 21:09:43,329 : INFO : PROGRESS: at sentence #270000, processed 6000436 words, keeping 74767 word types
2017-05-10 21:09:43,491 : INFO : PROGRESS: at sentence #280000, processed 6226315 words, keeping 76369 word types
2017-05-10 21:09:43,661 : INFO : PROGRESS: at sentence #290000, processed 6449475 words, keeping 77839 word types
2017-05-10 21:09:43,853 : INFO : PROGRESS: at sentence #300000, processed 6674078 words, keeping 79171 word types
2017-05-10 21:09:44,046 : INFO : PROGRESS: at sentence #310000, processed 6899392 words, keeping 80480 word types
2017-05-10 21:09:44,217 : INFO : PROGRESS: at sentence #320000, processed 7124279 words, keeping 81808 word types
2017-05-10 21:09:44,403 : INFO : PROGRESS: at sentence #330000, processed 7346022 words, keeping 83030 word types
2017-05-10 21:09:44,575 : INFO : PROGRESS: at sentence #340000, processed 7575534 words, keeping 84280 word types
2017-05-10 21:09:44,775 : INFO : PROGRESS: at sentence #350000, processed 7798804 words, keeping 85425 word types
2017-05-10 21:09:45,012 : INFO : PROGRESS: at sentence #360000, processed 8019467 words, keeping 86596 word types
2017-05-10 21:09:45,224 : INFO : PROGRESS: at sentence #370000, processed 8246659 words, keeping 87708 word types
2017-05-10 21:09:45,417 : INFO : PROGRESS: at sentence #380000, processed 8471806 words, keeping 88878 word types
2017-05-10 21:09:45,637 : INFO : PROGRESS: at sentence #390000, processed 8701556 words, keeping 89907 word types
2017-05-10 21:09:45,853 : INFO : PROGRESS: at sentence #400000, processed 8924505 words, keeping 90916 word types
2017-05-10 21:09:46,073 : INFO : PROGRESS: at sentence #410000, processed 9145855 words, keeping 91880 word types
2017-05-10 21:09:46,267 : INFO : PROGRESS: at sentence #420000, processed 9366935 words, keeping 92912 word types
2017-05-10 21:09:46,413 : INFO : PROGRESS: at sentence #430000, processed 9594472 words, keeping 93932 word types
2017-05-10 21:09:46,591 : INFO : PROGRESS: at sentence #440000, processed 9821225 words, keeping 94906 word types
2017-05-10 21:09:46,786 : INFO : PROGRESS: at sentence #450000, processed 10044987 words, keeping 96036 word types
2017-05-10 21:09:47,018 : INFO : PROGRESS: at sentence #460000, processed 10277747 words, keeping 97088 word types
2017-05-10 21:09:47,205 : INFO : PROGRESS: at sentence #470000, processed 10505672 words, keeping 97933 word types
2017-05-10 21:09:47,406 : INFO : PROGRESS: at sentence #480000, processed 10726056 words, keeping 98862 word types
2017-05-10 21:09:47,584 : INFO : PROGRESS: at sentence #490000, processed 10952800 words, keeping 99871 word types
2017-05-10 21:09:47,746 : INFO : PROGRESS: at sentence #500000, processed 11174456 words, keeping 100765 word types
2017-05-10 21:09:47,974 : INFO : PROGRESS: at sentence #510000, processed 11399731 words, keeping 101699 word types
2017-05-10 21:09:48,178 : INFO : PROGRESS: at sentence #520000, processed 11623082 words, keeping 102598 word types
2017-05-10 21:09:48,432 : INFO : PROGRESS: at sentence #530000, processed 11847480 words, keeping 103400 word types
2017-05-10 21:09:48,609 : INFO : PROGRESS: at sentence #540000, processed 12072095 words, keeping 104265 word types
2017-05-10 21:09:48,787 : INFO : PROGRESS: at sentence #550000, processed 12297646 words, keeping 105133 word types
2017-05-10 21:09:48,940 : INFO : PROGRESS: at sentence #560000, processed 12518936 words, keeping 105997 word types
2017-05-10 21:09:49,108 : INFO : PROGRESS: at sentence #570000, processed 12748083 words, keeping 106787 word types
2017-05-10 21:09:49,273 : INFO : PROGRESS: at sentence #580000, processed 12969579 words, keeping 107665 word types
2017-05-10 21:09:49,431 : INFO : PROGRESS: at sentence #590000, processed 13195104 words, keeping 108501 word types
2017-05-10 21:09:49,587 : INFO : PROGRESS: at sentence #600000, processed 13417302 words, keeping 109218 word types
2017-05-10 21:09:49,738 : INFO : PROGRESS: at sentence #610000, processed 13638325 words, keeping 110092 word types
2017-05-10 21:09:49,998 : INFO : PROGRESS: at sentence #620000, processed 13864650 words, keeping 110837 word types
2017-05-10 21:09:50,237 : INFO : PROGRESS: at sentence #630000, processed 14088936 words, keeping 111610 word types
2017-05-10 21:09:50,384 : INFO : PROGRESS: at sentence #640000, processed 14309719 words, keeping 112416 word types
2017-05-10 21:09:50,559 : INFO : PROGRESS: at sentence #650000, processed 14535475 words, keeping 113196 word types
2017-05-10 21:09:50,766 : INFO : PROGRESS: at sentence #660000, processed 14758265 words, keeping 113945 word types
2017-05-10 21:09:50,997 : INFO : PROGRESS: at sentence #670000, processed 14981658 words, keeping 114643 word types
2017-05-10 21:09:51,230 : INFO : PROGRESS: at sentence #680000, processed 15206490 words, keeping 115354 word types
2017-05-10 21:09:51,409 : INFO : PROGRESS: at sentence #690000, processed 15428683 words, keeping 116131 word types
2017-05-10 21:09:51,585 : INFO : PROGRESS: at sentence #700000, processed 15657389 words, keeping 116943 word types
2017-05-10 21:09:51,773 : INFO : PROGRESS: at sentence #710000, processed 15880378 words, keeping 117596 word types
2017-05-10 21:09:51,931 : INFO : PROGRESS: at sentence #720000, processed 16105665 words, keeping 118221 word types
2017-05-10 21:09:52,181 : INFO : PROGRESS: at sentence #730000, processed 16332046 words, keeping 118954 word types
2017-05-10 21:09:52,370 : INFO : PROGRESS: at sentence #740000, processed 16553079 words, keeping 119668 word types
2017-05-10 21:09:52,531 : INFO : PROGRESS: at sentence #750000, processed 16771406 words, keeping 120295 word types
2017-05-10 21:09:52,753 : INFO : PROGRESS: at sentence #760000, processed 16990810 words, keeping 120930 word types
2017-05-10 21:09:52,971 : INFO : PROGRESS: at sentence #770000, processed 17217947 words, keeping 121703 word types
2017-05-10 21:09:53,132 : INFO : PROGRESS: at sentence #780000, processed 17448093 words, keeping 122402 word types
2017-05-10 21:09:53,288 : INFO : PROGRESS: at sentence #790000, processed 17675169 words, keeping 123066 word types
2017-05-10 21:09:53,378 : INFO : collected 123504 word types from a corpus of 17798270 raw words and 795538 sentences
2017-05-10 21:09:53,380 : INFO : Loading a fresh vocabulary
2017-05-10 21:10:03,284 : INFO : min_count=1 retains 123504 unique words (100% of original 123504, drops 0)
2017-05-10 21:10:03,288 : INFO : min_count=1 leaves 17798270 word corpus (100% of original 17798270, drops 0)
2017-05-10 21:10:03,970 : INFO : deleting the raw counts dictionary of 123504 items
2017-05-10 21:10:03,981 : INFO : sample=0.001 downsamples 48 most-common words
2017-05-10 21:10:03,983 : INFO : downsampling leaves estimated 13372562 word corpus (75.1% of prior 17798270)
2017-05-10 21:10:03,992 : INFO : estimated required memory for 123504 words and 100 dimensions: 160555200 bytes
2017-05-10 21:10:04,659 : INFO : resetting layer weights
2017-05-10 21:10:06,884 : INFO : training model with 3 workers on 123504 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-05-10 21:10:07,929 : INFO : PROGRESS: at 0.94% examples, 621682 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:08,937 : INFO : PROGRESS: at 2.11% examples, 695229 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:09,931 : INFO : PROGRESS: at 2.93% examples, 646438 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:10,932 : INFO : PROGRESS: at 4.09% examples, 675604 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:11,947 : INFO : PROGRESS: at 5.30% examples, 701649 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:10:12,945 : INFO : PROGRESS: at 6.44% examples, 708646 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:13,952 : INFO : PROGRESS: at 7.58% examples, 716033 words/s, in_qsize 6, out_qsize 1
2017-05-10 21:10:14,963 : INFO : PROGRESS: at 8.76% examples, 725204 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:10:15,960 : INFO : PROGRESS: at 9.84% examples, 725182 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:16,969 : INFO : PROGRESS: at 11.00% examples, 729278 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:17,978 : INFO : PROGRESS: at 12.06% examples, 728379 words/s, in_qsize 5, out_qsize 2
2017-05-10 21:10:18,977 : INFO : PROGRESS: at 13.18% examples, 730143 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:19,979 : INFO : PROGRESS: at 14.19% examples, 725564 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:20,983 : INFO : PROGRESS: at 15.33% examples, 728265 words/s, in_qsize 6, out_qsize 1
2017-05-10 21:10:21,993 : INFO : PROGRESS: at 16.41% examples, 727159 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:10:23,000 : INFO : PROGRESS: at 17.47% examples, 725935 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:24,017 : INFO : PROGRESS: at 18.65% examples, 728774 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:25,027 : INFO : PROGRESS: at 19.82% examples, 731627 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:26,040 : INFO : PROGRESS: at 21.02% examples, 735203 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:27,038 : INFO : PROGRESS: at 22.23% examples, 738270 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:10:28,038 : INFO : PROGRESS: at 23.39% examples, 739543 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:29,056 : INFO : PROGRESS: at 24.52% examples, 739402 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:30,079 : INFO : PROGRESS: at 25.55% examples, 736496 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:31,094 : INFO : PROGRESS: at 26.71% examples, 737179 words/s, in_qsize 3, out_qsize 0
2017-05-10 21:10:32,112 : INFO : PROGRESS: at 27.86% examples, 738277 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:10:33,106 : INFO : PROGRESS: at 28.99% examples, 738965 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:34,112 : INFO : PROGRESS: at 30.08% examples, 738670 words/s, in_qsize 4, out_qsize 1
2017-05-10 21:10:35,113 : INFO : PROGRESS: at 31.17% examples, 738202 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:36,117 : INFO : PROGRESS: at 32.23% examples, 737496 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:37,136 : INFO : PROGRESS: at 33.39% examples, 738902 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:10:38,123 : INFO : PROGRESS: at 34.54% examples, 739952 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:10:39,123 : INFO : PROGRESS: at 35.70% examples, 740921 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:40,124 : INFO : PROGRESS: at 36.75% examples, 739566 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:41,168 : INFO : PROGRESS: at 37.28% examples, 727518 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:42,177 : INFO : PROGRESS: at 37.84% examples, 717330 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:43,177 : INFO : PROGRESS: at 38.71% examples, 713667 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:44,189 : INFO : PROGRESS: at 39.65% examples, 711206 words/s, in_qsize 6, out_qsize 1
2017-05-10 21:10:45,194 : INFO : PROGRESS: at 40.64% examples, 710028 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:10:46,198 : INFO : PROGRESS: at 41.71% examples, 709811 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:47,211 : INFO : PROGRESS: at 42.81% examples, 710204 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:48,202 : INFO : PROGRESS: at 43.87% examples, 709887 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:49,222 : INFO : PROGRESS: at 44.95% examples, 710073 words/s, in_qsize 4, out_qsize 1
2017-05-10 21:10:50,213 : INFO : PROGRESS: at 45.96% examples, 709311 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:51,214 : INFO : PROGRESS: at 47.08% examples, 709832 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:52,218 : INFO : PROGRESS: at 48.20% examples, 710640 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:53,228 : INFO : PROGRESS: at 49.27% examples, 710678 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:54,233 : INFO : PROGRESS: at 50.31% examples, 710326 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:10:55,237 : INFO : PROGRESS: at 51.24% examples, 708606 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:56,261 : INFO : PROGRESS: at 51.86% examples, 702622 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:10:57,246 : INFO : PROGRESS: at 52.66% examples, 699245 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:58,247 : INFO : PROGRESS: at 53.54% examples, 697157 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:10:59,256 : INFO : PROGRESS: at 54.44% examples, 695353 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:11:00,269 : INFO : PROGRESS: at 55.42% examples, 694370 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:01,278 : INFO : PROGRESS: at 56.35% examples, 692941 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:02,292 : INFO : PROGRESS: at 57.36% examples, 692556 words/s, in_qsize 6, out_qsize 1
2017-05-10 21:11:03,309 : INFO : PROGRESS: at 58.33% examples, 691519 words/s, in_qsize 4, out_qsize 1
2017-05-10 21:11:04,322 : INFO : PROGRESS: at 59.41% examples, 691989 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:11:05,316 : INFO : PROGRESS: at 60.36% examples, 690988 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:06,336 : INFO : PROGRESS: at 61.48% examples, 691864 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:11:07,328 : INFO : PROGRESS: at 62.62% examples, 692824 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:08,333 : INFO : PROGRESS: at 63.66% examples, 692722 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:09,340 : INFO : PROGRESS: at 64.70% examples, 692597 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:10,354 : INFO : PROGRESS: at 65.81% examples, 693227 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:11,357 : INFO : PROGRESS: at 66.98% examples, 694419 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:12,358 : INFO : PROGRESS: at 68.15% examples, 695840 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:13,358 : INFO : PROGRESS: at 69.23% examples, 696315 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:14,367 : INFO : PROGRESS: at 70.28% examples, 696370 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:15,377 : INFO : PROGRESS: at 71.28% examples, 695829 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:16,385 : INFO : PROGRESS: at 72.43% examples, 696964 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:17,388 : INFO : PROGRESS: at 73.63% examples, 698438 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:11:18,399 : INFO : PROGRESS: at 74.80% examples, 699585 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:19,402 : INFO : PROGRESS: at 75.96% examples, 700577 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:20,408 : INFO : PROGRESS: at 77.12% examples, 701493 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:21,408 : INFO : PROGRESS: at 78.25% examples, 702337 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:22,418 : INFO : PROGRESS: at 79.41% examples, 703182 words/s, in_qsize 5, out_qsize 1
2017-05-10 21:11:23,429 : INFO : PROGRESS: at 80.56% examples, 704069 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:11:24,422 : INFO : PROGRESS: at 81.71% examples, 704836 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:25,435 : INFO : PROGRESS: at 82.90% examples, 705835 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:26,433 : INFO : PROGRESS: at 84.06% examples, 706547 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:27,443 : INFO : PROGRESS: at 85.23% examples, 707357 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:28,446 : INFO : PROGRESS: at 86.42% examples, 708307 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:29,459 : INFO : PROGRESS: at 87.60% examples, 709304 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:30,466 : INFO : PROGRESS: at 88.77% examples, 710111 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:31,475 : INFO : PROGRESS: at 89.92% examples, 710784 words/s, in_qsize 6, out_qsize 0
2017-05-10 21:11:32,479 : INFO : PROGRESS: at 91.09% examples, 711543 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:33,499 : INFO : PROGRESS: at 92.24% examples, 712191 words/s, in_qsize 6, out_qsize 1
2017-05-10 21:11:34,502 : INFO : PROGRESS: at 93.40% examples, 712934 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:35,518 : INFO : PROGRESS: at 94.59% examples, 713731 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:36,524 : INFO : PROGRESS: at 95.74% examples, 714271 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:37,537 : INFO : PROGRESS: at 96.89% examples, 714809 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:38,544 : INFO : PROGRESS: at 98.05% examples, 715453 words/s, in_qsize 4, out_qsize 0
2017-05-10 21:11:39,548 : INFO : PROGRESS: at 99.23% examples, 716118 words/s, in_qsize 5, out_qsize 0
2017-05-10 21:11:40,190 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-10 21:11:40,202 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-05-10 21:11:40,207 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-05-10 21:11:40,208 : INFO : training on 88991350 raw words (66861654 effective words) took 93.3s, 716674 effective words/s

In [169]:
print(wv_model.wv.most_similar(positive=['keanu', 'comedy'], negative=['sandler']))


[(u'vehicle', 0.49521416425704956), (u'dour', 0.4686088562011719), (u'opera', 0.44382157921791077), (u'potboiler', 0.4345802962779999), (u'kops', 0.43210089206695557), (u'programming', 0.4305029511451721), (u'melodrama', 0.425048291683197), (u'drama', 0.42215511202812195), (u'sitcom', 0.4108881950378418), (u'morros', 0.40979620814323425)]

Options for the experiment

Vectors

  • Use pre-trained vectors (word2vec, GloVe)
  • Train vectors on the IMDB data without removing stop words
  • Train vectors on other corpora

Intrinsic evaluation

  • Use the Google analogy questions
  • Create domain-specific questions (movies, adjectives, etc., characters!!, directors and actresses, other analogies, genre and actor, look at distances --> use the web interface to pick the questions) <--------
  • Follow the approach of the GloVe paper (Pennington et al. 2014)
  • Use the 20 Newsgroups dataset from scikit-learn

Extrinsic evaluation

In [ ]: