In [3]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.datasets import imdb
from keras import backend as K

from theano import function


Using Theano backend.

In [6]:
print("Loading data...")
max_features = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words = max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')


Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences

In [8]:
print("Pad sequences(samples x time)")
maxlen = 500
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


Pad sequences(samples x time)
X_train shape: (25000, 500)
X_test shape: (25000, 500)

In [9]:
print("Build model..")
model = Sequential()
model.add(Embedding(max_features, 128, input_length = maxlen))


Build model..

In [10]:
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [13]:
model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics=["accuracy"])
print("Train..")
batch_size = 30
score = model.fit(X_train, y_train, batch_size = batch_size,
          nb_epoch = 4, validation_data = (X_test, y_test))


Train..
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 1224s - loss: 0.3778 - acc: 0.8353 - val_loss: 0.3857 - val_acc: 0.8345
Epoch 2/4
25000/25000 [==============================] - 1157s - loss: 0.3204 - acc: 0.8697 - val_loss: 0.5001 - val_acc: 0.7509
Epoch 3/4
25000/25000 [==============================] - 2144s - loss: 0.2639 - acc: 0.8935 - val_loss: 0.3063 - val_acc: 0.8739
Epoch 4/4
25000/25000 [==============================] - 2257s - loss: 0.2117 - acc: 0.9189 - val_loss: 0.3256 - val_acc: 0.8778

In [16]:
import matplotlib.pyplot as plt

plt.plot(score.history['acc'])
plt.plot(score.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(score.history['loss'])
plt.plot(score.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()



In [17]:
X_train[0]


Out[17]:
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    1,   14,   22,   16,
         43,  530,  973, 1622, 1385,   65,  458, 4468,   66, 3941,    4,
        173,   36,  256,    5,   25,  100,   43,  838,  112,   50,  670,
          2,    9,   35,  480,  284,    5,  150,    4,  172,  112,  167,
          2,  336,  385,   39,    4,  172, 4536, 1111,   17,  546,   38,
         13,  447,    4,  192,   50,   16,    6,  147, 2025,   19,   14,
         22,    4, 1920, 4613,  469,    4,   22,   71,   87,   12,   16,
         43,  530,   38,   76,   15,   13, 1247,    4,   22,   17,  515,
         17,   12,   16,  626,   18,    2,    5,   62,  386,   12,    8,
        316,    8,  106,    5,    4, 2223,    2,   16,  480,   66, 3785,
         33,    4,  130,   12,   16,   38,  619,    5,   25,  124,   51,
         36,  135,   48,   25, 1415,   33,    6,   22,   12,  215,   28,
         77,   52,    5,   14,  407,   16,   82,    2,    8,    4,  107,
        117,    2,   15,  256,    4,    2,    7, 3766,    5,  723,   36,
         71,   43,  530,  476,   26,  400,  317,   46,    7,    4,    2,
       1029,   13,  104,   88,    4,  381,   15,  297,   98,   32, 2071,
         56,   26,  141,    6,  194,    2,   18,    4,  226,   22,   21,
        134,  476,   26,  480,    5,  144,   30,    2,   18,   51,   36,
         28,  224,   92,   25,  104,    4,  226,   65,   16,   38, 1334,
         88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,   15,
         16,    2,   19,  178,   32], dtype=int32)

In [18]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}


Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json

In [25]:
type(index_word.keys()[0])


Out[25]:
int

In [32]:
index_word[0] = '0'
' '.join(index_word[w] for w in X_train[0])


Out[32]:
u"0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s and with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over and for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought and but of script you not while history he heart to real at and but when from one bit then have two of script their with her nobody most that with wasn't to with armed acting watch an for with and film want an"

In [91]:
X_train_words = []
for sentence in X_train:
    X_train_words += [[index_word[w] for w in sentence if w != "0"]]

In [67]:
import gensim
import logging

In [92]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
w2v_model = gensim.models.Word2Vec(X_train_words, min_count=1)


2017-05-10 14:28:45,839 : INFO : collecting all words and their counts
2017-05-10 14:28:45,846 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-10 14:28:46,440 : INFO : PROGRESS: at sentence #10000, processed 5000000 words, keeping 4998 word types
2017-05-10 14:28:46,979 : INFO : PROGRESS: at sentence #20000, processed 10000000 words, keeping 4998 word types
2017-05-10 14:28:47,230 : INFO : collected 4998 word types from a corpus of 12500000 raw words and 25000 sentences
2017-05-10 14:28:47,231 : INFO : Loading a fresh vocabulary
2017-05-10 14:28:47,328 : INFO : min_count=1 retains 4998 unique words (100% of original 4998, drops 0)
2017-05-10 14:28:47,329 : INFO : min_count=1 leaves 12500000 word corpus (100% of original 12500000, drops 0)
2017-05-10 14:28:47,352 : INFO : deleting the raw counts dictionary of 4998 items
2017-05-10 14:28:47,353 : INFO : sample=0.001 downsamples 21 most-common words
2017-05-10 14:28:47,355 : INFO : downsampling leaves estimated 4531025 word corpus (36.2% of prior 12500000)
2017-05-10 14:28:47,357 : INFO : estimated required memory for 4998 words and 100 dimensions: 6497400 bytes
2017-05-10 14:28:47,383 : INFO : resetting layer weights
2017-05-10 14:28:47,482 : INFO : training model with 3 workers on 4998 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-05-10 14:28:48,489 : INFO : PROGRESS: at 3.94% examples, 903660 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:49,489 : INFO : PROGRESS: at 8.10% examples, 917340 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:28:50,491 : INFO : PROGRESS: at 12.38% examples, 934189 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:51,495 : INFO : PROGRESS: at 16.99% examples, 958488 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:52,504 : INFO : PROGRESS: at 21.39% examples, 967189 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:53,505 : INFO : PROGRESS: at 25.12% examples, 947237 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:54,509 : INFO : PROGRESS: at 28.21% examples, 910461 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:28:55,511 : INFO : PROGRESS: at 31.41% examples, 886526 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:28:56,513 : INFO : PROGRESS: at 35.98% examples, 902297 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:28:57,515 : INFO : PROGRESS: at 40.88% examples, 923592 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:58,520 : INFO : PROGRESS: at 45.39% examples, 932692 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:28:59,522 : INFO : PROGRESS: at 50.13% examples, 942927 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:00,526 : INFO : PROGRESS: at 54.67% examples, 949447 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:29:01,528 : INFO : PROGRESS: at 59.01% examples, 951779 words/s, in_qsize 4, out_qsize 0
2017-05-10 14:29:02,534 : INFO : PROGRESS: at 62.69% examples, 944540 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:03,535 : INFO : PROGRESS: at 66.53% examples, 939537 words/s, in_qsize 6, out_qsize 1
2017-05-10 14:29:04,536 : INFO : PROGRESS: at 70.02% examples, 929941 words/s, in_qsize 5, out_qsize 0
2017-05-10 14:29:05,537 : INFO : PROGRESS: at 73.34% examples, 920449 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:06,539 : INFO : PROGRESS: at 76.90% examples, 914117 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:29:07,539 : INFO : PROGRESS: at 80.38% examples, 908463 words/s, in_qsize 4, out_qsize 0
2017-05-10 14:29:08,545 : INFO : PROGRESS: at 83.55% examples, 899567 words/s, in_qsize 5, out_qsize 1
2017-05-10 14:29:09,547 : INFO : PROGRESS: at 87.31% examples, 896817 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:10,552 : INFO : PROGRESS: at 91.09% examples, 894464 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:11,554 : INFO : PROGRESS: at 95.46% examples, 898365 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:12,558 : INFO : PROGRESS: at 98.78% examples, 892542 words/s, in_qsize 6, out_qsize 0
2017-05-10 14:29:12,965 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-10 14:29:12,967 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-05-10 14:29:12,973 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-05-10 14:29:12,974 : INFO : training on 62500000 raw words (22656609 effective words) took 25.5s, 888977 effective words/s

In [95]:
X_train_words[0]
# w2v_model.wv[u'wonderful']


Out[95]:
['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 u'the',
 u'as',
 u'you',
 u'with',
 u'out',
 u'themselves',
 u'powerful',
 u'lets',
 u'loves',
 u'their',
 u'becomes',
 u'reaching',
 u'had',
 u'journalist',
 u'of',
 u'lot',
 u'from',
 u'anyone',
 u'to',
 u'have',
 u'after',
 u'out',
 u'atmosphere',
 u'never',
 u'more',
 u'room',
 u'and',
 u'it',
 u'so',
 u'heart',
 u'shows',
 u'to',
 u'years',
 u'of',
 u'every',
 u'never',
 u'going',
 u'and',
 u'help',
 u'moments',
 u'or',
 u'of',
 u'every',
 u'chest',
 u'visual',
 u'movie',
 u'except',
 u'her',
 u'was',
 u'several',
 u'of',
 u'enough',
 u'more',
 u'with',
 u'is',
 u'now',
 u'current',
 u'film',
 u'as',
 u'you',
 u'of',
 u'mine',
 u'potentially',
 u'unfortunately',
 u'of',
 u'you',
 u'than',
 u'him',
 u'that',
 u'with',
 u'out',
 u'themselves',
 u'her',
 u'get',
 u'for',
 u'was',
 u'camp',
 u'of',
 u'you',
 u'movie',
 u'sometimes',
 u'movie',
 u'that',
 u'with',
 u'scary',
 u'but',
 u'and',
 u'to',
 u'story',
 u'wonderful',
 u'that',
 u'in',
 u'seeing',
 u'in',
 u'character',
 u'to',
 u'of',
 u'70s',
 u'and',
 u'with',
 u'heart',
 u'had',
 u'shadows',
 u'they',
 u'of',
 u'here',
 u'that',
 u'with',
 u'her',
 u'serious',
 u'to',
 u'have',
 u'does',
 u'when',
 u'from',
 u'why',
 u'what',
 u'have',
 u'critics',
 u'they',
 u'is',
 u'you',
 u'that',
 u"isn't",
 u'one',
 u'will',
 u'very',
 u'to',
 u'as',
 u'itself',
 u'with',
 u'other',
 u'and',
 u'in',
 u'of',
 u'seen',
 u'over',
 u'and',
 u'for',
 u'anyone',
 u'of',
 u'and',
 u'br',
 u"show's",
 u'to',
 u'whether',
 u'from',
 u'than',
 u'out',
 u'themselves',
 u'history',
 u'he',
 u'name',
 u'half',
 u'some',
 u'br',
 u'of',
 u'and',
 u'odd',
 u'was',
 u'two',
 u'most',
 u'of',
 u'mean',
 u'for',
 u'1',
 u'any',
 u'an',
 u'boat',
 u'she',
 u'he',
 u'should',
 u'is',
 u'thought',
 u'and',
 u'but',
 u'of',
 u'script',
 u'you',
 u'not',
 u'while',
 u'history',
 u'he',
 u'heart',
 u'to',
 u'real',
 u'at',
 u'and',
 u'but',
 u'when',
 u'from',
 u'one',
 u'bit',
 u'then',
 u'have',
 u'two',
 u'of',
 u'script',
 u'their',
 u'with',
 u'her',
 u'nobody',
 u'most',
 u'that',
 u'with',
 u"wasn't",
 u'to',
 u'with',
 u'armed',
 u'acting',
 u'watch',
 u'an',
 u'for',
 u'with',
 u'and',
 u'film',
 u'want',
 u'an']

In [ ]:


In [107]:
# w2v_model.wv['lucas']
# [index_word[w] for w in X_train[0]]
# w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
w2v_model.wv.similarity('good', 'spielberg')
w2v_model.wv.similarity('good', 'tarantino')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-107-bf103bc6b568> in <module>()
      3 # w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
      4 w2v_model.wv.similarity('good', 'spielberg')
----> 5 w2v_model.wv.similarity('good', 'tarantino')

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/models/keyedvectors.pyc in similarity(self, w1, w2)
    594 
    595         """
--> 596         return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
    597 
    598     def n_similarity(self, ws1, ws2):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/models/keyedvectors.pyc in __getitem__(self, words)
    574         if isinstance(words, string_types):
    575             # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
--> 576             return self.word_vec(words)
    577 
    578         return vstack([self.word_vec(word) for word in words])

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/models/keyedvectors.pyc in word_vec(self, word, use_norm)
    273                 return self.syn0[self.vocab[word].index]
    274         else:
--> 275             raise KeyError("word '%s' not in vocabulary" % word)
    276 
    277     def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):

KeyError: "word 'tarantino' not in vocabulary"

In [47]:
from tempfile import mkstemp

In [48]:
fs, temp_path = mkstemp('word2vec_model_may9')
w2v_model.save(temp_path)


2017-05-09 02:44:32,610 : INFO : saving Word2Vec object under /var/folders/ns/srq5jt196rndzc9kh00zm1gc0000gp/T/tmpdSFF6Pword2vec_model_may9, separately None
2017-05-09 02:44:32,612 : INFO : not storing attribute syn0norm
2017-05-09 02:44:32,613 : INFO : not storing attribute cum_table
2017-05-09 02:44:32,619 : INFO : saved /var/folders/ns/srq5jt196rndzc9kh00zm1gc0000gp/T/tmpdSFF6Pword2vec_model_may9

In [51]:
w2v_model.accuracy('questions-words.txt')


Out[51]:
[{'correct': [], 'incorrect': [], 'section': u'capital-common-countries'},
 {'correct': [], 'incorrect': [], 'section': u'capital-world'},
 {'correct': [], 'incorrect': [], 'section': u'currency'},
 {'correct': [], 'incorrect': [], 'section': u'city-in-state'},
 {'correct': [], 'incorrect': [], 'section': u'family'},
 {'correct': [], 'incorrect': [], 'section': u'gram1-adjective-to-adverb'},
 {'correct': [], 'incorrect': [], 'section': u'gram2-opposite'},
 {'correct': [], 'incorrect': [], 'section': u'gram3-comparative'},
 {'correct': [], 'incorrect': [], 'section': u'gram4-superlative'},
 {'correct': [], 'incorrect': [], 'section': u'gram5-present-participle'},
 {'correct': [], 'incorrect': [], 'section': u'gram6-nationality-adjective'},
 {'correct': [], 'incorrect': [], 'section': u'gram7-past-tense'},
 {'correct': [], 'incorrect': [], 'section': u'gram8-plural'},
 {'correct': [], 'incorrect': [], 'section': u'gram9-plural-verbs'},
 {'correct': [], 'incorrect': [], 'section': 'total'}]

In [70]:
fs, temp_path = mkstemp('word2vec_model_may9')
w2v.model = gensim.models.Word2Vec.load('word2vec_model_may9')


2017-05-10 11:22:33,773 : INFO : loading Word2Vec object from word2vec_model_may9
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-70-363e0eba1453> in <module>()
      1 fs, temp_path = mkstemp('word2vec_model_may9')
----> 2 w2v.model = gensim.models.Word2Vec.load('word2vec_model_may9')

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/models/word2vec.pyc in load(cls, *args, **kwargs)
   1408     @classmethod
   1409     def load(cls, *args, **kwargs):
-> 1410         model = super(Word2Vec, cls).load(*args, **kwargs)
   1411         # update older models
   1412         if hasattr(model, 'table'):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/utils.pyc in load(cls, fname, mmap)
    269         compress, subname = SaveLoad._adapt_by_suffix(fname)
    270 
--> 271         obj = unpickle(fname)
    272         obj._load_specials(fname, mmap, compress, subname)
    273         logger.info("loaded %s", fname)

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/gensim/utils.pyc in unpickle(fname)
    928 def unpickle(fname):
    929     """Load pickled object from `fname`"""
--> 930     with smart_open(fname, 'rb') as f:
    931         # Because of loading from S3 load can't be used (missing readline in smart_open)
    932         if sys.version_info > (3, 0):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
    138             # local files -- both read & write supported
    139             # compression, if any, is determined by the filename extension (.gz, .bz2)
--> 140             return file_smart_open(parsed_uri.uri_path, mode)
    141         elif parsed_uri.scheme in ("s3", "s3n", "s3u"):
    142             kwargs = {}

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/smart_open/smart_open_lib.pyc in file_smart_open(fname, mode)
    642 
    643     """
--> 644     return compression_wrapper(open(fname, mode), fname, mode)
    645 
    646 

IOError: [Errno 2] No such file or directory: 'word2vec_model_may9'

In [52]:
from gensim.corpora import WikiCorpus

In [53]:
import pandas as pd

In [55]:
train = pd.read_csv( "labeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )

In [58]:
# Verify the number of reviews that were read (100,000 in total)
print (train["review"].size, test["review"].size, unlabeled_train["review"].size)


25000 25000 50000

In [59]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [60]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [61]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
nltk.download()   

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences


removing collection member with no package: panlex_lite
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
removing collection member with no package: panlex_lite
removing collection member with no package: panlex_lite

In [64]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html5lib"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 162 of the file /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py. To get rid of this warning, change code that looks like this:

 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-64-c0833954352c> in <module>()
      3 print("Parsing sentences from training set")
      4 for review in train["review"]:
----> 5     sentences += review_to_sentences(review, tokenizer)
      6 
      7 print("Parsing sentences from unlabeled set")

<ipython-input-61-ec4923480682> in review_to_sentences(review, tokenizer, remove_stopwords)
     12     #
     13     # 1. Use the NLTK tokenizer to split the paragraph into sentences
---> 14     raw_sentences = tokenizer.tokenize(review.strip())
     15     #
     16     # 2. Loop over each sentence

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in tokenize(self, text, realign_boundaries)
   1235         Given a text, returns a list of the sentences in that text.
   1236         """
-> 1237         return list(self.sentences_from_text(text, realign_boundaries))
   1238 
   1239     def debug_decisions(self, text):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in sentences_from_text(self, text, realign_boundaries)
   1283         follows the period.
   1284         """
-> 1285         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1286 
   1287     def _slices_from_text(self, text):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in span_tokenize(self, text, realign_boundaries)
   1274         if realign_boundaries:
   1275             slices = self._realign_boundaries(text, slices)
-> 1276         return [(sl.start, sl.stop) for sl in slices]
   1277 
   1278     def sentences_from_text(self, text, realign_boundaries=True):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _realign_boundaries(self, text, slices)
   1314         """
   1315         realign = 0
-> 1316         for sl1, sl2 in _pair_iter(slices):
   1317             sl1 = slice(sl1.start + realign, sl1.stop)
   1318             if not sl2:

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _pair_iter(it)
    309     it = iter(it)
    310     prev = next(it)
--> 311     for el in it:
    312         yield (prev, el)
    313         prev = el

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _slices_from_text(self, text)
   1289         for match in self._lang_vars.period_context_re().finditer(text):
   1290             context = match.group() + match.group('after_tok')
-> 1291             if self.text_contains_sentbreak(context):
   1292                 yield slice(last_break, match.end())
   1293                 if match.group('next_tok'):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in text_contains_sentbreak(self, text)
   1335         """
   1336         found = False # used to ignore last token
-> 1337         for t in self._annotate_tokens(self._tokenize_words(text)):
   1338             if found:
   1339                 return True

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _annotate_second_pass(self, tokens)
   1470         heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
   1471         """
-> 1472         for t1, t2 in _pair_iter(tokens):
   1473             self._second_pass_annotation(t1, t2)
   1474             yield t1

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _pair_iter(it)
    308     """
    309     it = iter(it)
--> 310     prev = next(it)
    311     for el in it:
    312         yield (prev, el)

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _annotate_first_pass(self, tokens)
    575           - ellipsis_toks: The indices of all ellipsis marks.
    576         """
--> 577         for aug_tok in tokens:
    578             self._first_pass_annotation(aug_tok)
    579             yield aug_tok

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/nltk/tokenize/punkt.pyc in _tokenize_words(self, plaintext)
    540         """
    541         parastart = False
--> 542         for line in plaintext.split('\n'):
    543             if line.strip():
    544                 line_toks = iter(self._lang_vars.word_tokenize(line))

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 15: ordinal not in range(128)

Opções para o experimento

Vetores

  • Usar vetores pre-treinados (word2vec GloVe)
  • Treinar vetores em dados do IMDB sem remover stop-words
  • Treinar vetores em outros corpora

Intrinsic evaluation

  • Usar google questions
  • Criar questions baseadas no domínio (filmes, adjetivos, etc, personagens!!, diretores e atrizes, outras analogias, gênero e ator, ver distâncias --> ver a interface web para escolher as perguntas) <--------
  • Seguir modelo do artigo do GloVe (Pennington et al. 2014)
  • Use 20 News groups from scikit

Extrinsic evaluation


In [ ]: