word2vec IMDB data

Training word2vec embeddings on the IMDB database and experimenting.

Referência: Tutorial Kagggle "Bag of Words meets Bags of Popcorn"


In [ ]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import pandas as pd
import gensim

In [7]:
train = pd.read_csv( "labeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3 )

In [88]:
train


Out[88]:
id sentiment review
0 "5814_8" 1 "With all this stuff going down at the moment ...
1 "2381_9" 1 "\"The Classic War of the Worlds\" by Timothy ...
2 "7759_3" 0 "The film starts with a manager (Nicholas Bell...
3 "3630_4" 0 "It must be assumed that those who praised thi...
4 "9495_8" 1 "Superbly trashy and wondrously unpretentious ...
5 "8196_8" 1 "I dont know why people think this is such a b...
6 "7166_2" 0 "This movie could have been very good, but com...
7 "10633_1" 0 "I watched this video at a friend's house. I'm...
8 "319_1" 0 "A friend of mine bought this film for £1, and...
9 "8713_10" 1 "<br /><br />This movie is full of references....
10 "2486_3" 0 "What happens when an army of wetbacks, towelh...
11 "6811_10" 1 "Although I generally do not like remakes beli...
12 "11744_9" 1 "\"Mr. Harvey Lights a Candle\" is anchored by...
13 "7369_1" 0 "I had a feeling that after \"Submerged\", thi...
14 "12081_1" 0 "note to George Litman, and others: the Myster...
15 "3561_4" 0 "Stephen King adaptation (scripted by King him...
16 "4489_1" 0 "`The Matrix' was an exciting summer blockbust...
17 "3951_2" 0 "Ulli Lommel's 1980 film 'The Boogey Man' is n...
18 "3304_10" 1 "This movie is one among the very few Indian m...
19 "9352_10" 1 "Most people, especially young people, may not...
20 "3374_7" 1 "\"Soylent Green\" is one of the best and most...
21 "10782_7" 1 "Michael Stearns plays Mike, a sexually frustr...
22 "5414_10" 1 "This happy-go-luck 1939 military swashbuckler...
23 "10492_1" 0 "I would love to have that two hours of my lif...
24 "3350_3" 0 "The script for this movie was probably found ...
25 "6581_7" 1 "Looking for Quo Vadis at my local video store...
26 "2203_3" 0 "Note to all mad scientists everywhere: if you...
27 "689_1" 0 "What the ........... is this ? This must, wit...
28 "9152_1" 0 "Intrigued by the synopsis (every gay video th...
29 "6077_1" 0 "Would anyone really watch this RUBBISH if it ...
... ... ... ...
24970 "9389_7" 1 "Red Rock West (1993)<br /><br />Nicolas Cage ...
24971 "9251_9" 1 "what can i say?, ms Erika Eleniak is my favor...
24972 "1422_10" 1 "The spoiler warning is for those people who w...
24973 "7415_2" 0 "What do you call a horror story without horro...
24974 "7492_7" 1 "Though not a horror film in the traditional s...
24975 "7689_10" 1 "This was what black society was like before t...
24976 "12370_4" 0 "They probably should have called this movie T...
24977 "5625_8" 1 "Attractive Marjorie(Farrah Fawcett)lives in f...
24978 "9397_9" 1 "Vaguely reminiscent of great 1940's westerns,...
24979 "5992_7" 1 "I admit I had no idea what to expect before v...
24980 "2488_10" 1 "To me, the final scene, in which Harris respo...
24981 "9627_10" 1 "This is by far the funniest short made by the...
24982 "3822_2" 0 "To be a Buster Keaton fan is to have your hea...
24983 "5983_4" 0 "I was one of those \"few Americans\" that gre...
24984 "8021_2" 0 "Visually disjointed and full of itself, the d...
24985 "3471_3" 0 "this movie had more holes than a piece of swi...
24986 "6034_10" 1 "Last November, I had a chance to see this fil...
24987 "1988_9" 1 "First off, I'd like to make a correction on a...
24988 "7623_9" 1 "While originally reluctant to jump on the ban...
24989 "5974_7" 1 "I heard about this movie when watching VH1's ...
24990 "2034_9" 1 "I've never been huge on IMAX films. They're c...
24991 "9416_3" 0 "Steve McQueen has certainly a lot of loyal fa...
24992 "10994_1" 0 "Sometimes you wonder how some people get fund...
24993 "10957_3" 0 "I am a student of film, and have been for sev...
24994 "2372_1" 0 "Unimaginably stupid, redundant and humiliatin...
24995 "3453_3" 0 "It seems like more consideration has gone int...
24996 "5064_1" 0 "I don't believe they made this film. Complete...
24997 "10905_3" 0 "Guy is a loser. Can't get girls, needs to bui...
24998 "10194_3" 0 "This 30 minute documentary Buñuel made in the...
24999 "8478_8" 1 "I saw this movie as a child and it broke my h...

25000 rows × 3 columns


In [8]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [9]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.decode('utf-8').strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [10]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html5lib"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 162 of the file /System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py. To get rid of this warning, change code that looks like this:

 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: "." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.happierabroad.com"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
Parsing sentences from unlabeled set
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.archive.org/details/LovefromaStranger"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.loosechangeguide.com/LooseChangeGuide.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.msnbc.msn.com/id/4972055/site/newsweek/"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:219: UserWarning: ".." looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://www.youtube.com/watch?v=a0KSqelmgN8"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: "http://jake-weird.blogspot.com/2007/08/beneath.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [ ]:
y = train["sentiment"]
X = []
for review in train["review"]:
    sentences = review_to_sentences(review, tokenizer)
    words = []
    for sentence in sentences:
        words += sentence
    X.append(sentence)

In [123]:
print len(train["sentiment"]), len(train["review"]), len(X), X[0]
# print review_to_sentences(train[0], tokenizer)


25000 25000 266551 [u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']

In [14]:
model = gensim.models.Word2Vec(sentences, min_count=1)

In [100]:
def vectorize(sentence):
    return [model[word] for word in sentence]

In [ ]:
print(model.wv.most_similar(positive=['he', 'her'], negative=['she']))

In [29]:
acc = model.accuracy('questions-words.txt')

In [36]:
[(d.keys()[1], d[d.keys()[1]]) for d in acc]


Out[36]:
[('section', u'capital-common-countries'),
 ('section', u'capital-world'),
 ('section', u'currency'),
 ('section', u'city-in-state'),
 ('section', u'family'),
 ('section', u'gram1-adjective-to-adverb'),
 ('section', u'gram2-opposite'),
 ('section', u'gram3-comparative'),
 ('section', u'gram4-superlative'),
 ('section', u'gram5-present-participle'),
 ('section', u'gram6-nationality-adjective'),
 ('section', u'gram7-past-tense'),
 ('section', u'gram8-plural'),
 ('section', u'gram9-plural-verbs'),
 ('section', 'total')]

In [54]:
for i in range(0, len(acc)):
    print(acc[i][acc[i].keys()[1]], len(acc[i]['correct']), len(acc[i]['incorrect']))#, len(acc[i]['correct']/len(acc[i]['incorrect']))


(u'capital-common-countries', 26, 246)
(u'capital-world', 19, 269)
(u'currency', 0, 40)
(u'city-in-state', 6, 851)
(u'family', 241, 179)
(u'gram1-adjective-to-adverb', 47, 883)
(u'gram2-opposite', 42, 608)
(u'gram3-comparative', 643, 689)
(u'gram4-superlative', 217, 539)
(u'gram5-present-participle', 271, 541)
(u'gram6-nationality-adjective', 80, 1149)
(u'gram7-past-tense', 382, 950)
(u'gram8-plural', 273, 657)
(u'gram9-plural-verbs', 352, 350)
('total', 2599, 7951)

In [82]:
model.wv.syn0.shape


Out[82]:
(123504, 100)

Keras model


In [69]:
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.datasets import imdb
from keras import backend as K

from theano import function


Using Theano backend.

In [ ]:
max_features = 20000
max_len = 500
# weights = model.wv.syn0
weights = np.vstack([np.zeros(100), model.wv.syn0])
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights]))
# lstm_model.add(Embedding(max_features, 128, input_length = max_len))
lstm_model.add(LSTM(100))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(1))
lstm_model.add(Activation('sigmoid'))

print(lstm_model.summary())

In [84]:
max_features = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words = max_features)

In [ ]:


In [86]:
lstm_model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics=["accuracy"])
print("Train..")
batch_size = 30
score = lstm_model.fit(X_train, y_train, batch_size = batch_size,
                      nb_epoch = 4, validation_data = (X_test, y_test))


Train..
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-86-94704e6cdfba> in <module>()
      5 batch_size = 30
      6 score = lstm_model.fit(X_train, y_train, batch_size = batch_size,
----> 7                       nb_epoch = 4, validation_data = (X_test, y_test))

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/models.pyc in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
    854                               class_weight=class_weight,
    855                               sample_weight=sample_weight,
--> 856                               initial_epoch=initial_epoch)
    857 
    858     def evaluate(self, x, y, batch_size=32, verbose=1,

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
   1496                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
   1497                               callback_metrics=callback_metrics,
-> 1498                               initial_epoch=initial_epoch)
   1499 
   1500     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/engine/training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)
   1150                 batch_logs['size'] = len(batch_ids)
   1151                 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1152                 outs = f(ins_batch)
   1153                 if not isinstance(outs, list):
   1154                     outs = [outs]

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/backend/theano_backend.pyc in __call__(self, inputs)
   1156     def __call__(self, inputs):
   1157         assert isinstance(inputs, (list, tuple))
-> 1158         return self.function(*inputs)
   1159 
   1160 

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    793                         s.storage[0] = s.type.filter(
    794                             arg, strict=s.strict,
--> 795                             allow_downcast=s.allow_downcast)
    796 
    797                     except Exception as e:

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/theano/tensor/type.pyc in filter(self, data, strict, allow_downcast)
    115             if allow_downcast:
    116                 # Convert to self.dtype, regardless of the type of data
--> 117                 data = theano._asarray(data, dtype=self.dtype)
    118                 # TODO: consider to pad shape with ones to make it consistent
    119                 # with self.broadcastable... like vector->row type thing

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/theano/misc/safe_asarray.pyc in _asarray(a, dtype, order)
     32         dtype = theano.config.floatX
     33     dtype = np.dtype(dtype)  # Convert into dtype object.
---> 34     rval = np.asarray(a, dtype=dtype, order=order)
     35     # Note that dtype comparison must be done by comparing their `num`
     36     # attribute. One cannot assume that two identical data types are pointers

/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    529 
    530     """
--> 531     return array(a, dtype, copy=False, order=order)
    532 
    533 

ValueError: Bad input argument to theano function with name "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/backend/theano_backend.py:1154" at index 0 (0-based).  
Backtrace when that variable is created:

  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-85-aa344cb3472f>", line 5, in <module>
    lstm_model.add(Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights]))
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/models.py", line 429, in add
    dtype=layer.dtype, name=layer.name + '_input')
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/engine/topology.py", line 1414, in Input
    input_tensor=tensor)
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/engine/topology.py", line 1325, in __init__
    name=self.name)
  File "/Users/emannuelcarvalho/.virtualenvs/primogenithon/lib/python2.7/site-packages/keras/backend/theano_backend.py", line 185, in placeholder
    x = T.TensorType(dtype, broadcast)(name)
setting an array element with a sequence.

In [ ]: