In [12]:
%pylab inline
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import imdb, reuters
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from keras.layers.convolutional import Convolution1D, MaxPooling1D, ZeroPadding1D, AveragePooling1D
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from gensim.models import word2vec
Load the IMDB sentiment dataset, keeping only the 500 most frequent words and reviews of at most 100 tokens, and pad each sequence to a fixed length of 100.
In [2]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=500, maxlen=100, test_split=0.2)
X_train = sequence.pad_sequences(X_train, maxlen=100)
X_test = sequence.pad_sequences(X_test, maxlen=100)
Let's look at one sample from X_train and the first 10 elements of y_train. The codes are indices of words in the vocabulary (unfortunately, we do not have access to the vocabulary for this set).
In [3]:
print(X_train[0])
print(y_train[:10])
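Before fitting anything, it is worth checking the class balance, since that sets the bar for "better than random guessing" later on. A minimal sketch, assuming y_train and y_test are the 0/1 label arrays loaded above:
In [ ]:
# Class balance of the padded IMDB splits (labels are 0/1 integers).
print(np.bincount(np.asarray(y_train)))
print(np.bincount(np.asarray(y_test)))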
We now construct a model whose first layer is a vector embedding, followed by a dense layer and an activation. Notice that the output of the Embedding needs to be flattened before it reaches the Dense layer.
In [4]:
model = Sequential()
model.add(Embedding(500, 32, input_length=100))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [5]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=10, verbose=1,
          validation_data=(X_test, y_test))
Out[5]:
The accuracy is not terrible, and certainly better than random guessing, but the model is clearly overfitting. To test your understanding, would you have been able to guess the shapes of the weights in these layers? Where does the 3200 in the first Dense layer come from?
In [6]:
print(model.layers[0].get_weights()[0].shape) # Embedding
print(model.layers[3].get_weights()[0].shape) # Dense(256)
print(model.layers[6].get_weights()[0].shape) # Dense(1)
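The 3200 comes from the Flatten layer: it concatenates the 100 embedding vectors of length 32 into a single vector of 100 × 32 = 3200 entries, which is exactly the input size of the first Dense weight matrix. A quick check of the layer output shapes (a sketch, assuming the model defined above is still in scope):
In [ ]:
# The Embedding outputs (None, 100, 32); Flatten turns that into (None, 3200).
print(model.layers[0].output_shape)  # Embedding
print(model.layers[2].output_shape)  # Flatten
print(100 * 32)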
In [7]:
model = Sequential()
# embedding
model.add(Embedding(500, 32, input_length=100))
model.add(Dropout(0.25))
# convolution layers
model.add(Convolution1D(nb_filter=32,
                        filter_length=4,
                        border_mode='valid',
                        activation='relu'))
model.add(MaxPooling1D(pool_length=2))
# dense layers
model.add(Flatten())
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
# output layer
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [8]:
model.fit(X_train, y_train, batch_size=32, nb_epoch=15, verbose=1,
          validation_data=(X_test, y_test))
Out[8]:
The performance is significantly improved, and could be much better if we further tweaked the parameters and constructed a deeper model.
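One easy refinement is the EarlyStopping callback imported at the top (but unused so far), which halts training once the validation loss stops improving and helps curb the overfitting seen earlier. A hedged sketch against the convolutional model just defined; the patience value is only illustrative:
In [ ]:
# Stop training once validation loss has not improved for 2 consecutive epochs.
stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
model.fit(X_train, y_train, batch_size=32, nb_epoch=15, verbose=1,
          validation_data=(X_test, y_test), callbacks=[stop])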
In [9]:
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=500, maxlen=100, test_split=0.2)
X_train = sequence.pad_sequences(X_train, maxlen=100)
X_test = sequence.pad_sequences(X_test, maxlen=100)
Y_train = np_utils.to_categorical(y_train, 46)
Y_test = np_utils.to_categorical(y_test, 46)
In [10]:
model = Sequential()
# embedding
model.add(Embedding(500, 32, input_length=100))
model.add(Dropout(0.25))
# convolution layers
model.add(Convolution1D(nb_filter=32,
                        filter_length=4,
                        border_mode='valid',
                        activation='relu'))
model.add(MaxPooling1D(pool_length=2))
# dense layers
model.add(Flatten())
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Activation('relu'))
# output layer
model.add(Dense(46))
# softmax + categorical cross-entropy for a single-label, 46-class problem
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
In [11]:
model.fit(X_train, Y_train, batch_size=32, nb_epoch=15, verbose=1,
          validation_data=(X_test, Y_test))
Out[11]:
The results are less impressive than they may at first seem, as the majority of the articles are in one of three categories.
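To make that concrete, we can compute the accuracy of a baseline that always predicts the most frequent topic. A minimal sketch using the integer Reuters labels loaded above:
In [ ]:
# Accuracy of always predicting the most common class in the test split.
counts = np.bincount(np.asarray(y_test))
print(counts.argmax(), counts.max() / float(len(y_test)))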
In [14]:
loc = "/Users/taylor/files/word2vec_python/GoogleNews-vectors-negative300.bin"
model = word2vec.Word2Vec.load_word2vec_format(loc, binary=True)
In [15]:
jobs = ["professor", "teacher", "actor", "clergy", "musician", "philosopher",
"writer", "singer", "dancers", "model", "anesthesiologist", "audiologist",
"chiropractor", "optometrist", "pharmacist", "psychologist", "physician",
"architect", "firefighter", "judges", "lawyer", "biologist", "botanist",
"ecologist", "geneticist", "zoologist", "chemist", "programmer", "designer"]
In [40]:
print(model[jobs[0]].shape)
print(model[jobs[0]][:25])
Out[40]:
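The geometry of these vectors carries meaning: semantically related occupations have high cosine similarity. A small hedged example using gensim's built-in similarity (the word pairs are just illustrations):
In [ ]:
# Cosine similarity between pairs of job titles.
print(model.similarity("professor", "teacher"))
print(model.similarity("professor", "firefighter"))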
In [20]:
embedding = np.array([model[x] for x in jobs])
In [27]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(embedding)
embedding_pca = np.transpose(pca.transform(embedding))
embedding_pca.shape
Out[27]:
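It is worth checking how much of the variance the two principal components actually capture, since the 2-D plot only preserves that fraction of the structure. A one-line sketch with scikit-learn's fitted PCA object:
In [ ]:
# Fraction of variance explained by each of the two principal components.
print(pca.explained_variance_ratio_)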
In [39]:
plt.figure(figsize=(16, 10))
plt.scatter(embedding_pca[0], embedding_pca[1], alpha=0)
for index, (x, y) in enumerate(np.transpose(embedding_pca)):
    plt.text(x, y, jobs[index])
Now, let's repeat this with country names.
In [50]:
country = ["United_States", "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Argentina",
"Armenia", "Australia", "Austria", "Azerbaijan", "Bahrain", "Bangladesh",
"Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan",
"Bolivia", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burundi",
"Cambodia", "Cameroon", "Canada", "Chad", "Chile", "Colombia",
"Comoros", "Croatia", "Cuba", "Cyprus", "Denmark", "Djibouti",
"Dominica", "Ecuador", "Egypt", "Eritrea", "Estonia", "Ethiopia",
"Fiji", "Finland", "France", "Gabon", "Georgia", "Germany", "Ghana",
"Greece", "Grenada", "Guatemala", "Guinea",
"Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India",
"Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica",
"Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait",
"Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia",
"Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia",
"Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta",
"Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova",
"Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique",
"Namibia", "Nauru", "Nepal", "Netherlands", "Nicaragua", "Niger",
"Nigeria", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Paraguay",
"Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania",
"Russia", "Rwanda", "Samoa", "Senegal", "Serbia", "Seychelles",
"Singapore", "Slovakia", "Slovenia", "Somalia", "Spain", "Sudan",
"Suriname", "Swaziland", "Sweden", "Switzerland", "Syria", "Tajikistan",
"Tanzania", "Thailand", "Togo", "Tonga", "Tunisia", "Turkey",
"Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "Uruguay", "Uzbekistan",
"Vanuatu", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe",
"Abkhazia", "Somaliland", "Mayotte", "Niue",
"Tokelau", "Guernsey", "Jersey", "Anguilla", "Bermuda", "Gibraltar",
"Montserrat", "Guam", "Macau", "Greenland", "Guadeloupe", "Martinique",
"Reunion", "Aland", "Aruba", "Svalbard", "Ascension"]
In [51]:
embedding = np.array([model[x] for x in country])
pca = PCA(n_components=2)
pca.fit(embedding)
embedding_pca = np.transpose(pca.transform(embedding))
embedding_pca.shape
plt.figure(figsize=(16, 10))
plt.scatter(embedding_pca[0], embedding_pca[1], alpha=0)
for index, (x, y) in enumerate(np.transpose(embedding_pca)):
    plt.text(x, y, country[index])
And, just because I think this is fun, let's run this on a smaller set of countries and their capitals.
In [60]:
city_pairs = ["Afghanistan", "Belarus", "Belgium", "Brazil", "Costa_Rica",
"Canada", "Netherlands", "United_Kingdom", "United_States", "Iran", "Kabul",
"Minsk", "Brussels", "Brasilia", "San_Jose", "Ottawa", "Amsterdam",
"London", "Washington", "Tehran"]
In [61]:
embedding = np.array([model[x] for x in city_pairs])
pca = PCA(n_components=2)
pca.fit(embedding)
embedding_pca = np.transpose(pca.transform(embedding))
embedding_pca.shape
plt.figure(figsize=(16, 10))
plt.scatter(embedding_pca[0], embedding_pca[1], alpha=0)
for index, (x, y) in enumerate(np.transpose(embedding_pca)):
    plt.text(x, y, city_pairs[index])
Notice how the line from each country to its capital has roughly the same slope and length for all of the pairs.
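This parallel offset is exactly what the well-known word-analogy arithmetic exploits: vec(Kabul) - vec(Afghanistan) + vec(Belarus) should land near vec(Minsk). A hedged sketch using gensim's most_similar; the particular pair is just an example:
In [ ]:
# country -> capital analogy: Afghanistan : Kabul :: Belarus : ?
print(model.most_similar(positive=["Kabul", "Belarus"],
                         negative=["Afghanistan"], topn=5))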
It is by no means fast (the nearest-neighbour search in gensim is essentially a brute-force scan over the vocabulary), but we can also do the reverse and find the closest words in the embedding space to a given term:
In [62]:
these = model.most_similar('Afghanistan', topn=25)
for th in these:
    print("%02.04f - %s" % th[::-1])