The original script that this notebook is based on is here.
In [1]:
# Imports
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
max_words: Only keep this many words as features; the most common words are used.
batch_size: The number of samples per gradient update. Bigger values make the gradient update more accurate, but mean it takes longer to train the neural network.
nb_epoch: The number of times to go through all of the training data. Since batch_size is less than the full training set size, each "epoch" updates the gradient multiple times. So basically, the number of iterations is nb_epoch * sample_size / batch_size (see the sketch after this list).
nb_hidden: The number of hidden layers to use.
nb_dense: The number of units to use in the hidden layer(s).
p_dropout: Randomly sets this fraction of the input units to 0 at each gradient update. It helps to prevent overfitting.
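As a rough check of that iteration count, here is a quick sketch using the hyperparameter values set in the next cell. The training-set size used here is an approximation (an assumption, not something loaded from the dataset), and the exact number of batches also depends on the validation split and the final partial batch:

# Rough estimate of the number of gradient updates (sketch; sizes are approximate)
approx_train_size = 9000                               # roughly the size of the Reuters training split (assumption)
batch_size = 32
nb_epoch = 15
updates_per_epoch = approx_train_size // batch_size    # ~281 full batches per epoch
total_updates = nb_epoch * updates_per_epoch           # ~4215 gradient updates overall
print(updates_per_epoch, total_updates)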
Here is something close to what the neural network we use here looks like.
Each of the input nodes corresponds to a yes/no answer to the question "Does this article contain the word 'x'?" In our model, we have max_words input nodes instead of the 3 shown here.
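To make that encoding concrete, here is a tiny hand-rolled sketch of the "does this article contain word x?" vectorization. The vocabulary size and the example word indices are made up for illustration; the notebook itself does this with Tokenizer.sequences_to_matrix(..., mode='binary') below:

import numpy as np

max_words_demo = 10                      # pretend vocabulary of only 10 words
article = [3, 5, 3, 7]                   # one article as a list of word indices
x = np.zeros(max_words_demo)
x[article] = 1.0                         # mark each word that appears at least once
print(x)                                 # 1.0 at positions 3, 5 and 7, zeros elsewhere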
The next layer, the hidden layer, is where a lot of the magic happens. Each hidden-layer node takes a linear combination of the input-layer values and applies a nonlinear "activation" function to it; typical activation functions are tanh or, in this case, relu. The more hidden-layer nodes you have, the more expressive (and potentially more accurate) the network can be.
The output layer in our case has one node per type of news article. Like the hidden layer, each node is a linear combination of the previous layer's outputs, passed through a softmax activation so the outputs can be read as class probabilities.
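Here is a minimal numpy sketch of the forward pass such a network computes, with made-up tiny dimensions and random weights (during training, Keras learns the actual weights; this is only to show the linear-combination-plus-activation idea):

import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

x = np.array([1.0, 0.0, 1.0])            # 3 binary input features (toy stand-in for max_words)
W1 = np.random.randn(4, 3)               # hidden layer: 4 units, each a linear combination of the inputs
b1 = np.zeros(4)
h = relu(W1.dot(x) + b1)                 # nonlinear activation applied to the linear combination

W2 = np.random.randn(2, 4)               # output layer: 2 toy classes
b2 = np.zeros(2)
y = softmax(W2.dot(h) + b2)              # class probabilities that sum to 1
print(y)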
In [2]:
max_words = 1000
batch_size = 32
nb_epoch = 15
nb_dense = 512
nb_hidden = 1 # The number of hidden layers to use
p_dropout = 0.5
In [3]:
print('Loading data...')
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
nb_classes = np.max(y_train)+1
print(nb_classes, 'classes')
print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)
In [4]:
print('Building model...')
model = Sequential()
model.add(Dense(nb_dense, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(p_dropout))
for _ in range(nb_hidden-1):
    model.add(Dense(nb_dense))
    model.add(Activation('relu'))
    model.add(Dropout(p_dropout))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
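Before training, it can be useful to confirm the architecture Keras actually built; model.summary() prints each layer's output shape and parameter count (output omitted here):

model.summary()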
In [5]:
import time
t1 = time.time()
history = model.fit(X_train, Y_train,
                    nb_epoch=nb_epoch, batch_size=batch_size,
                    verbose=1, validation_split=0.1)
t2 = time.time()
print('Model training took {:.2g} minutes'.format((t2-t1)/60))
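The history object returned by fit records the loss and accuracy for each epoch, which makes it easy to check for overfitting. A quick plotting sketch, assuming this Keras version stores the metrics under the 'acc' and 'val_acc' keys:

import matplotlib.pyplot as plt

plt.plot(history.history['acc'], label='train accuracy')
plt.plot(history.history['val_acc'], label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()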
In [6]:
score = model.evaluate(X_test, Y_test,
                       batch_size=batch_size, verbose=1)
print('\nTest score:', score[0])
print('Test accuracy:', score[1])
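To see what the trained model predicts for an individual article, take the argmax of the output probabilities (a small sketch; the integer class indices correspond to the Reuters topic labels used by the dataset):

probs = model.predict(X_test[:1], verbose=0)        # shape (1, nb_classes)
predicted_class = np.argmax(probs, axis=1)[0]
print('Predicted class:', predicted_class, 'actual class:', y_test[0])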
In [13]:
import output_model
In [19]:
output_model.save_model(model, 'models/Reuters_MLP_model')
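output_model is a local helper module, not shown in this notebook. If you don't have it, plain Keras can persist the architecture and weights with something like the sketch below (file names are just examples; saving weights requires h5py):

# Plain Keras alternative: architecture as JSON, weights as HDF5
with open('models/Reuters_MLP_model.json', 'w') as f:
    f.write(model.to_json())
model.save_weights('models/Reuters_MLP_model.h5', overwrite=True)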