We are going to train a neural network to predict which newsgroup a document comes from, using the 20 Newsgroups dataset.
For this purpose we'll use TensorFlow and scikit-learn. Your job is to fill in the missing code in the cells below.
You will find the steps you need to perform in the Task section of each cell.
You must submit your code together with your experiments.
In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from nltk import TweetTokenizer
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras import regularizers, initializers
In [2]:
def next_batch(x_, y_, batch_size, ids=None):
    if ids is None:
        # Random sample from the dataset. Sampling can also be sequential within an
        # epoch (after shuffling), which guarantees that all the data is used;
        # the two approaches are practically equivalent over a large number of epochs.
        ids = np.random.choice(x_.shape[0], batch_size, replace=False)
    feed_dict = {
        'x': x_[ids],
        'y': y_[ids]
    }
    return feed_dict

def tweet_tokenize(text):
    tknzr = TweetTokenizer(preserve_case=True, strip_handles=True)
    return tknzr.tokenize(text)

def evaluate_accuracy(x, y):
    return sess.run(accuracy, feed_dict=next_batch(x, y, len(x)))
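For reference, a minimal sketch of the shuffled-sequential batching mentioned in the comment inside next_batch. The generator epoch_batches is hypothetical and is not used elsewhere in this notebook:

def epoch_batches(x_, y_, batch_size):
    # Shuffle once per epoch, then walk the data in order so that every
    # example is used exactly once per epoch.
    ids = np.random.permutation(x_.shape[0])
    for start in range(0, len(ids), batch_size):
        batch_ids = ids[start:start + batch_size]
        yield {'x': x_[batch_ids], 'y': y_[batch_ids]}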
In [3]:
hparams = tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = False,
    use_early_stopping = True,
    early_stopping_patience = 3,
    use_l2_reg = False,
    layers = 2,
    seed = 42
)
We are going to use the 20 Newsgroups dataset for multi-class text classification with TensorFlow.
First we load the data with the fetch_20newsgroups function from sklearn.
In [4]:
print('Loading data...')
# Passing None as we want to train over all the data.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=None)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=None)
print('Data loaded.')
In this paragraph you need to pre-process your data and create vectors suitable for feeding the NN. You can try different transformations and features; TF-IDF would be a good start.
The expected output of the cell below is:
20 classes
Vectorizing sequence data...
x_train shape: (11314, max_features)
x_test shape: (7532, max_features)
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (11314, 20)
y_test shape: (7532, 20)
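The cell below uses a binary bag-of-words representation. As one possible TF-IDF starting point, here is a sketch using the same Keras Tokenizer API with mode='tfidf'; the names tfidf_tokenizer, x_train_tfidf and x_test_tfidf are illustrative only and are not used elsewhere:

tfidf_tokenizer = Tokenizer(num_words=hparams.max_features)
tfidf_tokenizer.fit_on_texts(newsgroups_train.data)
x_train_tfidf = tfidf_tokenizer.texts_to_matrix(newsgroups_train.data, mode='tfidf')
x_test_tfidf = tfidf_tokenizer.texts_to_matrix(newsgroups_test.data, mode='tfidf')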
In [5]:
num_classes = np.max(newsgroups_train.target) + 1
print(num_classes, 'classes')
print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=hparams.max_features)
tokenizer.fit_on_texts(newsgroups_train.data)
x_train = tokenizer.texts_to_matrix(newsgroups_train.data, mode='binary')
x_test = tokenizer.texts_to_matrix(newsgroups_test.data, mode='binary')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = to_categorical(newsgroups_train.target, num_classes)
y_test = to_categorical(newsgroups_test.target, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
In [6]:
def create_model(hparams):
    input_layer = layers.Input(shape=(hparams.max_features,), name='input')
    hidden = input_layer
    for i in range(hparams.layers):
        # Create the hidden layers
        hidden = layers.Dense(
            128,
            activation='relu',
            kernel_regularizer=regularizers.l2(hparams.reg_param) if hparams.use_l2_reg else None,
            kernel_initializer=initializers.glorot_normal(seed=hparams.seed),
            name='dense-{}'.format(i)
        )(hidden)
        if hparams.use_dropout:
            # Use hparams.dropout_keep_prob to add a dropout mask
            hidden = layers.Dropout(rate=1 - hparams.dropout_keep_prob)(hidden)
    # Softmax over classes for the output
    output_layer = layers.Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(hparams.reg_param) if hparams.use_l2_reg else None,
        kernel_initializer=initializers.glorot_normal(seed=hparams.seed),
        name='output'
    )(hidden)
    if hparams.use_dropout:
        # Use hparams.dropout_keep_prob to add a dropout mask
        output_layer = layers.Dropout(rate=1 - hparams.dropout_keep_prob)(output_layer)
    model = Model(inputs=[input_layer], outputs=output_layer)
    # Minimize error using cross entropy
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model
model = create_model(hparams)
In [7]:
def train_model(model, hparams):
    full_history = {
        'loss': [[], []],
        'acc': [[], []]
    }
    patience = 0
    best_test_loss = np.inf
    best_epoch = 0
    # Training cycle
    for epoch in range(hparams.max_epochs):
        history = model.fit(
            x=x_train,
            y=y_train,
            batch_size=hparams.batch_size,
            epochs=1,
            shuffle=True
        )
        train_loss = history.history['loss'][0]
        train_acc = history.history['acc'][0]
        test_loss, test_acc = model.evaluate(
            x=x_test,
            y=y_test,
            batch_size=hparams.batch_size
        )
        full_history['loss'][0].append(train_loss)
        full_history['loss'][1].append(test_loss)
        full_history['acc'][0].append(train_acc)
        full_history['acc'][1].append(test_acc)
        if hparams.use_early_stopping:
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                best_epoch = epoch
            else:
                if patience < hparams.early_stopping_patience:
                    patience = patience + 1
                else:
                    print('best epoch to stop is: {} with loss: {}'.format(best_epoch, best_test_loss))
                    break
    print("Optimization Finished!")
    return full_history
history = train_model(model, hparams)
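For comparison, a sketch of how the same training could be driven by Keras's built-in EarlyStopping callback instead of the manual loop above (not run here). Note that the callback resets its patience counter whenever the monitored loss improves, so the stopping point may differ slightly from the loop above:

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=hparams.early_stopping_patience)
model.fit(x_train, y_train,
          batch_size=hparams.batch_size,
          epochs=hparams.max_epochs,
          validation_data=(x_test, y_test),
          callbacks=[early_stop],
          shuffle=True)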
In [8]:
def visualize_history(history, key='loss'):
    plt.plot(history[key][0])
    plt.plot(history[key][1])
    plt.title('model {}'.format(key))
    plt.ylabel(key)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    return plt
In [9]:
visualize_history(history, key='loss').show()
In [10]:
visualize_history(history, key='acc').show()
In [11]:
def run_experiment(hparams, title='Experiment'):
    print('RUNNING EXPERIMENT: {}'.format(title))
    model = create_model(hparams)
    history = train_model(model, hparams)
    visualize_history(history, key='loss').show()
    visualize_history(history, key='acc').show()
    final_test_loss = history['loss'][1][-1]
    final_test_acc = history['acc'][1][-1]
    print('Final test loss: {}'.format(final_test_loss))
    print('Final test accuracy: {}'.format(final_test_acc))
All experiments share the following fixed hyperparameters: batch_size = 32, max_epochs = 100, max_features = 1000, learning_rate = 0.03, reg_param = 0.03, dropout_keep_prob = 0.9, seed = 42. Each hidden layer has 128 neurons.
A grid search over all of these parameters would be too large, so we focus on the layer count and on whether L2 regularization, dropout, and early stopping are used.
| Experiment # | Layer count | L2 Regularization | Dropout | Early Stopping | Accuracy | Loss | Comment |
|---|---|---|---|---|---|---|---|
| 1 | 2 | yes | no | no | 0.2493 | 2.8602 | |
| 2 | 2 | no | yes | no | 0.0426 | NaN | gradient exploded |
| 3 | 2 | no | no | yes | 0.6279 | 1.7183 | stopped at first epoch with patience 1 (also with patience 3) |
| 4 | 2 | yes | yes | no | 0.1251 | 2.8817 | |
| 5 | 3 | no | yes | yes | 0.6480 | 1.5360 | best result |
| 6 | 3 | yes | no | yes | 0.1225 | 2.8889 | stopped at epoch 10 |
| 7 | 3 | yes | yes | yes | 0.0528 | 2.9902 | |
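The seven configurations above could also be generated programmatically instead of being written out cell by cell. A sketch follows (illustrative only; the configs list, loop, and abbreviated titles are not part of the original notebook, and early_stopping_patience is held at 3, which only matters when early stopping is enabled):

configs = [
    # (layers, use_l2_reg, use_dropout, use_early_stopping)
    (2, True,  False, False),
    (2, False, True,  False),
    (2, False, False, True),
    (2, True,  True,  False),
    (3, False, True,  True),
    (3, True,  False, True),
    (3, True,  True,  True),
]
for i, (n_layers, l2, dropout, early) in enumerate(configs, start=1):
    run_experiment(tf.contrib.training.HParams(
        batch_size = 32,
        max_epochs = 100,
        max_features = 1000,
        learning_rate = 0.03,
        reg_param = 0.03,
        dropout_keep_prob = 0.9,
        use_dropout = dropout,
        use_early_stopping = early,
        early_stopping_patience = 3,
        use_l2_reg = l2,
        layers = n_layers,
        seed = 42
    ), title='{}) {} layers'.format(i, n_layers))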
In [12]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = False,
    use_early_stopping = False,
    early_stopping_patience = 1,
    use_l2_reg = True,
    layers = 2,
    seed = 42
), title="1) 2 Layers, L2")
In [13]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = True,
    use_early_stopping = False,
    early_stopping_patience = 1,
    use_l2_reg = False,
    layers = 2,
    seed = 42
), title="2) 2 Layers, Dropout")
In [14]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = False,
    use_early_stopping = True,
    early_stopping_patience = 3,
    use_l2_reg = False,
    layers = 2,
    seed = 42
), title="3) 2 Layers, Early Stopping")
In [15]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = True,
    use_early_stopping = False,
    early_stopping_patience = 3,
    use_l2_reg = True,
    layers = 2,
    seed = 42
), title="4) 2 Layers, L2, Dropout")
In [16]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = True,
    use_early_stopping = True,
    early_stopping_patience = 3,
    use_l2_reg = False,
    layers = 3,
    seed = 42
), title="5) 3 Layers, Dropout, Early Stopping")
In [17]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = False,
    use_early_stopping = True,
    early_stopping_patience = 3,
    use_l2_reg = True,
    layers = 3,
    seed = 42
), title="6) 3 Layers, L2, Early Stopping")
In [18]:
run_experiment(tf.contrib.training.HParams(
    batch_size = 32,
    max_epochs = 100,
    max_features = 1000,
    learning_rate = 0.03,
    reg_param = 0.03,
    dropout_keep_prob = 0.9,
    use_dropout = True,
    use_early_stopping = True,
    early_stopping_patience = 3,
    use_l2_reg = True,
    layers = 3,
    seed = 42
), title="7) 3 Layers, L2, Dropout, Early Stopping")