AutoEncoder and Deep Neural Networks

This script reads the 20 newsgroups corpus from scikit-learn. Each document is converted to a bag-of-words vector (TF-IDF weighted) over the 2000 most common words.

1) Computes baselines using Naive Bayes and a linear SVM

2) Trains a deep feed-forward network

3) Trains an autoencoder (this should work in principle, but it does not yet converge; check the parameters and the training method)


In [ ]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
import random

random.seed(1)
np.random.seed(1)


max_words = 2000
examples_per_labels = 1000

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

#count_vect = CountVectorizer(stop_words='english', max_features=max_words)
count_vect = TfidfVectorizer(stop_words='english', max_features=max_words)
train_x = count_vect.fit_transform(newsgroups_train.data).toarray()
test_x = count_vect.transform(newsgroups_test.data).toarray()

train_y = newsgroups_train.target
test_y = newsgroups_test.target

nb_labels = max(train_y)+1

print "Train: ",train_x.shape
print "Test: ",test_x.shape
print "%d labels" % nb_labels



# Build a class-balanced subset with at most examples_per_labels documents per label
examples = []
examples_labels = []
examples_count = {}

for idx in xrange(train_x.shape[0]):
    label = train_y[idx]
    
    if label not in examples_count:
        examples_count[label] = 0
    
    if examples_count[label] < examples_per_labels:
        arr = train_x[idx]
        examples.append(arr)
        examples_labels.append(label)
        examples_count[label]+=1

train_subset_x = np.asarray(examples)
train_subset_y = np.asarray(examples_labels)

print "Train Subset: ",train_subset_x.shape

Baseline

We use scikit-learn to compute some baselines for the setting above.


In [ ]:
#Naive Bayes
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB(alpha=.01)
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Naive Bayes: %f%%" % (acc*100)

#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Gaussian Naive Bayes: %f%%" % (acc*100)

#MultinomialNB 
from sklearn.naive_bayes import MultinomialNB 
clf = MultinomialNB()
clf.fit(train_subset_x, train_subset_y)  
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Multinomial Naive Bayes: %f%%" % (acc*100)

#LinearSVM
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(train_subset_x, train_subset_y)  
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "LinearSVM: %f%%" % (acc*100)

Deep Feed-Forward Network


In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

random.seed(2)
np.random.seed(2)

nb_epoch = 30
batch_size = 200


model = Sequential()
model.add(Dense(500, input_dim=max_words, activation='relu'))
model.add(Dropout(0.5))
#model.add(Dense(500, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(nb_labels, activation='softmax'))

train_subset_y_cat = np_utils.to_categorical(train_subset_y, nb_labels)
test_y_cat = np_utils.to_categorical(test_y, nb_labels)


model.compile(loss='categorical_crossentropy', optimizer='adam')


print('Start training')
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, verbose=True, validation_data=(test_x, test_y_cat))

score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=False)
print('Test accuracy: %.4f' % score[1])
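
EarlyStopping is imported above but not used. The sketch below shows one way to plug it into the same fit call, assuming the Keras 0.x callback API; the patience value is an arbitrary choice, and re-running fit continues from the already trained weights unless the model is rebuilt first.


In [ ]:
# Optional variant: stop training once the validation loss stops improving.
# Note: calling fit again continues from the current weights; re-create the model to start fresh.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, verbose=True, validation_data=(test_x, test_y_cat),
          callbacks=[early_stopping])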

Autoencoder

This is how the autoencoder should work in principle. However, the pretraining does not seem to work: the loss stays approximately constant across all epochs. If you find the problem, please send me an email (one possible culprit is sketched after the cell below).


In [ ]:
# Train the autoencoder
# Source: https://github.com/fchollet/keras/issues/358
from keras.layers import containers
from keras.models import Sequential
from keras.layers.core import Dense, AutoEncoder, Dropout
from keras.optimizers import SGD
from keras.utils import np_utils

random.seed(3)
np.random.seed(3)

nb_epoch = 30
batch_size = 200

nb_epoch_pretraining = 10
batch_size_pretraining = 1000


# Layer-wise pretraining
encoders = []
decoders = []
nb_hidden_layers = [max_words, 1000]
X_train_tmp = np.copy(train_x)
for i, (n_in, n_out) in enumerate(zip(nb_hidden_layers[:-1], nb_hidden_layers[1:]), start=1):
    print('Training the layer {}: Input {} -> Output {}'.format(i, n_in, n_out))
    # Create AE and training
    ae = Sequential()
    encoder = containers.Sequential([Dense(output_dim=n_out, input_dim=n_in, activation='tanh'), Dropout(0.3)])
    decoder = containers.Sequential([Dense(output_dim=n_in, input_dim=n_out, activation='tanh')])
    ae.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=False))
    
    # Note: this SGD instance (lr=2 is very high) is created but never used;
    # compile() below receives the 'Adam' string instead.
    sgd = SGD(lr=2, decay=1e-6, momentum=0.0, nesterov=True)
    ae.compile(loss='mean_squared_error', optimizer='Adam')
    ae.fit(X_train_tmp, X_train_tmp, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose = True, shuffle=True)
    # Store the trained encoder/decoder and replace the training data with its encoded representation
    encoders.append(ae.layers[0].encoder)
    decoders.append(ae.layers[0].decoder)
    X_train_tmp = ae.predict(X_train_tmp)
    
# End-to-end autoencoder training (only when more than one hidden layer was pretrained)
if len(nb_hidden_layers) > 2:
    full_encoder = containers.Sequential()
    for encoder in encoders:
        full_encoder.add(encoder)

    full_decoder = containers.Sequential()
    for decoder in reversed(decoders):
        full_decoder.add(decoder)

    full_ae = Sequential()
    full_ae.add(AutoEncoder(encoder=full_encoder, decoder=full_decoder, output_reconstruction=False))    
    full_ae.compile(loss='mean_squared_error', optimizer='Adam')

    print "Pretraining of full AE"
    full_ae.fit(train_x, train_x, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose = True, shuffle=True)

    
# Fine-tuning
model = Sequential()
for encoder in encoders:
    model.add(encoder)
    
model.add(Dense(output_dim=nb_labels, input_dim=nb_hidden_layers[-1], activation='softmax'))

train_subset_y_cat = np_utils.to_categorical(train_subset_y, nb_labels)
test_y_cat = np_utils.to_categorical(test_y, nb_labels)


model.compile(loss='categorical_crossentropy', optimizer='Adam')
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score before fine-tuning: %.4f' % score[0])
print('Test accuracy before fine-tuning: %.4f' % score[1])
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(test_x, test_y_cat), shuffle=True)
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score after fine-tuning: %.4f' % score[0])
print('Test accuracy after fine-tuning: %.4f' % score[1])
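
The convergence issue mentioned above is not resolved here, but two details in the pretraining loop are worth a second look: the SGD instance with lr=2 is never actually used (compile() is given the 'Adam' string), and the tanh decoder has to reconstruct non-negative TF-IDF values. The sketch below is a hedged alternative for the single pretraining layer, using a ReLU encoder, a sigmoid decoder and an explicit Adam optimizer; it is an untested guess, not a verified fix.


In [ ]:
# Hedged alternative pretraining setup for the first (and only) hidden layer.
# Assumptions: TF-IDF values lie in [0, 1], so a sigmoid decoder matches the target range;
# Adam with its default small learning rate replaces the unused SGD(lr=2).
from keras.optimizers import Adam

ae_alt = Sequential()
encoder_alt = containers.Sequential([Dense(output_dim=1000, input_dim=max_words, activation='relu'), Dropout(0.3)])
decoder_alt = containers.Sequential([Dense(output_dim=max_words, input_dim=1000, activation='sigmoid')])
ae_alt.add(AutoEncoder(encoder=encoder_alt, decoder=decoder_alt, output_reconstruction=False))
ae_alt.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001))
ae_alt.fit(train_x, train_x, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose=True, shuffle=True)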
