This script reads the 20 Newsgroups corpus from scikit-learn. Each document is converted to a bag-of-words vector (here TF-IDF weighted) over the 2000 most frequent words.
1) Compute baselines with Naive Bayes and a linear SVM
2) Train a deep feed-forward network
3) Train a stacked autoencoder (should work in principle, but it does not converge yet; check the parameters and the training method, see the sketch after the autoencoder cell)
In [ ]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
import random
random.seed(1)
np.random.seed(1)
max_words = 2000
examples_per_labels = 1000
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
#count_vect = CountVectorizer(stop_words='english', max_features=max_words)
count_vect = TfidfVectorizer(stop_words='english', max_features=max_words)
train_x = count_vect.fit_transform(newsgroups_train.data).toarray()
test_x = count_vect.transform(newsgroups_test.data).toarray()
train_y = newsgroups_train.target
test_y = newsgroups_test.target
nb_labels = max(train_y)+1
print "Train: ",train_x.shape
print "Test: ",test_x.shape
print "%d labels" % nb_labels
examples = []
examples_labels = []
examples_count = {}
for idx in xrange(train_x.shape[0]):
    label = train_y[idx]
    if label not in examples_count:
        examples_count[label] = 0
    if examples_count[label] < examples_per_labels:
        arr = train_x[idx]
        examples.append(arr)
        examples_labels.append(label)
        examples_count[label] += 1
train_subset_x = np.asarray(examples)
train_subset_y = np.asarray(examples_labels)
print "Train Subset: ",train_subset_x.shape
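Optional sanity check (not part of the original notebook): the subsampling loop above should yield a balanced subset, so the per-label counts from np.bincount should all equal examples_per_labels, or the class size if a class has fewer examples.
In [ ]:
# Sanity check: per-label counts in the balanced training subset
print "Examples per label:", np.bincount(train_subset_y)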
In [ ]:
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB(alpha=.01)
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Bernoulli Naive Bayes: %f%%" % (acc*100)
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Gaussian Naive Bayes: %f%%" % (acc*100)
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "Multinomial Naive Bayes: %f%%" % (acc*100)
# Linear SVM
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(train_subset_x, train_subset_y)
pred = clf.predict(test_x)
acc = metrics.accuracy_score(test_y, pred)
print "LinearSVM: %f%%" % (acc*100)
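Accuracy alone can hide large per-class differences across 20 labels. A minimal addition using scikit-learn's built-in report, reusing pred from the last fitted classifier (the linear SVM):
In [ ]:
# Per-class precision, recall and F1 for the last fitted classifier
print metrics.classification_report(test_y, pred, target_names=newsgroups_test.target_names)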
In [ ]:
# Note: this notebook uses the old Keras 0.x API (keras.layers.containers and the
# AutoEncoder layer were removed in Keras 1.0); it will not run unchanged on newer Keras.
from keras.layers import containers
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, AutoEncoder, Dropout
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
random.seed(2)
np.random.seed(2)
nb_epoch = 30
batch_size = 200
model = Sequential()
model.add(Dense(500, input_dim=max_words, activation='relu'))
model.add(Dropout(0.5))
#model.add(Dense(500, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(nb_labels, activation='softmax'))
train_subset_y_cat = np_utils.to_categorical(train_subset_y, nb_labels)
test_y_cat = np_utils.to_categorical(test_y, nb_labels)
model.compile(loss='categorical_crossentropy', optimizer='adam')
print('Start training')
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
show_accuracy=True, verbose=True, validation_data=(test_x, test_y_cat))
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=False)
print "Test accuracy: %f%%" % (score[1]*100)
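EarlyStopping is imported above but never used. A minimal sketch of how it could be wired in via the Keras 0.x callback API; the patience value is an arbitrary assumption:
In [ ]:
# Hypothetical: stop training once the validation loss stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, verbose=True, validation_data=(test_x, test_y_cat),
          callbacks=[early_stopping])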
In [ ]:
# Train the autoencoder
# Source: https://github.com/fchollet/keras/issues/358
from keras.layers import containers
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, AutoEncoder, Dropout
from keras.optimizers import SGD
from keras.utils import np_utils
random.seed(3)
np.random.seed(3)
nb_epoch = 30
batch_size = 200
nb_epoch_pretraining = 10
batch_size_pretraining = 1000
# Layer-wise pretraining
encoders = []
decoders = []
nb_hidden_layers = [max_words, 1000] # add more layer sizes here for a deeper stack
X_train_tmp = np.copy(train_x)
for i, (n_in, n_out) in enumerate(zip(nb_hidden_layers[:-1], nb_hidden_layers[1:]), start=1):
    print('Training the layer {}: Input {} -> Output {}'.format(i, n_in, n_out))
    # Create the autoencoder for this layer and train it
    ae = Sequential()
    encoder = containers.Sequential([Dense(output_dim=n_out, input_dim=n_in, activation='tanh'), Dropout(0.3)])
    decoder = containers.Sequential([Dense(output_dim=n_in, input_dim=n_out, activation='tanh')])
    ae.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=False))
    #sgd = SGD(lr=2, decay=1e-6, momentum=0.0, nesterov=True) # unused; training uses Adam below
    ae.compile(loss='mean_squared_error', optimizer='Adam')
    ae.fit(X_train_tmp, X_train_tmp, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose=True, shuffle=True)
    # Store the trained encoder/decoder and push the training data through this layer
    encoders.append(ae.layers[0].encoder)
    decoders.append(ae.layers[0].decoder)
    X_train_tmp = ae.predict(X_train_tmp)
# End-to-end training of the stacked autoencoder (only runs with more than one hidden layer)
if len(nb_hidden_layers) > 2:
    full_encoder = containers.Sequential()
    for encoder in encoders:
        full_encoder.add(encoder)
    full_decoder = containers.Sequential()
    for decoder in reversed(decoders):
        full_decoder.add(decoder)
    full_ae = Sequential()
    full_ae.add(AutoEncoder(encoder=full_encoder, decoder=full_decoder, output_reconstruction=False))
    full_ae.compile(loss='mean_squared_error', optimizer='Adam')
    print "Pretraining of the full autoencoder"
    full_ae.fit(train_x, train_x, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose=True, shuffle=True)
# Fine-tuning: stack the pretrained encoders and add a softmax classifier on top
model = Sequential()
for encoder in encoders:
    model.add(encoder)
model.add(Dense(output_dim=nb_labels, input_dim=nb_hidden_layers[-1], activation='softmax'))
train_subset_y_cat = np_utils.to_categorical(train_subset_y, nb_labels)
test_y_cat = np_utils.to_categorical(test_y, nb_labels)
model.compile(loss='categorical_crossentropy', optimizer='Adam')
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print "Test score before fine-tuning:", score[0]
print "Test accuracy before fine-tuning:", score[1]
model.fit(train_subset_x, train_subset_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
show_accuracy=True, validation_data=(test_x, test_y_cat), shuffle=True)
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print "Test score after fine-tuning:", score[0]
print "Test accuracy after fine-tuning:", score[1]
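One untested guess at why the autoencoder does not converge: the tanh decoder outputs values in [-1, 1], while the L2-normalized TF-IDF inputs lie in [0, 1]. A minimal sketch of a single-layer variant with a sigmoid decoder and binary cross-entropy (an assumption to try, not a verified fix):
In [ ]:
# Hypothetical variant: sigmoid decoder + binary cross-entropy for [0, 1] TF-IDF inputs
ae = Sequential()
encoder = containers.Sequential([Dense(output_dim=1000, input_dim=max_words, activation='tanh'), Dropout(0.3)])
decoder = containers.Sequential([Dense(output_dim=max_words, input_dim=1000, activation='sigmoid')])
ae.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=False))
ae.compile(loss='binary_crossentropy', optimizer='adam')
ae.fit(train_x, train_x, batch_size=batch_size_pretraining, nb_epoch=nb_epoch_pretraining, verbose=True, shuffle=True)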