In [ ]:
%load_ext autoreload
%autoreload 2
from lib import models, graph, coarsening, utils
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.sparse
import numpy as np
import os, time, shutil
%matplotlib inline
In [ ]:
flags = tf.app.flags
FLAGS = flags.FLAGS
# Graphs.
flags.DEFINE_integer('number_edges', 16, 'Graph: minimum number of edges per vertex.')
flags.DEFINE_string('metric', 'cosine', 'Graph: similarity measure (between features).')
# TODO: change cgcnn for combinatorial Laplacians.
flags.DEFINE_bool('normalized_laplacian', True, 'Graph Laplacian: normalized.')
flags.DEFINE_integer('coarsening_levels', 0, 'Number of coarsened graphs.')
flags.DEFINE_string('dir_data', os.path.join('data', 'rcv1'), 'Directory to store data.')
flags.DEFINE_integer('val_size', 400, 'Size of the validation set.')
From the Dropout paper (Bruna et al. used the same setup): we took the dataset and split it into 63 classes based on the 63 categories at the second level of the category tree. We removed 11 categories that did not have any data and one category that had only 4 training examples. We also removed one category that covered a huge chunk (25%) of the examples. This left us with 50 classes and 402,738 documents. We divided the documents randomly into equal-sized training and test sets. Each document was represented using the 2000 most frequent non-stopwords in the dataset.
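The actual preprocessing is handled by the utils.TextRCV1 wrapper below. As a rough, hypothetical sketch of that kind of representation with plain scikit-learn (assuming a list of raw document strings named documents, which this notebook does not define), one could write:
In [ ]:
# Illustrative sketch only, not the utils.TextRCV1 pipeline used in this notebook.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
bow = vectorizer.fit_transform(documents)  # sparse (n_documents, 2000) word-count matrix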
In [ ]:
# Fetch dataset from Scikit-learn.
dataset = utils.TextRCV1(data_home=FLAGS.dir_data)
# Pre-processing: transform everything to a-z and whitespace.
#print(train.show_document(1)[:400])
#train.clean_text(num='substitute')
# Analyzing / tokenizing: transform documents to bags-of-words.
#stop_words = set(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
# Or stop words from NLTK.
# Add e.g. don, ve.
#train.vectorize(stop_words='english')
#print(train.show_document(1)[:400])
In [ ]:
# Selection of classes.
keep = ['C11','C12','C13','C14','C15','C16','C17','C18','C21','C22','C23','C24',
        'C31','C32','C33','C34','C41','C42','E11','E12','E13','E14','E21','E31',
        'E41','E51','E61','E71','G15','GCRIM','GDEF','GDIP','GDIS','GENT','GENV',
        'GFAS','GHEA','GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI',
        'GSPO','GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M14']
assert len(keep) == 55  # There are 55 second-level categories according to LYRL2004.
keep.remove('C15') # 151785 documents
keep.remove('GMIL') # 5 documents only
dataset.show_doc_per_class()
dataset.show_classes_per_doc()
dataset.remove_classes(keep)
dataset.show_doc_per_class(True)
dataset.show_classes_per_doc()
In [ ]:
# Remove documents with multiple classes.
dataset.select_documents()
dataset.data_info()
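The select_documents method of utils.TextRCV1 is not reproduced here. A hypothetical sketch of the single-label filtering it is expected to perform, assuming a sparse binary indicator matrix Y of shape (documents, classes) such as the target returned by sklearn.datasets.fetch_rcv1, and a matching data matrix X:
In [ ]:
# Sketch only: keep documents that belong to exactly one of the remaining classes.
import numpy as np
n_classes_per_doc = np.asarray(Y.sum(axis=1)).squeeze()
single = (n_classes_per_doc == 1)
X, Y = X[single], Y[single]
labels = np.asarray(Y.argmax(axis=1)).squeeze()  # dense integer class labels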
In [ ]:
# Remove short documents.
#train.data_info(True)
#wc = train.remove_short_documents(nwords=20, vocab='full')
#train.data_info()
#print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
#plt.figure(figsize=(17,5))
#plt.semilogy(wc, '.');
In [ ]:
# Feature selection.
# Other options include: mutual information or document count.
#freq = train.keep_top_words(1000, 20)
#train.data_info()
#train.show_document(1)
#plt.figure(figsize=(17,5))
#plt.semilogy(freq);
# Remove documents whose signal would be the zero vector.
#wc = train.remove_short_documents(nwords=5, vocab='selected')
#train.data_info(True)
In [ ]:
#dataset.normalize(norm='l1')
dataset.show_document(1);
In [ ]:
# Word embedding
#if True:
# train.embed()
#else:
# train.embed('data_word2vec/GoogleNews-vectors-negative300.bin')
#train.data_info()
# Further feature selection. (TODO)
In [ ]:
perm = np.random.RandomState(seed=42).permutation(dataset.data.shape[0])
Ntest = dataset.data.shape[0] // 2
perm_test = perm[:Ntest]
perm_train = perm[Ntest:]
train_data = dataset.data[perm_train,:].astype(np.float32)
test_data = dataset.data[perm_test,:].astype(np.float32)
train_labels = dataset.labels[perm_train]
test_labels = dataset.labels[perm_test]
if False:
    graph_data = train.embeddings.astype(np.float32)  # Word embeddings (requires the embedding step above).
else:
    graph_data = dataset.data.T.astype(np.float32)  # Transposed: graph vertices are words, documents are their features.
#del dataset
In [ ]:
t_start = time.process_time()
dist, idx = graph.distance_lshforest(graph_data.astype(np.float64), k=FLAGS.number_edges, metric=FLAGS.metric)
A = graph.adjacency(dist.astype(np.float32), idx)
print("{} > {} edges".format(A.nnz//2, FLAGS.number_edges*graph_data.shape[0]//2))
A = graph.replace_random_edges(A, 0)
graphs, perm = coarsening.coarsen(A, levels=FLAGS.coarsening_levels, self_connections=False)
L = [graph.laplacian(A, normalized=True) for A in graphs]
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
#graph.plot_spectrum(L)
#del graph_data, A, dist, idx
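Note that graph.distance_lshforest relies on scikit-learn's approximate LSHForest, which has been removed from recent scikit-learn releases. As a hedged alternative (an exact k-NN search, assuming the word-feature matrix fits in memory), sklearn.neighbors.NearestNeighbors yields (dist, idx) arrays of the same form:
In [ ]:
# Sketch: exact k-NN graph as a possible drop-in for the LSH approximation.
from sklearn.neighbors import NearestNeighbors
k = FLAGS.number_edges
nn = NearestNeighbors(n_neighbors=k+1, metric=FLAGS.metric, n_jobs=-1).fit(graph_data)
dist, idx = nn.kneighbors(graph_data)
dist, idx = dist[:, 1:], idx[:, 1:]  # drop the self-neighbour in the first column
A = graph.adjacency(dist.astype(np.float32), idx)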
In [ ]:
assert FLAGS.coarsening_levels == 0
#t_start = time.process_time()
#train_data = scipy.sparse.csr_matrix(coarsening.perm_data(train_data.toarray(), perm))
#test_data = scipy.sparse.csr_matrix(coarsening.perm_data(test_data.toarray(), perm))
#print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
#del perm
In [ ]:
# Training set is shuffled already.
#perm = np.random.permutation(train_data.shape[0])
#train_data = train_data[perm,:]
#train_labels = train_labels[perm]
# Validation set.
if False:
    val_data = train_data[:FLAGS.val_size,:]
    val_labels = train_labels[:FLAGS.val_size]
    train_data = train_data[FLAGS.val_size:,:]
    train_labels = train_labels[FLAGS.val_size:]
else:
    val_data = test_data
    val_labels = test_labels
In [ ]:
if False:
    utils.baseline(train_data, train_labels, test_data, test_labels)
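utils.baseline is not shown in this notebook. As an illustrative, hypothetical sanity check on the same features (not necessarily what utils.baseline runs), a multinomial logistic regression with scikit-learn could look like:
In [ ]:
# Sketch: simple linear baseline on the raw bag-of-words features.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000)
clf.fit(train_data, train_labels)
print('baseline test accuracy: {:.4f}'.format(clf.score(test_data, test_labels)))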
In [ ]:
common = {}
common['dir_name'] = 'rcv1/'
common['num_epochs'] = 4
common['batch_size'] = 100
common['decay_steps'] = len(train_labels) / common['batch_size']  # number of training batches per epoch
common['eval_frequency'] = 200
common['filter'] = 'chebyshev5'
common['brelu'] = 'b1relu'
common['pool'] = 'mpool1'
C = max(train_labels) + 1 # number of classes
model_perf = utils.model_perf()
In [ ]:
if True:
    name = 'softmax'
    params = common.copy()
    params['dir_name'] += name
    params['regularization'] = 0
    params['dropout'] = 1
    params['learning_rate'] = 1e3
    params['decay_rate'] = 0.95
    params['momentum'] = 0.9
    params['F'] = []
    params['K'] = []
    params['p'] = []
    params['M'] = [C]
    model_perf.test(models.cgcnn(L, **params), name, params,
                    train_data, train_labels, val_data, val_labels, test_data, test_labels)
In [ ]:
if True:
    name = 'fc_softmax'
    params = common.copy()
    params['dir_name'] += name
    params['regularization'] = 0
    params['dropout'] = 1
    params['learning_rate'] = 0.1
    params['decay_rate'] = 0.95
    params['momentum'] = 0.9
    params['F'] = []
    params['K'] = []
    params['p'] = []
    params['M'] = [2500, C]
    model_perf.test(models.cgcnn(L, **params), name, params,
                    train_data, train_labels, val_data, val_labels, test_data, test_labels)
In [ ]:
if True:
    name = 'fc_fc_softmax'
    params = common.copy()
    params['dir_name'] += name
    params['regularization'] = 0
    params['dropout'] = 1
    params['learning_rate'] = 0.1
    params['decay_rate'] = 0.95
    params['momentum'] = 0.9
    params['F'] = []
    params['K'] = []
    params['p'] = []
    params['M'] = [2500, 500, C]
    model_perf.test(models.cgcnn(L, **params), name, params,
                    train_data, train_labels, val_data, val_labels, test_data, test_labels)
In [ ]:
if True:
    name = 'cgconv_softmax'
    params = common.copy()
    params['dir_name'] += name
    params['regularization'] = 1e-3
    params['dropout'] = 1
    params['learning_rate'] = 0.1
    params['decay_rate'] = 0.999
    params['momentum'] = 0
    params['F'] = [1]
    params['K'] = [5]
    params['p'] = [1]
    params['M'] = [C]
    model_perf.test(models.cgcnn(L, **params), name, params,
                    train_data, train_labels, val_data, val_labels, test_data, test_labels)
In [ ]:
if True:
    name = 'cgconv_fc_softmax'
    params = common.copy()
    params['dir_name'] += name
    params['regularization'] = 0
    params['dropout'] = 1
    params['learning_rate'] = 0.1
    params['decay_rate'] = 0.999
    params['momentum'] = 0
    params['F'] = [5]
    params['K'] = [15]
    params['p'] = [1]
    params['M'] = [100, C]
    model_perf.test(models.cgcnn(L, **params), name, params,
                    train_data, train_labels, val_data, val_labels, test_data, test_labels)
In [ ]:
model_perf.show()