In [1]:
from __future__ import print_function, division
import os
import sys
import timeit
from six.moves import cPickle as pickle

import numpy as np
import pandas as pd

import theano
import theano.tensor as T

from lib.deeplearning import autoencoder

# os.chdir() does not expand '~' on its own: without expanduser the literal
# path '~/Codes/...' would be resolved relative to the current directory.
os.chdir(os.path.expanduser('~/Codes/DL - Topic Modelling'))


Using gpu device 0: Tesla K40c (CNMeM is disabled, cuDNN 5105)
/home/ekhongl/.conda/envs/py3/lib/python3.5/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

In [2]:
# Read the numeric body of the CSV (header row skipped) as float32.
dtm_path = 'data/dtm_2000_20news.csv'
dtm_raw = np.genfromtxt(dtm_path, dtype='float32', delimiter=',', skip_header=1)

# Column 0 is kept separately as dat_y (it is written out later as '_label_');
# the remaining columns form the model input.
dat_y, dat_x = dtm_raw[:, 0], dtm_raw[:, 1:]

# Re-read only the header row as strings and drop its first entry so that
# vocab lines up with the columns of dat_x.
header_row = np.genfromtxt(dtm_path, dtype=str, delimiter=',', max_rows=1)
vocab = header_row[1:]

# Wrap the input matrix in a Theano shared variable for the model calls below.
test_input = theano.shared(dat_x)

Loading the weights pretrained by the Deep Belief Net (DBN) into the Autoencoder


In [4]:
# Build the autoencoder and initialise it from the DBN pre-training parameters.
# NOTE(review): opt_epochs presumably selects per-layer pre-training checkpoints — confirm in lib.deeplearning.
layer_sizes = [2000, 500, 500, 128]
model = autoencoder(
    architecture=layer_sizes,
    opt_epochs=[110, 15, 10],
    model_src='params_2000/dbn_params_pretrain'
)


Building layer: 0
   Input units: 2000
  Output units: 500
Building layer: 1
   Input units: 500
  Output units: 500
Building layer: 2
   Input units: 500
  Output units: 128

Training the Autoencoder


In [5]:
# Fine-tune the autoencoder on test_input.
# NOTE(review): this run writes to 'params_2000/ae_train', while the later
# loading cell reads 'params_2000/ae_train_nonoise' — confirm which run is intended.
train_options = dict(
    batch_size=200,
    epochs=110,
    add_noise=16,
    output_path='params_2000/ae_train',
)
model.train(test_input, **train_options)


... getting the finetuning functions
... finetuning the model
Saving model...
...model saved
Training epoch 0, cost  7.79978342056
Saving model...
...model saved
Training epoch 100, cost  7.48429107666
Saving model...
...model saved
Training epoch 109, cost  7.46735124588
Training ran for 0.29m

Loading the trained Auto-Encoder


In [3]:
# Rebuild the network and load already-trained autoencoder parameters.
# The architecture given here has to match the one the parameters were saved with.
layer_sizes = [2000, 500, 500, 128]
model = autoencoder(architecture=layer_sizes,
                    model_src='params_2000/ae_train_nonoise',
                    param_type='ae')


Building layer: 0
   Input units: 2000
  Output units: 500
Building layer: 1
   Input units: 500
  Output units: 500
Building layer: 2
   Input units: 500
  Output units: 128
Loading the trained auto-encoder parameters.
...please ensure that the auto-encoder params matches the defined architecture.

Extracting features from the trained Auto-Encoder


In [4]:
# Run test_input through the loaded model; the result is saved below as the extracted features.
# NOTE(review): presumably one row per document with 128 columns (the last layer size) — confirm in lib.deeplearning.
output = model.score(test_input)

Saving the extracted features


In [27]:
# Put the labels in the first column, followed by the extracted features.
features = np.c_[dat_y, output]

# Derive the number of feature columns from the data instead of hard-coding
# 128, so the cell keeps working when the size of the last layer changes.
n_bits = features.shape[1] - 1
colnames = ['_label_'] + ['bit' + str(i) for i in range(n_bits)]

# Write labels + features to CSV without the DataFrame index.
(
    pd.DataFrame(data=features, columns=colnames)
    .to_csv('data/ae_features_2000_nonoise.csv', index=False)
)

Visualizing the convergence behavior


In [24]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt_dat = np.genfromtxt('params_2000/ae_train_nonoise/cost_profile.csv', delimiter=',', names = True)
plt.plot(plt_dat)
plt.show()