Run `python rcv2.py` in the `optvaedatasets` folder to set up the dataset.
In [1]:
# Imports. Put the repo root on sys.path *before* any project-local import,
# so utils/, optvaedatasets/ etc. resolve on a fresh kernel regardless of CWD
# (the original appended to sys.path only after importing utils.misc).
import sys, os, glob
sys.path.append('../')

from collections import OrderedDict
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# Project-local imports (resolved via the sys.path entry above)
from utils.misc import readPickle, createIfAbsent
from optvaedatasets.load import loadDataset as loadDataset_OVAE
In [2]:
default_params = readPickle('../optvaeutils/default_settings.pkl')[0]
for k in default_params:
print '(',k,default_params[k],')',
print
For the moment, we will leave everything as is. Some worthwhile parameters to note:
n_steps: Number of steps of optimizing $\psi(x)$, the local variational parameters as output by the inference network. We'll set this to 5 below for the moment.
dim_stochastic: Number of latent dimensions.
In [3]:
# Configure this run: optimize the variational parameters ('finopt'; use
# 'none' to disable), cap the optimization at 5 steps, and point
# checkpointing at a temporary directory.
run_settings = {
    'opt_type': 'finopt',  # set to finopt to optimize var. params, none otherwise
    'n_steps': 5,
    'savedir': './tmp',    # temporary directory where checkpoints are saved
}
default_params.update(run_settings)
Visualize how the dataset `dict` is structured.
In [4]:
dset = loadDataset_OVAE('rcv2')
#Visualize structure of dataset dict
for k in dset:
print k, type(dset[k]),
if hasattr(dset[k],'shape'):
print dset[k].shape
elif type(dset[k]) is not list:
print dset[k]
else:
print
# Copy dataset-derived settings into the model configuration.
for key in ('dim_observations', 'data_type'):
    default_params[key] = dset[key]
default_params['max_word_count'] = dset['train'].max()

# Precompute the IDF vector from the training data; it is handed to the
# model via additional_attrs.
additional_attrs = {}
transformer = TfidfTransformer(norm=None)
transformer.fit(dset['train'])
additional_attrs['idf'] = transformer.idf_
In [5]:
from optvaemodels.vae import VAE as Model
import optvaemodels.vae_learn as Learn
import optvaemodels.vae_evaluate as Evaluate
In [6]:
default_params['savedir']+='-rcv2-'+default_params['opt_type']
createIfAbsent(default_params['savedir'])
pfile= default_params['savedir']+'/'+default_params['unique_id']+'-config.pkl'
print 'Training model from scratch. Parameters in: ',pfile
model = Model(default_params, paramFile = pfile, additional_attrs = additional_attrs)
Train the model (cf. `train.py`).
In [ ]:
# Run training for a few epochs, periodically checkpointing under the run
# directory; the filename prefix encodes the unique run id.
checkpoint_prefix = os.path.join(default_params['savedir'], default_params['unique_id'])
savedata = Learn.learn(model,
                       dataset=dset['train'],
                       dataset_eval=dset['valid'],
                       epoch_start=0,
                       epoch_end=3,  # epochs -- set w/ default_params['epochs']
                       batch_size=default_params['batch_size'],
                       savefreq=default_params['savefreq'],  # frequency of saving
                       savefile=checkpoint_prefix)
In [ ]:
for k in savedata:
print k, type(savedata[k]), savedata[k].shape