Callables in Research

The main purpose of Research is to run pipelines with different configs in parallel, but you can also add callables and build very flexible experiment plans even without pipelines.


In [1]:
import sys
import os
import shutil

import warnings
warnings.filterwarnings('ignore')

from tensorflow import logging
logging.set_verbosity(logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import matplotlib
%matplotlib inline

import numpy as np

In [2]:
sys.path.append('../../..')

from batchflow import Pipeline, B, C, V, D, L
from batchflow.opensets import MNIST
from batchflow.models.tf import VGG7, VGG16
from batchflow.research import Research, Option, Results, RP, RR, RD, REP, RID, RI

In [3]:
def clear_previous_results(res_name):
    if os.path.exists(res_name):
        shutil.rmtree(res_name)

Simple example

To add your callable to Research, use the add_callable method:


In [4]:
res_name = 'sample_callable_research'
clear_previous_results(res_name)

def randn_std():
    return np.random.randn()

research = Research().add_callable(randn_std, returns='random', name='randn_std')

research.run(5, name=res_name)

research.load_results().df


Research sample_callable_research is starting...
Out[4]:
name random iteration sample_index repetition update
0 randn_std 0.048672 0 1523285757 0 0
1 randn_std -1.523364 1 1523285757 0 0
2 randn_std 1.198689 2 1523285757 0 0
3 randn_std -0.700135 3 1523285757 0 0
4 randn_std -0.415482 4 1523285757 0 0

You can also pass args and kwargs to your callables: just add them to add_callable.


In [5]:
clear_previous_results(res_name)

def randn(mean=0, std=1):
    return np.random.randn() * std + mean

research = Research().add_callable(randn, mean=2, std=5, returns='random', name='randn')

research.run(5, name=res_name)

research.load_results().df


Research sample_callable_research is starting...
Out[5]:
name random iteration sample_index repetition update
0 randn -8.773935 0 3261914171 0 0
1 randn 2.617877 1 3261914171 0 0
2 randn 5.094172 2 3261914171 0 0
3 randn 4.470154 3 3261914171 0 0
4 randn 4.431135 4 3261914171 0 0

Named Expressions

Obviously, such usage of args and kwargs is not very useful on its own, since it can be achieved with partial, but you can also use named expressions to substitute objects that depend on the research itself into your functions. For example, you can access the current results of the research through the RR named expression, which corresponds to Results(path=res_name).
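
For comparison, the plain args and kwargs from the previous example could indeed be replaced by functools.partial (a minimal sketch, not an executed cell of this notebook):

from functools import partial

randn_2_5 = partial(randn, mean=2, std=5)
research = Research().add_callable(randn_2_5, returns='random', name='randn')

Named expressions, by contrast, are evaluated inside the running research, so they can refer to objects that exist only at execution time, such as pipelines, results or paths.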


In [6]:
res_name = 'max_research'

clear_previous_results(res_name)

def stat(results):
    return results.random.min(), results.random.max()

research = (Research()
    .add_callable(randn, mean=2, std=5, returns='random', name='randn', dump=1)
    .add_callable(stat, results=RR().df, returns=['min_value', 'max_value'], name='stat')
)

research.run(5, name=res_name)

research.load_results().df


Research max_research is starting...
Out[6]:
name random min_value max_value iteration sample_index repetition update
0 randn 7.993444 NaN NaN 0 2441034918 0 0
1 randn -1.500677 NaN NaN 1 2441034918 0 0
2 randn -0.077410 NaN NaN 2 2441034918 0 0
3 randn 4.242234 NaN NaN 3 2441034918 0 0
4 randn 0.794869 NaN NaN 4 2441034918 0 0
5 stat NaN 7.993444 7.993444 0 2441034918 0 0
6 stat NaN -1.500677 7.993444 1 2441034918 0 0
7 stat NaN -1.500677 7.993444 2 2441034918 0 0
8 stat NaN -1.500677 7.993444 3 2441034918 0 0
9 stat NaN -1.500677 7.993444 4 2441034918 0 0
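
Since RR() corresponds to Results(path=res_name), the same statistics can be recomputed after the run by loading the dumped results directly (a sketch assuming the default results layout; the names argument filters results by unit name, as in the RR call used later in this tutorial):

df = Results(path=res_name, names='randn').df
print(df.random.min(), df.random.max())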

Save only the best model

One can use callables to save only the best (in some sense) model, for example, the model with the highest accuracy on the test set.

First, define the pipelines as usual:


In [7]:
BATCH_SIZE = 64
mnist = MNIST()
domain = Option('layout', ['cna', 'can']) * Option('bias', [True, False])

model_config = {
    'inputs/images/shape': B('image_shape'),
    'inputs/labels/classes': 10,
    'inputs/labels/name': 'targets',
    'initial_block/inputs': 'images',
    'body/block/layout': C('layout'),    # substituted from the 'layout' option of the domain
    'common/conv/use_bias': C('bias'),   # substituted from the 'bias' option of the domain
}

In [8]:
train_ppl = (Pipeline()
            .init_variable('loss')
            .init_model('dynamic', VGG7, 'conv', config=model_config)
            .to_array()
            .train_model('conv', 
                         images=B('images'), labels=B('labels'),
                         fetches='loss', save_to=V('loss', mode='w'))           
)

train_root = mnist.train.p.run_later(BATCH_SIZE, shuffle=True, n_epochs=None)

In [9]:
test_ppl = (Pipeline()
                 .init_variable('predictions')
                 .init_variable('metrics')
                 .import_model('conv', C('import_from'))
                 .to_array()
                 .predict_model('conv', 
                                images=B('images'), labels=B('labels'),
                                fetches='predictions', save_to=V('predictions'))
                 .gather_metrics('class', targets=B('labels'), predictions=V('predictions'), 
                                fmt='logits', axis=-1, save_to=V('metrics', mode='a'))
)

test_root = mnist.test.p.run_later(BATCH_SIZE, shuffle=True, n_epochs=1)  # note n_epochs=1

Now define a callable that will receive the train pipeline with the model, the results of the current experiment, the path to the folder with the experiment results, and the current iteration of the research.


In [10]:
import glob
import shutil

def save_model(ppl, results, path, iteration):
    # row of the results with the highest test accuracy so far
    best_row = results.iloc[results.accuracy.idxmax()]
    if best_row.iteration == iteration:
        # the current iteration is the best one: drop previously saved models...
        for item in glob.glob(glob.escape(path) + '/model_*'):
            shutil.rmtree(item)
        # ...and save the current model instead
        model_path = os.path.join(path, 'model_{}'.format(iteration))
        ppl.get_model_by_name("conv").save(model_path)
    return path

In [11]:
res_name = 'save_model_research'

clear_previous_results(res_name)

To define the parameter values we will use named expressions. The args and kwargs of RR will be passed to the Results initialization.


In [12]:
EXECUTE_EACH = 10

research = (Research()
    .init_domain(domain)
    .add_pipeline(train_root, train_ppl, variables='loss', name='train_ppl')
    .add_pipeline(test_root, test_ppl, variables='metrics', run=True, name='test_ppl',
                  import_from=RP('train_ppl'),
                  execute=[EXECUTE_EACH, 'last'], dump=[EXECUTE_EACH, 'last'])
    .get_metrics(pipeline='test_ppl', metrics_var='metrics', metrics_name='accuracy',
                 returns='accuracy',
                 execute=[EXECUTE_EACH, 'last'], dump=[EXECUTE_EACH, 'last'])
    .add_callable(save_model, returns='model_path', execute=[EXECUTE_EACH, 'last'],
                  ppl=RP('train_ppl'),                                           # train pipeline with the model
                  results=RR(sample_index=RID(), names='test_ppl_metrics').df,   # results of the current experiment
                  path=L(os.path.join)(RD(), REP()),                             # folder with the experiment results
                  iteration=RI())                                                # current iteration of the research
)

research.run(300, branches=4, name=res_name, bar=True)


Research save_model_research is starting...
Domain updated: 0: 100%|██████████| 300/300.0 [13:10<00:00,  2.64s/it]
Out[12]:
<batchflow.research.research.Research at 0x7f897077dba8>

Let's check that only the best model for each config has been saved.


In [13]:
results = research.load_results(concat_config=True).df

List of the saved models:


In [14]:
glob.glob(os.path.join(res_name, 'results', '*', '*', 'model*'))


Out[14]:
[]

Iterations for each config with the best test accuracy:


In [15]:
results.groupby('config').apply(lambda x: x.loc[x.accuracy.idxmax()])[['config', 'accuracy', 'iteration']]


Out[15]:
config accuracy iteration
config
bias_False-layout_can bias_False-layout_can 0.968833 299
bias_False-layout_cna bias_False-layout_cna 0.970084 299
bias_True-layout_can bias_True-layout_can 0.970641 299
bias_True-layout_cna bias_True-layout_cna 0.967897 299