In [1]:
%matplotlib inline
%autosave 0
import sys, os
sys.path.insert(0, os.path.expanduser('~/work/git/github/taku-y/bmlingam'))
sys.path.insert(0, os.path.expanduser('~/work/git/github/pymc-devs/pymc3'))
import theano
theano.config.floatX = 'float64'
from copy import deepcopy
import hashlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
from expr1 import run_trial
from bmlingam import load_pklz, save_pklz
# from bmlingam import do_mcmc_bmlingam, InferParams, MCMCParams, save_pklz, load_pklz, define_hparam_searchspace, find_best_model
# from bmlingam.utils.gendata import GenDataParams, gen_artificial_data
The experimental conditions are as follows.
Parameters of artificial data
- `n_samples`: [100]
- `n_confs` (or $Q$): [1, 3, 5, 10]
- `data_noise_type`: ['laplace', 'uniform']

Hyperparameter range
- `L_cov_21s`: [[-.9, -.7, -.5, -.3, 0, .3, .5, .7, .9]]
- `model_noise_type`: ['gg']

In addition, the code below sweeps `totalnoise` over [0.25, 0.5, 1.0, 3.0].
In [2]:
# Full factorial grid of experimental conditions: one dict per combination
# of the value lists in the `for` clauses. The clause order fixes the order
# of `conds` (totalnoise varies slowest, model_noise_type fastest).
conds = [
    dict(
        totalnoise=totalnoise,
        L_cov_21s=L_cov_21s,
        n_samples=n_samples,
        n_confs=n_confs,
        data_noise_type=data_noise_type,
        model_noise_type=model_noise_type,
    )
    for totalnoise in [0.25, 0.5, 1.0, 3.0]
    for L_cov_21s in [[-.9, -.7, -.5, -.3, 0, .3, .5, .7, .9]]
    for n_samples in [100]
    for n_confs in [1, 3, 5, 10]
    for data_noise_type in ['laplace', 'uniform']
    for model_noise_type in ['gg']
]
The identifier of a trial is determined by:
- `ix_trial`
- `n_samples`
- `n_confs`
- `data_noise_type`
- `model_noise_type`
- `L_cov_21s`
- `totalnoise`

We use identifiers to store results of trials.
In [3]:
def make_id(ix_trial, n_samples, n_confs, data_noise_type, model_noise_type, L_cov_21s, totalnoise):
    """Return a hexadecimal MD5 digest that identifies one trial.

    The digest is taken over the UTF-8 encoding of ``str()`` applied to a
    tuple made of the space-joined ``L_cov_21s`` values followed by the
    remaining arguments, so equal arguments always give the same identifier.
    """
    cov_text = ' '.join(map(str, L_cov_21s))
    key = (cov_text, ix_trial, n_samples, n_confs,
           data_noise_type, model_noise_type, totalnoise)
    digest = hashlib.md5(str(key).encode('utf-8'))
    return digest.hexdigest()


# Test
print(make_id(55, 100, 12, 'all', 'gg', [1, 2, 3], 0.3))
In [4]:
def add_result_to_df(df, result):
    """Return a results frame with `result` appended as a new last row.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Results accumulated so far, or None when nothing is stored yet.
    result : dict
        Mapping from column name to the value for the new row.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `df` itself is not modified.
    """
    # Wrap every value in a one-element list so each key becomes a column
    # holding exactly one row, even when the value is itself a sequence.
    row = pd.DataFrame({k: [v] for k, v in result.items()})
    if df is None:
        return row
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to add rows and works on old versions too.
    return pd.concat([df, row], ignore_index=True)


# Test
result1 = {'col1': 10, 'col2': 20}
result2 = {'col1': 30, 'col2': -10}
df1 = add_result_to_df(None, result1)
print('--- df1 ---')
print(df1)
df2 = add_result_to_df(df1, result2)
print('--- df2 ---')
print(df2)
assert len(df1) == 1 and len(df2) == 2
In [5]:
def load_df(df_file):
    """Return what `load_pklz` reads from `df_file`, or None if the path does not exist."""
    if not os.path.exists(df_file):
        return None
    return load_pklz(df_file)
def save_df(df_file, df):
    """Write `df` to `df_file` by delegating to `save_pklz`.

    Note the argument order: the file path comes first, the object second.
    """
    save_pklz(df_file, df)
In [6]:
def df_exist_result_id(df, result_id):
    """Tell whether `result_id` is already stored in the results frame.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Results frame with a 'result_id' column, or None when no results
        have been stored yet.
    result_id : str
        Trial identifier to look for.

    Returns
    -------
    bool
        True if some row of `df` has this identifier, False otherwise
        (including when `df` is None).
    """
    if df is None:
        # Nothing stored yet. The previous version had a bare `False`
        # expression here with no `return`, so it yielded None.
        return False
    # bool() turns the numpy.bool_ membership result into a plain bool.
    return bool(result_id in np.array(df['result_id']))
def run_expr(conds, n_trials_per_cond=50, df_file='./20160822-eval-bml-results.pklz'):
    """Perform evaluation of BMLiNGAM given a set of experimental conditions.

    For each condition, several trials are executed.
    In a trial, BMLiNGAM is applied to causal inference for artificial data.
    Results are accumulated in a data frame that is stored in `df_file`, so
    a later call resumes where a previous one stopped.

    Parameters
    ----------
    conds : list of dict
        Experimental conditions. Each dict is unpacked as keyword arguments
        to `make_id` and passed as-is to `run_trial`.
    n_trials_per_cond : int
        Number of trials per condition; trial indices run from 0 upward.
    df_file : str
        Path of the results file. The default is the path this function
        previously had hard-coded.

    Returns
    -------
    pandas.DataFrame or None
        All results, both previously stored and newly computed. None when
        nothing was stored and no trial was run.
    """
    # Load results computed in previous runs (None if the file is absent).
    df = load_df(df_file)

    # Loop over experimental conditions
    n_skip = 0
    for cond in conds:
        print(cond)

        # Loop over trials
        for ix_trial in range(n_trials_per_cond):
            # Identifier of a trial for (cond, ix_trial)
            result_id = make_id(ix_trial, **cond)

            # Skip trials whose result is already stored in the data frame.
            if df_exist_result_id(df, result_id):
                n_skip += 1
                continue

            # `result` is a dict including results of the trial.
            # `ix_trial` is used as the random seed of the corresponding trial.
            result = run_trial(ix_trial, cond)
            result['result_id'] = result_id
            df = add_result_to_df(df, result)

            # Checkpoint after every new trial so an interrupted run loses
            # at most the trial in progress.
            # NOTE(review): confirm this is the intended save granularity.
            save_df(df_file, df)

    print('Number of skipped trials = {}'.format(n_skip))
    return df
In [7]:
# Run all conditions with the default number of trials per condition.
# Trials whose identifier is already in the stored results are skipped.
df = run_expr(conds)
In [8]:
import pandas as pd
# Load the stored trial results.
df_file = './20160822-eval-bml-results.pklz'
df = load_pklz(df_file)

# Keep only the columns needed for the summary, under display names.
# NOTE(review): 'log_bf' is relabelled '2log(bf)' without rescaling --
# confirm that run_trial already stores twice the log Bayes factor.
df = pd.concat(
    {
        shown: df[stored]
        for shown, stored in [
            ('2log(bf)', 'log_bf'),
            ('correct rate', 'correct_rate'),
            ('totalnoise', 'totalnoise'),
            ('data noise type', 'data_noise_type'),
            ('n_confs', 'n_confs'),
        ]
    },
    axis=1,
)

# Mean correct rate and mean 2log(bf) per (noise type, n_confs, totalnoise).
sg = df.groupby(['data noise type', 'n_confs', 'totalnoise'])
sg1 = sg['correct rate'].mean()
sg2 = sg['2log(bf)'].mean()
pd.concat({'correct_rate': sg1, '2log(bf)': sg2}, axis=1)
Out[8]:
In [9]:
import pandas as pd
def count(x):
    """Return the sum of `x` after casting its elements to int."""
    # Not referenced by the pivot table below, which aggregates with np.sum.
    return np.sum(x.astype(int))


# Load the stored trial results.
data_dir = '.'
df_file = data_dir + '/20160822-eval-bml-results.pklz'
df = load_pklz(df_file)

# Keep the columns needed for the pivot, under display names. 'count' is a
# second copy of 'correct_rate' so that it can be summed instead of averaged.
df = pd.concat(
    {
        shown: df[stored]
        for shown, stored in [
            ('2log(bf)', 'log_bf'),
            ('correct rate', 'correct_rate'),
            ('count', 'correct_rate'),
            ('totalnoise', 'totalnoise'),
            ('data noise type', 'data_noise_type'),
        ]
    },
    axis=1,
)

# Bin 2log(bf) into (0, 2], (2, 6], (6, 10], (10, 100]; values outside
# these intervals get no bin.
bf_bins = pd.cut(df['2log(bf)'], [0., 2., 6., 10., 100.])
df = df.pivot_table(
    values=['correct rate', 'count'],
    index=['totalnoise', bf_bins],
    columns='data noise type',
    aggfunc={'correct rate': np.mean, 'count': np.sum},
)
df
Out[9]:
In [ ]: