In [1]:
%matplotlib inline
%autosave 0
import sys, os
sys.path.insert(0, os.path.expanduser('~/work/tmp/20160915-bmlingam/bmlingam'))
from copy import deepcopy
import hashlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
from bmlingam import do_mcmc_bmlingam, InferParams, MCMCParams, save_pklz, load_pklz, define_hparam_searchspace, find_best_model
from bmlingam.utils.gendata import GenDataParams, gen_artificial_data
実験条件は以下の通りです.

人工データパラメータ

- n_samples: [100]
- data_noise_type: ['laplace', 'uniform']
- n_confs (or $Q$): [10]
- b21_dist: uniform(-1.5, 1.5)

推定ハイパーパラメータ

- L_cov_21s: [[-.9, -.7, -.5, -.3, 0, .3, .5, .7, .9]]
- model_noise_type: ['gg']
In [2]:
# Enumerate every combination of experimental settings (Cartesian product).
# Loop nesting order matches the original comprehension, so the resulting
# list order is identical.
conds = []
for totalnoise in [0.5, 1.0]:
    for L_cov_21s in [[-.9, -.7, -.5, -.3, 0, .3, .5, .7, .9]]:
        for n_samples in [100]:
            for n_confs in [10]:  # [1, 3, 5, 10]
                for data_noise_type in ['laplace', 'uniform']:
                    for model_noise_type in ['gg']:
                        for b21_dist in ['uniform(-1.5, 1.5)']:
                            conds.append({
                                'totalnoise': totalnoise,
                                'L_cov_21s': L_cov_21s,
                                'n_samples': n_samples,
                                'n_confs': n_confs,
                                'data_noise_type': data_noise_type,
                                'model_noise_type': model_noise_type,
                                'b21_dist': b21_dist
                            })
print('{} conditions'.format(len(conds)))
In [3]:
def gen_artificial_data_given_cond(ix_trial, cond):
    """Generate one artificial dataset for the given experimental condition.

    Starts from the module-level defaults (gen_data_params_default) and
    overwrites the condition-dependent fields.  The trial index is used as
    the random seed, making each trial reproducible.
    """
    n_confs = cond['n_confs']
    params = deepcopy(gen_data_params_default)
    params.n_samples = cond['n_samples']
    params.conf_dist = [['all'] for _ in range(n_confs)]
    params.e1_dist = [cond['data_noise_type']]
    params.e2_dist = [cond['data_noise_type']]
    params.b21_dist = cond['b21_dist']
    # Split the total confounder noise evenly (in variance) across confounders.
    per_conf_scale = cond['totalnoise'] / np.sqrt(n_confs)
    params.f1_coef = [per_conf_scale] * n_confs
    params.f2_coef = [per_conf_scale] * n_confs
    params.seed = ix_trial
    return gen_artificial_data(params)
# Baseline parameters for artificial-data generation.  The per-condition
# fields (n_samples, conf_dist, e1_dist/e2_dist, b21_dist, f1_coef/f2_coef,
# seed) are overwritten in gen_artificial_data_given_cond.
gen_data_params_default = GenDataParams(
    n_samples=100,
    b21_dist='r2intervals',
    mu1_dist='randn',
    mu2_dist='randn',
    f1_scale=1.0,
    f2_scale=1.0,
    f1_coef=['r2intervals', 'r2intervals', 'r2intervals'],
    f2_coef=['r2intervals', 'r2intervals', 'r2intervals'],
    conf_dist=[['all'], ['all'], ['all']],
    e1_std=3.0,
    e2_std=3.0,
    e1_dist=['laplace'],
    e2_dist=['laplace'],
    seed=0
)
実行例です.
In [4]:
# Example: generate one dataset under the first condition and scatter-plot
# the two observed variables against each other.
data = gen_artificial_data_given_cond(0, conds[0])
xs = data['xs']
plt.figure(figsize=(3, 3))
plt.scatter(xs[:, 0], xs[:, 1])
Out[4]:
In [5]:
# Example with many samples and a single confounder.  The keys 'L_cov_21s'
# and 'model_noise_type' are omitted here; gen_artificial_data_given_cond
# does not read them (it only reads n_confs, n_samples, data_noise_type,
# b21_dist and totalnoise).
data = gen_artificial_data_given_cond(0,
    {
        'totalnoise': 3 * np.sqrt(1),
        'n_samples': 10000,
        'n_confs': 1,
        'data_noise_type': 'laplace',
        'b21_dist': 'uniform(-1.5, 1.5)'
    }
)
xs = data['xs']
plt.figure(figsize=(3, 3))
plt.scatter(xs[:, 0], xs[:, 1])
Out[5]:
In [6]:
# Number of repeated trials per experimental condition (trial indices
# 0..99 also serve as random seeds in run_trial).
n_trials_per_cond = 100
In [7]:
# Causal-inference (model selection) parameters.
infer_params = InferParams(
    seed=0,
    standardize=True,
    subtract_mu_reg=False,
    fix_mu_zero=True,
    prior_var_mu='auto',
    prior_scale='uniform',
    max_c=1.0,
    n_mc_samples=10000,
    dist_noise='laplace',  # overwritten per condition in run_trial
    df_indvdl=8.0,
    prior_indvdls=['t'],
    cs=[0.4, 0.6, 0.8],
    scale_coeff=2. / 3.,
    L_cov_21s=[-0.8, -0.6, -0.4, 0.4, 0.6, 0.8],  # overwritten per condition in run_trial
    betas_indvdl=None, # [0.25, 0.5, 0.75, 1.],
    betas_noise=[0.25, 0.5, 1.0, 3.0],
    causalities=[[1, 2], [2, 1]],
    sampling_mode='cache_mp4'
)
# Regression-coefficient (posterior MCMC) estimation parameters.
mcmc_params = MCMCParams(
    n_burn=1000,
    n_mcmc_samples=1000
)
In [8]:
def make_id(ix_trial, n_samples, n_confs, data_noise_type, model_noise_type, L_cov_21s, totalnoise, b21_dist):
    """Return a deterministic MD5 hex digest identifying a (trial, condition) pair.

    Whitespace inside ``b21_dist`` is stripped before hashing, so spellings
    such as 'uniform(-1.5, 1.5)' and 'uniform(-1.5,1.5)' map to the same id.
    Used as the resume/skip key of the results dataframe.
    """
    L_cov_21s_ = ' '.join([str(v) for v in L_cov_21s])
    return hashlib.md5(
        str((L_cov_21s_, ix_trial, n_samples, n_confs, data_noise_type, model_noise_type, totalnoise, b21_dist.replace(' ', ''))).encode('utf-8')
    ).hexdigest()
# Test: the original demo passed the SAME string twice, so it never actually
# demonstrated whitespace insensitivity.  The second call now differs only
# in whitespace inside b21_dist; the two printed ids must be identical.
print(make_id(55, 100, 12, 'all', 'gg', [1, 2, 3], 0.3, 'uniform(-1.5, 1.5)'))
print(make_id(55, 100, 12, 'all', 'gg', [1, 2, 3], 0.3, 'uniform(-1.5,1.5)')) # whitespace is ignored
In [9]:
def add_result_to_df(df, result):
    """Append one trial result (a dict of scalars) as a row to the dataframe.

    If df is None, a new single-row DataFrame is created.  Uses pd.concat
    because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    """
    row = pd.DataFrame({k: [v] for k, v in result.items()})
    if df is None:
        return row
    return pd.concat([df, row], ignore_index=True)
# Test: first call creates the dataframe, second call appends a row.
result1 = {'col1': 10, 'col2': 20}
result2 = {'col1': 30, 'col2': -10}
df1 = add_result_to_df(None, result1)
print('--- df1 ---')
print(df1)
df2 = add_result_to_df(df1, result2)
print('--- df2 ---')
print(df2)
In [10]:
def df_exist_result_id(df, result_id):
    """Return True if result_id already exists in df's 'result_id' column.

    Bug fix: the original had a bare ``False`` expression (no ``return``) in
    the else branch, so it implicitly returned None for df is None.  That
    happened to be falsy, but the contract is now explicit.
    """
    if df is None:
        return False
    return result_id in np.array(df['result_id'])
In [11]:
def load_df(df_file):
    """Load the pickled results DataFrame, or return None if the file is absent."""
    if not os.path.exists(df_file):
        return None
    return load_pklz(df_file)
def save_df(df_file, df):
    """Persist the results DataFrame to df_file via bmlingam's save_pklz."""
    save_pklz(df_file, df)
In [12]:
def _estimate_hparams(xs, infer_params):
    """Search the hyperparameter space and return the best model for xs.

    Returns:
        hparams_best: hyperparameter set of the best model (results[0]).
        bf: Bayes factor computed as results[2] - results[5]
            (presumably best-model minus reverse-model log marginal
            likelihood — TODO confirm against find_best_model's return
            layout in bmlingam).
    """
    assert(type(infer_params) == InferParams)
    sampling_mode = infer_params.sampling_mode
    hparamss = define_hparam_searchspace(infer_params)
    results = find_best_model(xs, hparamss, sampling_mode)
    hparams_best = results[0]
    bf = results[2] - results[5] # Bayes factor
    return hparams_best, bf
def run_trial(ix_trial, cond):
    """Run one trial: generate data, infer causal direction, estimate the
    regression coefficient, and return everything as a flat result dict.

    ix_trial doubles as the random seed of the artificial data.
    NOTE: mutates the module-level infer_params in place with the
    condition-specific L_cov_21s and dist_noise before inference.
    """
    # Generate artificial data for this (seed, condition) pair.
    data = gen_artificial_data_given_cond(ix_trial, cond)
    b_true = data['b']
    causality_true = data['causality_true']
    # Causal inference (model selection), timed.
    t = time.time()
    infer_params.L_cov_21s = cond['L_cov_21s']
    infer_params.dist_noise = cond['model_noise_type']
    hparams, bf = _estimate_hparams(data['xs'], infer_params)
    causality_est = hparams['causality']
    time_causal_inference = time.time() - t
    # Posterior inference of the regression coefficient, timed.
    t = time.time()
    trace = do_mcmc_bmlingam(data['xs'], hparams, mcmc_params)
    b_post = np.mean(trace['b'])
    time_posterior_inference = time.time() - t
    return {
        'causality_true': causality_true,
        'regcoef_true': b_true,
        'n_samples': cond['n_samples'],
        'n_confs': cond['n_confs'],
        'data_noise_type': cond['data_noise_type'],
        'model_noise_type': cond['model_noise_type'],
        'causality_est': causality_est,
        'correct_rate': (1.0 if causality_est == causality_true else 0.0),
        'error_reg_coef': np.abs(b_post - b_true),
        'regcoef_est': b_post,
        'log_bf': 2 * bf, # 2*log(p(M)/p(M_rev)); claimed always positive since M is the selected (best) model — TODO confirm sign convention of bf
        'time_causal_inference': time_causal_inference,
        'time_posterior_inference': time_posterior_inference,
        'L_cov_21s': str(cond['L_cov_21s']),
        'n_mc_samples': infer_params.n_mc_samples,
        'confs_absmean': np.mean(np.abs(data['confs'].ravel())),
        'totalnoise': cond['totalnoise']
    }
In [13]:
def run_expr(conds):
    """Run n_trials_per_cond trials for every condition.

    Results are checkpointed to a pickled DataFrame after every trial, and
    trials whose result_id is already present are skipped — so the
    experiment resumes cleanly after an interruption.
    """
    # Results dataframe file name.
    data_dir = '.'
    df_file = data_dir + '/20160902-eval-bml-results.pklz'
    # If the file exists, continue from the previous run.
    df = load_df(df_file)
    # Loop over experimental conditions.
    n_skip = 0
    for cond in conds:
        print(cond)
        # Loop over trials.
        for ix_trial in range(n_trials_per_cond):
            # Unique identifier of this (trial, condition) pair.
            result_id = make_id(ix_trial, **cond)
            # Check whether this result is already stored in the dataframe.
            if df_exist_result_id(df, result_id):
                n_skip += 1
            else:
                # result is a dict holding this trial's outcomes.
                # The trial index ix_trial is used as the random seed.
                result = run_trial(ix_trial, cond)
                result.update({'result_id': result_id})
                df = add_result_to_df(df, result)
                save_df(df_file, df)  # checkpoint after every trial
    print('Number of skipped trials = {}'.format(n_skip))
    return df
In [14]:
# Run the full experiment; safe to re-run, completed trials are skipped.
df = run_expr(conds)
In [15]:
# (The duplicate `import pandas as pd` that used to live here was removed:
# pandas is already imported in the notebook's top import cell.)
# Results dataframe file name.
data_dir = '.'
df_file = data_dir + '/20160902-eval-bml-results.pklz'
df = load_pklz(df_file)
# Per-condition summary: mean correct rate, number of completed trials,
# and mean causal-inference time.
grouped = df.groupby(['model_noise_type', 'data_noise_type', 'n_confs', 'totalnoise'])
pd.concat(
    {
        'correct_rate': grouped['correct_rate'].mean(),
        'count': grouped['correct_rate'].count(),
        'time': grouped['time_causal_inference'].mean(),
    }, axis=1
)
Out[15]:
In [16]:
# |b_true| versus 2*log(Bayes factor), split by inference success.
vals = np.array(df[['regcoef_true', 'log_bf', 'totalnoise', 'correct_rate']])
success = vals[:, 3] == 1.0
failure = vals[:, 3] == 0.0
plt.scatter(vals[success, 1], np.abs(vals[success, 0]), marker='o', s=20, c='r', label='Success')
plt.scatter(vals[failure, 1], np.abs(vals[failure, 0]), marker='*', s=70, c='b', label='Failure')
plt.ylabel('|b|')
plt.xlabel('2 * log(bayes_factor)')
plt.legend(fontsize=15, loc=4, shadow=True, frameon=True, framealpha=1.0)
Out[16]:
In [17]:
# True versus posterior-mean regression coefficients, colored by whether
# the causal direction was inferred correctly.
pts = np.array(df[['regcoef_true', 'regcoef_est', 'correct_rate']])
correct = np.where(pts[:, 2] == 1)[0]
incorrect = np.where(pts[:, 2] == 0)[0]
# correct_rate is always 0.0 or 1.0, so the two index sets partition the rows.
assert(len(correct) + len(incorrect) == len(pts))
plt.figure(figsize=(5, 5))
plt.scatter(pts[correct, 0], pts[correct, 1], c='r', label='Correct')
plt.scatter(pts[incorrect, 0], pts[incorrect, 1], c='b', label='Incorrect')
plt.plot([-3, 3], [-3, 3])  # y = x reference line
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.gca().set_aspect('equal')
plt.xlabel('Reg coef (true)')
plt.ylabel('Reg coef (posterior mean)')
Out[17]:
In [18]:
# Scatter of true vs. posterior-mean regression coefficients, restricted to
# correctly-inferred trials.
# NOTE(review): `data` and `ixs2` computed here are reused by the NEXT cell
# (hidden cross-cell state) — do not rename or remove them independently.
data = np.array(df[['regcoef_true', 'regcoef_est', 'correct_rate']])
ixs1 = np.where(data[:, 2] == 1)[0]
ixs2 = np.where(data[:, 2] == 0)[0]
# correct_rate is always 0.0 or 1.0, so the two index sets partition the rows.
assert(len(ixs1) + len(ixs2) == len(data))
plt.figure(figsize=(5, 5))
plt.scatter(data[ixs1, 0], data[ixs1, 1], c='r', label='Correct')
plt.plot([-3, 3], [-3, 3])  # y = x reference line
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.gca().set_aspect('equal')
plt.xlabel('Reg coef (true)')
plt.ylabel('Reg coef (posterior mean)')
plt.title('Correct inference')
plt.savefig('20160905-eval-bml-correct.eps')
In [19]:
# Companion plot: incorrectly-inferred trials only.
# NOTE(review): relies on `data` and `ixs2` computed in the previous cell
# (hidden cross-cell state) — this cell fails on a fresh kernel if run alone.
plt.figure(figsize=(5, 5))
plt.scatter(data[ixs2, 0], data[ixs2, 1], c='b', label='Incorrect')
plt.plot([-3, 3], [-3, 3])  # y = x reference line
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.gca().set_aspect('equal')
plt.xlabel('Reg coef (true)')
plt.ylabel('Reg coef (posterior mean)')
plt.title('Incorrect inference')
plt.savefig('20160905-eval-bml-incorrect.eps')