In [1]:
import numpy
import re
import os
import matplotlib.pyplot as pyplot
%matplotlib inline
from collections import namedtuple
from collections import defaultdict
# Field groups shared by the two record layouts below.
# Train/test variants of the four log-likelihood series (X_Y, X, Y, Y_X).
_LL_FIELDS = ['train_X_Y', 'train_X', 'train_Y', 'train_Y_X',
              'test_X_Y', 'test_X', 'test_Y', 'test_Y_X']
# Score series: accuracy / hamming / exact match, for "mpe" and "marg".
_SCORE_FIELDS = ['acc_mpe', 'ham_mpe', 'ext_mpe',
                 'acc_marg', 'ham_marg', 'ext_marg']
# Edge-count series.
_EDGE_FIELDS = ['x_y_edges', 'y_y_edges', 'x_x_edges']

# Older layout: a single set of scores, no train/test distinction, no edges.
ExpStatsOLD = namedtuple('Stats_old', _LL_FIELDS + _SCORE_FIELDS)

# Current layout: scores are kept separately for train and test, followed by
# the edge counts.
ExpStats = namedtuple('Stats',
                      _LL_FIELDS
                      + ['train_' + name for name in _SCORE_FIELDS]
                      + ['test_' + name for name in _SCORE_FIELDS]
                      + _EDGE_FIELDS)

# Guards every demo cell of the notebook.
USE_NOTEBOOK = True
In [2]:
def parse_ll_from_log(log_str, ll_prefix='Train:', n_folds=5, newline='\n'):
    """
    Extract the four per-fold log-likelihood series from a log's content.

    Every line of ``log_str`` that contains ``ll_prefix`` must hold exactly
    four numbers, read in order as p(X,Y), p(X), p(Y) and p(Y|X).

    Parameters
    ----------
    log_str : str
        Whole content of a log file.
    ll_prefix : str
        Substring identifying the lines to parse (e.g. 'Train:' or 'Test:').
    n_folds : int
        Number of matching lines expected in the log.
    newline : str
        Line separator used to split ``log_str``.

    Returns
    -------
    tuple of four numpy arrays
        (p_X_Y, p_X, p_Y, p_Y_X), each with one value per matching line.

    Raises
    ------
    AssertionError
        If a matching line does not contain exactly four numbers, or if the
        number of matching lines differs from ``n_folds``.
    """
    # Note: the integer alternative of the pattern carries no sign, so a value
    # written without a decimal point is always read as non-negative.
    number_pattern = re.compile(r"[-+]?\d*\.\d+|\d+")

    p_X_Y_lls = []
    p_X_lls = []
    p_Y_lls = []
    p_Y_X_lls = []

    for line in log_str.split(newline):
        if ll_prefix not in line:
            continue
        stats = number_pattern.findall(line)
        assert len(stats) == 4, \
            'expected 4 numbers on line {!r}, found {}'.format(line, len(stats))
        x_y, x, y, y_x = (float(s) for s in stats)
        p_X_Y_lls.append(x_y)
        p_X_lls.append(x)
        p_Y_lls.append(y)
        p_Y_X_lls.append(y_x)

    # The four lists grow in lockstep, so checking one of them is enough.
    # The expected count is the caller's n_folds (previously hard-coded to 5).
    assert len(p_X_Y_lls) == n_folds, \
        'expected {} lines containing {!r}, found {}'.format(n_folds,
                                                             ll_prefix,
                                                             len(p_X_Y_lls))

    return (numpy.array(p_X_Y_lls), numpy.array(p_X_lls),
            numpy.array(p_Y_lls), numpy.array(p_Y_X_lls))
In [3]:
if USE_NOTEBOOK:
    # Smoke test: read one log and show its train and test log-likelihoods.
    file_path = 'newcsn/l0.6ts0/arts.log'
    with open(file_path, 'r') as f:
        log_contents = f.read()

    train_X_Y, train_X, train_Y, train_Y_X = parse_ll_from_log(
        log_contents, 'Train:', 5)
    print(train_X_Y, train_X, train_Y, train_Y_X, sep='\n')

    test_X_Y, test_X, test_Y, test_Y_X = parse_ll_from_log(
        log_contents, 'Test:', 5)
    print(test_X_Y, test_X, test_Y, test_Y_X, sep='\n')
In [4]:
def parse_scores_from_log_old(log_str, stat_prefix='Accuracy', n_folds=5, newline='\n'):
    """
    Collect the scores for ``stat_prefix`` from a log in the old layout.

    Lines containing ``stat_prefix`` that hold exactly ``n_folds`` numbers
    are read in order; other lines are ignored. Exactly ``2 * n_folds``
    values must be collected overall (AssertionError otherwise).

    Returns a pair of numpy arrays: the first ``n_folds`` values and the
    remaining ``n_folds`` values.
    """
    number_re = r"[-+]?\d*\.\d+|\d+"
    collected = []
    for row in log_str.split(newline):
        if stat_prefix not in row:
            continue
        found = re.findall(number_re, row)
        if len(found) != n_folds:
            # Not a per-fold summary line for this statistic.
            continue
        collected += [float(token) for token in found]

    assert len(collected) == n_folds * 2

    first_half = collected[:n_folds]
    second_half = collected[n_folds:]
    return numpy.array(first_half), numpy.array(second_half)
In [5]:
def parse_scores_from_log(log_str, stat_prefix='Train Accuracy mpe', n_folds=5, newline='\n'):
    """
    Collect the per-fold scores for ``stat_prefix`` from a log's content.

    Lines containing ``stat_prefix`` that hold exactly ``n_folds`` numbers
    contribute their values; matching lines with any other amount of numbers
    are skipped. Exactly ``n_folds`` values must be collected overall
    (AssertionError otherwise).

    Returns a numpy array with the ``n_folds`` scores.
    """
    number_re = r"[-+]?\d*\.\d+|\d+"

    # Numbers found on each line mentioning the statistic.
    per_line = (re.findall(number_re, row)
                for row in log_str.split(newline)
                if stat_prefix in row)

    # Keep only lines with one number per fold and flatten them.
    scores = [float(token)
              for found in per_line
              if len(found) == n_folds
              for token in found]

    assert len(scores) == n_folds
    return numpy.array(scores)
In [6]:
if USE_NOTEBOOK:
    # Smoke test: read one log and show its three train MPE score series.
    file_path = 'newcsn/l0.6ts0/arts.log'
    with open(file_path, 'r') as f:
        log_contents = f.read()

    acc_mpe = parse_scores_from_log(log_contents,
                                    stat_prefix='Train Accuracy mpe',
                                    n_folds=5)
    print(acc_mpe)

    ham_mpe = parse_scores_from_log(log_contents,
                                    stat_prefix='Train Hamming Score mpe',
                                    n_folds=5)
    print(ham_mpe)

    ext_mpe = parse_scores_from_log(log_contents,
                                    stat_prefix='Train Exact match mpe',
                                    n_folds=5)
    print(ext_mpe)
In [7]:
def parse_edges_from_log(log_str, edge_prefix='Xs with Y parent', n_folds=5, newline='\n'):
    """
    Collect the integer reported on each line containing ``edge_prefix``.

    Every matching line must contain exactly one run of digits, and exactly
    ``n_folds`` matching lines must be present (AssertionError otherwise).

    Returns a numpy array with one integer per matching line.
    """
    counts = []
    for row in log_str.split(newline):
        if edge_prefix not in row:
            continue
        digit_runs = re.findall(r"\d+", row)
        assert len(digit_runs) == 1
        counts.append(int(digit_runs[0]))

    assert len(counts) == n_folds
    return numpy.array(counts)
In [8]:
if USE_NOTEBOOK:
    # Smoke test: read one log and show its two edge-count series.
    file_path = 'newcsn/l0.6ts0/arts.log'
    with open(file_path, 'r') as f:
        log_contents = f.read()

    x_y_edges = parse_edges_from_log(log_contents,
                                     edge_prefix='Xs with Y parent',
                                     n_folds=5)
    print(x_y_edges)

    y_y_edges = parse_edges_from_log(log_contents,
                                     edge_prefix='Ys with Y parent',
                                     n_folds=5)
    print(y_y_edges)
In [9]:
def parse_stats_from_log(log_path, n_folds=5, tot_edges=None, newline='\n'):
    """
    Read the log file at ``log_path`` and gather its statistics into an
    ExpStats namedtuple.

    The following per-fold series are extracted, in this order:

    - train log-likelihoods (lines containing 'Train:'):
      p(X,Y), p(X), p(Y), p(Y|X)
    - test log-likelihoods (lines containing 'Test:'):
      p(X,Y), p(X), p(Y), p(Y|X)
    - train scores, then test scores, each as
      accuracy / hamming score / exact match for 'mpe',
      then the same three for 'marg'
    - edge counts: 'Xs with Y parent' and 'Ys with Y parent'

    When ``tot_edges`` is given, ``x_x_edges`` is filled with
    ``tot_edges - x_y - y_y`` for each fold (as a list); otherwise it is None.

    ``n_folds`` and ``newline`` are forwarded to every parsing helper.
    """
    with open(log_path, 'r') as log_file:
        log_str = log_file.read()

    # Options shared by every parsing helper.
    common = dict(n_folds=n_folds, newline=newline)

    fields = []

    # Log-likelihoods: four series for train, then four for test.
    for ll_prefix in ('Train:', 'Test:'):
        fields.extend(parse_ll_from_log(log_str, ll_prefix, **common))

    # Scores, listed in the same order as the ExpStats fields.
    score_prefixes = (
        'Train Accuracy mpe', 'Train Hamming Score mpe', 'Train Exact match mpe',
        'Train Accuracy marg', 'Train Hamming Score marg', 'Train Exact match marg',
        'Test Accuracy mpe', 'Test Hamming Score mpe', 'Test Exact match mpe',
        'Test Accuracy marg', 'Test Hamming Score marg', 'Test Exact match marg',
    )
    for stat_prefix in score_prefixes:
        fields.append(parse_scores_from_log(log_str, stat_prefix, **common))

    # Edge counts.
    x_y_edges = parse_edges_from_log(log_str, 'Xs with Y parent', **common)
    y_y_edges = parse_edges_from_log(log_str, 'Ys with Y parent', **common)
    if tot_edges is None:
        x_x_edges = None
    else:
        x_x_edges = [tot_edges - x_y - y_y
                     for x_y, y_y in zip(x_y_edges, y_y_edges)]
    fields.extend([x_y_edges, y_y_edges, x_x_edges])

    return ExpStats(*fields)
In [10]:
def print_stats(exp_stats):
    """
    Print every series stored in ``exp_stats``, one labelled line per field.

    ``exp_stats`` must expose the attributes listed in the table below
    (the fields of the ExpStats namedtuple).
    """
    # (label, attribute) pairs, in printing order.
    rows = (
        ('train log p(X,Y):\t', 'train_X_Y'),
        ('train log p(X):\t', 'train_X'),
        ('train log p(Y):\t', 'train_Y'),
        ('train log p(Y|X):\t', 'train_Y_X'),
        ('test log p(X,Y):\t', 'test_X_Y'),
        ('test log p(X):\t', 'test_X'),
        ('test log p(Y):\t', 'test_Y'),
        ('test log p(Y|X):\t', 'test_Y_X'),
        ('train accuracy MPE:\t', 'train_acc_mpe'),
        ('train hamming MPE:\t', 'train_ham_mpe'),
        ('train extact match MPE:\t', 'train_ext_mpe'),
        ('train accuracy Marg:\t', 'train_acc_marg'),
        ('train hamming Marg:\t', 'train_ham_marg'),
        ('train extact match marg:\t', 'train_ext_marg'),
        ('test accuracy MPE:\t', 'test_acc_mpe'),
        ('test hamming MPE:\t', 'test_ham_mpe'),
        ('test extact match MPE:\t', 'test_ext_mpe'),
        ('test accuracy Marg:\t', 'test_acc_marg'),
        ('test hamming Marg:\t', 'test_ham_marg'),
        ('test extact match marg:\t', 'test_ext_marg'),
        ('n edges X -> Y:\t', 'x_y_edges'),
        ('n edges Y -> Y:\t', 'y_y_edges'),
        ('n edges X -> X:\t', 'x_x_edges'),
    )
    for label, attribute in rows:
        print(label, getattr(exp_stats, attribute))
In [11]:
if USE_NOTEBOOK:
    # Smoke test: parse every statistic of one log and print the result.
    file_path = 'newcsn/l0.6ts0/arts.log'
    exp_stats = parse_stats_from_log(log_path=file_path)
    print_stats(exp_stats)
In [12]:
# Dataset names; each one is looked up as '<name>.log' inside an experiment
# directory by parse_stats_for_exp.
DATASETS = [
    # 'arts',
    'birds',
    'business',
    'cal',
    'emotions',
    'flags',
    'health',
    'human',
    'plant',
    'scene',
    'yeast',
]

# Per-dataset value handed to parse_stats_from_log as `tot_edges`.
# None leaves the derived x_x_edges field unset for that dataset.
DATASET_LENGTH = {
    'arts': 525,
    'birds': None,
    'business': 529,
    'cal': 241,
    'emotions': 77,
    'flags': 25,
    'health': 531,
    'human': 453,
    'plant': 451,
    'scene': 299,
    'yeast': 116,
}


def parse_stats_for_exp(exp_dir, datasets=DATASETS, n_folds=5, newline='\n'):
    """
    Parse the '<dataset>.log' file of every dataset found in ``exp_dir``.

    Returns a dict mapping each dataset name to the ExpStats produced by
    parse_stats_from_log. Each dataset must have an entry in DATASET_LENGTH
    (KeyError otherwise).
    """
    stats_dict = {}
    for dataset_name in datasets:
        print('\n\tconsidering dataset {}'.format(dataset_name))
        log_path = os.path.join(exp_dir, '{}.log'.format(dataset_name))
        stats_dict[dataset_name] = parse_stats_from_log(
            log_path,
            n_folds=n_folds,
            tot_edges=DATASET_LENGTH[dataset_name],
            newline=newline)
    return stats_dict
In [13]:
if USE_NOTEBOOK:
    # Parse every dataset log of a single experiment directory and print the
    # statistics dataset by dataset.
    exp_dir = './newcsn/l0.6ts0'
    stats = parse_stats_for_exp(exp_dir=exp_dir)
    for k, v in stats.items():
        print(k)
        print_stats(v)
In [14]:
# Components of an experiment directory name, which has the form
# 'l<split>ts<tree structure>' with an optional trailing 'l'.
TREE_STRUCTURES = [0, 1, 2, 3]   # values placed after 'ts'
CL_CSN = ['0.6', '1.0']          # values placed after the leading 'l'
Y_LEAVES = [False, True]         # whether the trailing 'l' is appended


def parse_stats_for_models_exps(exp_dir,
                                tree_structures=TREE_STRUCTURES,
                                splits=CL_CSN,
                                leaves=Y_LEAVES,
                                datasets=DATASETS,
                                n_folds=5,
                                newline='\n'):
    """
    Parse the logs of every experiment configuration found under ``exp_dir``.

    One sub-directory is visited for each combination of
    ``tree_structures`` x ``splits`` x ``leaves``; its name is
    'l<split>ts<tree structure>' plus a trailing 'l' when the ``leaves`` flag
    is true. Each sub-directory is parsed with parse_stats_for_exp.

    Parameters
    ----------
    exp_dir : str
        Directory containing one sub-directory per experiment.
    tree_structures, splits, leaves : iterables
        Values combined to build the experiment names.
    datasets : iterable of str
        Dataset names parsed inside each experiment directory.
    n_folds : int
        Number of folds expected in each log.
    newline : str
        Line separator of the log files.

    Returns
    -------
    (dataset_assoc, model_assoc)
        ``dataset_assoc[dataset][experiment]`` and
        ``model_assoc[experiment][dataset]`` both give the parsed stats.
    """
    dataset_assoc = defaultdict(dict)
    model_assoc = {}

    for ts in tree_structures:
        for s in splits:
            base_name = 'l{}ts{}'.format(s, ts)
            for with_leaves in leaves:
                # Build the name from the base on every iteration: appending
                # to a shared prefix would carry the 'l' suffix over to the
                # following flags (wrong for any order but [False, True]).
                exp_name = (base_name + 'l') if with_leaves else base_name

                print('\n\nConsidering exp: {}'.format(exp_name))
                exp_path = os.path.join(exp_dir, exp_name)
                stats_dict = parse_stats_for_exp(exp_path,
                                                 datasets=datasets,
                                                 n_folds=n_folds,
                                                 newline=newline)

                model_assoc[exp_name] = stats_dict
                for dataset, dataset_stats in stats_dict.items():
                    dataset_assoc[dataset][exp_name] = dataset_stats

    return dataset_assoc, model_assoc
In [15]:
if USE_NOTEBOOK:
    # Parse every experiment configuration stored under ./newcsn.
    exp_dir = './newcsn'
    dataset_assoc, model_assoc = parse_stats_for_models_exps(
        exp_dir=exp_dir,
        datasets=DATASETS)
In [16]:
def tiling_sizes(n_images, n_cols=None):
    """
    Compute a (n_rows, n_cols) grid with room for ``n_images`` tiles.

    Without ``n_cols`` the grid starts as a square of side
    ``int(sqrt(n_images))``; with ``n_cols`` it starts with
    ``max(n_images // n_cols, 1)`` rows. Tiles that do not fit in that first
    grid are placed on extra rows of the same width, computed recursively.
    """
    if n_cols is None:
        side = int(numpy.sqrt(n_images))
        n_rows, n_cols = side, side
    else:
        n_rows = max(n_images // n_cols, 1)

    leftover = n_images - n_rows * n_cols
    if leftover <= 0:
        return n_rows, n_cols

    # Only the row count of the recursive layout matters: the width is fixed.
    extra_rows, _ = tiling_sizes(leftover, n_cols)
    return n_rows + extra_rows, n_cols
In [17]:
def plot_model_boxplot_stat(model_assoc,
                            stat_names=['train_X_Y', 'test_X_Y'],
                            model_names=None,
                            n_rows=1,
                            fig_size=(16, 18)):
    """
    Draw one boxplot panel per statistic, with one box per model.

    Parameters
    ----------
    model_assoc : dict
        Maps a model name to an object exposing each name in ``stat_names``
        as an attribute (e.g. an ExpStats namedtuple).
    stat_names : list of str
        Attributes to plot; one panel each, filled row by row. The default
        list is never modified.
    model_names : list of str, optional
        Models to plot, in this order. Defaults to every key of
        ``model_assoc``.
    n_rows : int
        Number of rows of the panel grid; the number of columns is derived
        from it.
    fig_size : tuple
        Figure size forwarded to ``pyplot.subplots``.

    Raises
    ------
    ValueError
        If ``n_rows`` is smaller than 1.
    """
    if n_rows < 1:
        raise ValueError('n_rows must be >= 1, got {}'.format(n_rows))

    n_stats = len(stat_names)
    # With a single row this is simply n_stats.
    n_cols = int(numpy.ceil(n_stats / n_rows))
    print('\nPlotting into {} rows and {} cols'.format(n_rows, n_cols))

    # squeeze=False always yields a 2-D array of axes, so the same indexing
    # works for single-row, single-column and single-panel grids.
    fig, axes = pyplot.subplots(nrows=n_rows, ncols=n_cols,
                                figsize=fig_size, squeeze=False)

    if model_names is None:
        model_names = list(model_assoc)

    for k, stat_name in enumerate(stat_names):
        # One series per model; transposed so that each model is a column,
        # i.e. a separate box.
        data = numpy.array([getattr(model_assoc[model], stat_name)
                            for model in model_names]).T

        i, j = divmod(k, n_cols)
        ax = axes[i, j]
        ax.boxplot(data, showmeans=True)
        ax.set_title(stat_name)
        ax.set_xticklabels(model_names, rotation=90)

    pyplot.show()
In [18]:
if USE_NOTEBOOK:
    exp_dir = './newcsn'
    dataset_assoc, model_assoc = parse_stats_for_models_exps(exp_dir, datasets=DATASETS)

    # Statistics shown for each dataset: 20 panels laid out on 5 rows.
    boxplot_stats = [
        'train_X_Y', 'train_X', 'train_Y', 'train_Y_X',
        'test_X_Y', 'test_X', 'test_Y', 'test_Y_X',
        'train_acc_mpe', 'train_acc_marg',
        'train_ham_mpe', 'train_ham_marg',
        'test_acc_mpe', 'test_acc_marg',
        'test_ham_mpe', 'test_ham_marg',
        'train_ext_mpe', 'train_ext_marg',
        'test_ext_mpe', 'test_ext_marg',
    ]
    # Experiments compared in each panel.
    boxplot_models = ['l1.0ts0l', 'l1.0ts1l', 'l1.0ts2l', 'l1.0ts3l']

    for d in DATASETS:
        print('\n\n\n\t{}'.format(d))
        plot_model_boxplot_stat(dataset_assoc[d],
                                stat_names=boxplot_stats,
                                n_rows=5,
                                model_names=boxplot_models)
In [19]:
if USE_NOTEBOOK:
    exp_dir = './newcsn'
    dataset_assoc, model_assoc = parse_stats_for_models_exps(exp_dir, datasets=DATASETS)

    # Statistics shown for each dataset: 20 panels laid out on 5 rows.
    boxplot_stats = [
        'train_X_Y', 'train_X', 'train_Y', 'train_Y_X',
        'test_X_Y', 'test_X', 'test_Y', 'test_Y_X',
        'train_acc_mpe', 'train_acc_marg',
        'train_ham_mpe', 'train_ham_marg',
        'test_acc_mpe', 'test_acc_marg',
        'test_ham_mpe', 'test_ham_marg',
        'train_ext_mpe', 'train_ext_marg',
        'test_ext_mpe', 'test_ext_marg',
    ]
    # Experiments compared in each panel.
    boxplot_models = ['l0.6ts0l', 'l0.6ts1l', 'l0.6ts2l', 'l0.6ts3l']

    for d in DATASETS:
        print('\n\n\n\t{}'.format(d))
        plot_model_boxplot_stat(dataset_assoc[d],
                                stat_names=boxplot_stats,
                                n_rows=5,
                                model_names=boxplot_models)
In [ ]:
if USE_NOTEBOOK:
    # Mean hamming and exact-match scores on 'human' for two experiments.
    # ExpStats only has train_/test_ prefixed score fields (the unprefixed
    # names belong to the old layout), so the test-split scores are read
    # here; replace 'test' with 'train' to inspect the training split.
    human_stats = dataset_assoc['human']
    for metric in ('ham', 'ext'):
        for model in ('l1.0ts1l', 'l1.0ts2l'):
            for inference in ('mpe', 'marg'):
                field = 'test_{}_{}'.format(metric, inference)
                print(getattr(human_stats[model], field).mean())

    # Train log-likelihood series of three experiments, for every dataset.
    for dataset in DATASETS:
        print(dataset)
        for stat_name in ('train_X_Y', 'train_X', 'train_Y', 'train_Y_X'):
            print(stat_name)
            for model in ('l1.0ts0l', 'l1.0ts1l', 'l1.0ts2l'):
                print(getattr(dataset_assoc[dataset][model], stat_name))
In [ ]: