In [3]:
# Notebook setup: live-reload local modules, inline plotting, shared config.
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append('../..')  # make the project root importable (misc, data_api)
%matplotlib inline
import matplotlib.pylab as plt
from misc.config import c
# NOTE(review): star import pollutes the namespace; presumably provides
# fetch_uci_datasets (and possibly np/scipy used later) — confirm.
from data_api import *
import cPickle
import pandas as pd
from data_api import *  # NOTE(review): duplicate star import — redundant
results_dir = c['RESULTS_DIR']  # base directory for experiment result files
In [360]:
# Load every pickled experiment result for the exh_r2svm model across the
# benchmark datasets into all_results[<model>_<dataset>][<experiment name>].
all_results = {}
models = ['exh_r2svm']
datasets = ['glass', 'australian', 'bank','breast_cancer', 'crashes', 'liver', 'segment', 'satimage', 'heart', 'vowel',
'diabetes', 'fourclass', 'german', 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
'vehicle']
# datasets = ['vowel', 'vehicle', 'satimage', 'segment', 'pendigits']
paths = [ os.path.join(results_dir, model + '_' + dataset) for model in models for dataset in datasets ]
for path in paths:
if os.path.isdir(path):
print path
results = {}
for exp in os.listdir(path):
# Strip a fixed-length trailing suffix (11 chars) to get the experiment name.
# NOTE(review): magic number — presumably a file extension like
# '.experiment'; confirm against the result-writer code.
name = exp[:-11]
try:
# NOTE(review): file handle is never closed; cPickle on local result
# files only — do not point this at untrusted data.
exp_res = cPickle.load(open(os.path.join(path, exp),'r'))
except:
# NOTE(review): bare except silently skips any unreadable pickle.
print exp
continue
# Flatten monitors, results and config params into one dict per experiment.
merged_res = exp_res['monitors']
merged_res.update(exp_res['results'])
merged_res.update(exp_res['config']['params'])
results[name] = merged_res
name = path.split('/')[-1]
all_results[name] = results
In [4]:
# Load all pre-aggregated CSV result tables into csv_results, keyed by filename.
csv_results = {}
csv_dir = os.path.join(results_dir, 'csv')
for csv_file in os.listdir(csv_dir):
# NOTE(review): DataFrame.from_csv is deprecated in modern pandas (read_csv).
csv_results[csv_file] = pd.DataFrame.from_csv(os.path.join(csv_dir, csv_file))
In [196]:
# NOTE(review): `df` is not defined at this point on a fresh kernel — this
# cell relies on hidden state from out-of-order execution.
df[df['recurrent'] == False]['mean_acc'].max()
Out[196]:
In [195]:
Out[195]:
In [24]:
from collections import defaultdict
def get_accuracy_table(models=['test_elm', 'test_svm', 'test_linear_svm',
                               'test_r2svm', 'test_r2elm', 'random_r2svm', 'fixed_r2svm',
                               'triple_svm', 'triple_r2svm', 'triple_fixed_r2svm'],
                       datasets=['glass', 'australian', 'bank', 'breast_cancer', 'crashes', 'liver', 'segment',
                                 'satimage', 'heart', 'vowel', 'diabetes', 'fourclass', 'mushrooms', 'german',
                                 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
                                 'vehicle', 'svmguide2', 'svmguide4']
                       ):
    """Build a LaTeX-ready accuracy table from the global ``csv_results`` dict.

    For every (model, dataset) pair present in ``csv_results``, the row with
    the highest 'mean_acc' is selected.  The per-dataset winner is wrapped in
    a bold marker: the string "\textb" deliberately contains a TAB ("\t" +
    "extb"); the rendering cell later replaces "extb" with "\\textbf" after
    to_latex escaping, which reconstitutes \textbf{...}.

    NOTE(review): mutable default arguments are shared across calls — safe
    here since they are never mutated, but fragile.

    Returns:
        (pd.DataFrame, dict): the formatted table with columns ordered as
        ``models``, and a nested dict of raw best accuracies
        best_results_val[dataset][model].
    """
    pd.options.display.float_format = '{:2.2f}'.format  # side effect: global display option
    best_results = {model: {} for model in models}
    best_results_val = {data: {} for data in datasets}
    # First pass: the best accuracy achieved on each dataset by any model.
    best_per_dataset = defaultdict(float)
    for model in models:
        for data in datasets:
            if model + '_' + data in csv_results:
                df = csv_results[model + '_' + data]
                best_per_dataset[data] = max(best_per_dataset[data], df['mean_acc'].max())
    # Second pass: format each cell, bolding the per-dataset winner.
    for model in models:
        for data in datasets:
            if model + '_' + data in csv_results:
                df = csv_results[model + '_' + data]
                acc = df['mean_acc'].max()
                if 'std' in df.columns:
                    std = '%.2f' % df.loc[df['mean_acc'].idxmax(), 'std']
                else:
                    std = '0.01'  # fallback when the table has no std column
                if acc == best_per_dataset[data]:
                    txt = "\textb{" + ("%.2f" % acc) + "} & {\\tiny $ \pm $\textb{" + std + "} }"
                else:
                    # BUG FIX: this branch previously hardcoded "0.01" even
                    # though `std` was computed above for both branches.
                    txt = ("%.2f" % acc) + " & {\\tiny $ \pm $" + std + " }"
                best_results[model][data] = txt
                best_results_val[data][model] = acc
    return pd.DataFrame.from_dict(best_results)[models], best_results_val
In [148]:
In [13]:
from itertools import izip
In [27]:
models=[ 'test_r2svm','fixed_r2svm','test_r2elm', \
'test_elm', 'test_svm', 'test_linear_svm',\
'triple_r2svm', 'triple_svm', 'triple_fixed_r2svm']
datasets = ['glass', 'australian', 'bank','breast_cancer', 'crashes', 'liver', 'segment', 'satimage', 'heart', 'vowel',
'diabetes', 'fourclass', 'german', 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
'vehicle', 'svmguide2', 'svmguide4', 'mushrooms']
tb, _ = get_accuracy_table(models=models, datasets = datasets)
# 1. Change column names
tb = tb.rename(columns=dict(izip(models,
["\rrsvm", "fixed \\rrsvm", "\\drelm", \
"ELM + SIG", "SVM + RBF", "SVM" \
"triple \rrsvm", "triple SVM+RBF", "triple f \rrsvm"])))
h = tb.to_latex(index=True, header=True, formatters=[lambda x:x]*10)
h = h.replace("\\&", "&").replace("\\textbackslash", "\\").replace("\\$", "$").replace("extb", "\\textbf").replace("\\{", "{").replace("\\}", "}").replace("\\\\%", "\\%")
print h
In [6]:
# Model/dataset grids shared by the std and timing summaries below.
models = ['test_elm', 'test_svm', 'test_linear_svm','test_r2svm', 'test_r2elm', 'random_r2svm', 'fixed_r2svm',
'triple_svm', 'triple_r2svm', 'triple_fixed_r2svm'] #,exh_r2svm]
datasets = ['glass', 'australian', 'bank','breast_cancer', 'crashes', 'liver', 'segment', 'satimage', 'heart', 'vowel',
'diabetes', 'fourclass', 'german', 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
'vehicle', 'svmguide2', 'svmguide4']
pd.options.display.float_format = '{:2.4f}'.format
In [15]:
# Std (in percent) of the best-accuracy row for each model/dataset pair.
best_std = {model: {} for model in models}
for model in models:
for data in datasets:
if model + '_' + data in csv_results.keys():
df = csv_results[model + '_' + data]
if 'std' in df.columns:
best_std[model][data] = df.loc[df['mean_acc'].idxmax(),'std'] * 100
# scores = df.loc[df['mean_acc'].idxmax(),'acc_fold']
# best_std[model][data] = np.mean([np.std(fold_scores) for fold_scores in scores]) * 100
print "Best std"
std_pd = pd.DataFrame.from_dict(best_std)
best_train_time = {model: {} for model in models}
for model in models:
for data in datasets:
if model + '_' + data in csv_results.keys():
df = csv_results[model + '_' + data]
time = df.loc[df['mean_acc'].idxmax(),'train_time']
time = time.translate(None, '[]')
best_train_time[model][data] = np.mean([float(t) for t in time.split(', ')])
pd.options.display.float_format = '{:2.4f}'.format
print "Best train time"
pd.DataFrame.from_dict(best_train_time)
Out[32]:
In [31]:
# Mean test time (over folds) at the best-accuracy configuration.
# NOTE(review): near-duplicate of the train-time cell — a parameterised
# helper (column name as argument) would avoid the copy-paste.
best_test_time = {model: {} for model in models}
for model in models:
for data in datasets:
if model + '_' + data in csv_results.keys():
df = csv_results[model + '_' + data]
time = df.loc[df['mean_acc'].idxmax(),'test_time']
# Fold times stored as a stringified list; strip brackets (Python 2 API).
time = time.translate(None, '[]')
best_test_time[model][data] = np.mean([float(t) for t in time.split(', ')])
pd.options.display.float_format = '{:2.6f}'.format
print "Best testtime"
pd.DataFrame.from_dict(best_test_time)
# multiply those?
Out[31]:
In [375]:
# NOTE(review): `result_dict` is only defined in the next cell (In [376]) —
# this relies on out-of-order execution.
[('test_r2svm' in result_dict[d], d) for d in datasets]
Out[375]:
In [376]:
# Does r2svm's gain over RBF-SVM correlate with RBF's gain over linear SVM?
tb, result_dict = get_accuracy_table()
datasets=['glass', 'australian', 'bank','breast_cancer', 'crashes', 'liver', 'segment', 'satimage', 'heart', 'vowel',
'diabetes', 'fourclass', 'german', 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
'vehicle', 'svmguide2', 'svmguide4']
points = [(result_dict[d]["test_r2svm"]/result_dict[d]["test_svm"], (result_dict[d]["test_svm"] - result_dict[d]["test_linear_svm"])) for d in datasets]
%matplotlib inline
X = [p[1] for p in points]
Y = [p[0] for p in points]
plt.scatter(X, Y)
# NOTE(review): `scipy` is never imported explicitly — presumably pulled in
# by a star import; confirm.
print(scipy.stats.spearmanr(X, Y))
In [416]:
# Same accuracy ratio vs the 'spaceness' measure, excluding the three large
# multiclass sets.
datasets = list(set(datasets).difference(set(["segment", "pendigits", "satimage"])))
# NOTE(review): `spaceness` is defined in a later cell (In [414]) — another
# out-of-order dependency.
points = [(result_dict[d]["test_r2svm"]/result_dict[d]["test_svm"], spaceness(d)) for d in datasets]
%matplotlib inline
X = [p[1] for p in points]
Y = [p[0] for p in points]
plt.scatter(X, Y)
print(scipy.stats.spearmanr(X, Y))
In [414]:
# Define datasets tested
binary = ["australian", "bank", "breast_cancer", "crashes", "diabetes",\
"fourclass", "german", "heart", "indian", "ionosphere", "liver", "sonar", "splice"]
multi = set(datasets).difference(set(binary))
# Per-dataset metadata as three parallel lists: name, input dimensionality,
# and (presumably estimated) number of manifold dimensions.  All three must
# stay the same length and order — fragile; a single dict would be safer.
datasetss=['australian',\
'bank',\
'breast_cancer',\
'crashes',\
'diabetes',\
'fourclass',\
'german',\
'glass',\
'heart',\
'indian',\
'ionosphere',\
'iris',\
'liver',\
'mushrooms',\
'pendigits',\
'satimage',\
'segment',\
'sonar',\
'splice',\
'svmguide2',\
'svmguide4',\
'vehicle',\
'vowel',\
'wine']
dim = [14,4,10,20,8,2,24,9,13,10,34,4,6,112,16,36,19,60,60,20,10,18,10,4]
manifolds = [1,3,1,1,2,2,3,6,3,3,24,2,3,40,9,6,7,28,55,15,1,6,8,2]
# Fold the parallel lists into name-keyed lookup tables.
dimd = {}
manifoldsd = {}
for i in range(len(datasetss)):
dimd[datasetss[i]] = dim[i]
manifoldsd[datasetss[i]] = manifolds[i]
def spaceness(i):
    """'Spaceness' of dataset `i`: (dim - manifolds) / dim.

    Reads the module-level `dimd` and `manifoldsd` lookup tables built above.
    Returns a float in [0, 1); higher means the data occupies fewer of its
    ambient dimensions.
    """
    total_dim = dimd[i]
    manifold_dim = manifoldsd[i]
    return (total_dim - manifold_dim) / float(total_dim)
tb, result_dict = get_accuracy_table()
# Thresholds used to split the datasets into groups.
T = 0.04    # min relative gain of RBF-SVM over linear SVM to call RBF "stronger"
T_N = 0.35  # min spaceness to call a dataset "spaced"
rbf_stronger = [d for d in datasets if (result_dict[d]["test_svm"] - result_dict[d]["test_linear_svm"])/abs(result_dict[d]["test_svm"]) > T]
rbf_similar = set(datasets).difference(set(rbf_stronger))
# BUG FIX: this previously did izip(datasets, dim, manifolds), but dim and
# manifolds are ordered to match `datasetss`, while `datasets` was filtered
# and re-ordered through a set in an earlier cell — the pairing was wrong.
# Look the values up by name via spaceness() instead.
spaced = [d for d in datasets if spaceness(d) > T_N]
compact = set(datasets).difference(set(spaced))
# NOTE(review): `disbalance` is defined only in a later cell (In [247]) —
# this cell depends on out-of-order execution.
imbalanced = [d for d in datasets if disbalance(d) > 0.15]
balanced = set(datasets).difference(imbalanced)
# Named dataset groups consumed by get_model_char below.
datasets_groups = {"bal":balanced, "bin":binary, "multi":multi, "rbf_str":rbf_stronger, "rbf_sim":rbf_similar, "spaced":spaced, "comp":compact, \
"im":imbalanced}
In [421]:
# Define predicates tested
def model_stronger_than_11lin(model, datasets):
# Number of datasets where `model` beats linear SVM by more than 10%.
return sum([result_dict[d][model] > 1.1*result_dict[d]['test_linear_svm'] for d in datasets])
def model_stronger_than_09rbf(model, datasets):
# Number of datasets where `model` reaches at least 98% of RBF-SVM accuracy.
# NOTE(review): the name (and the registry key) says "09" but the factor is
# 0.98 — confirm which was intended.
return sum([result_dict[d][model] > 0.98*result_dict[d]['test_svm'] for d in datasets])
def is_stronger_key_on_data(model, data, key='recurrent', threshold=0.96):
    """True if enabling boolean hyperparameter `key` helps `model` on `data`.

    Looks the pair up in the global `csv_results` table and compares the best
    mean accuracy with key==True vs key==False; "helps" means the key-off
    accuracy falls below `threshold` of the key-on accuracy.

    BUG FIX: `threshold` used to be accepted but ignored (0.96 was hardcoded
    in the comparison while the default said 0.95); the parameter is now used,
    with the default set to 0.96 to preserve previous behaviour.

    Returns False on any failure (missing table, missing column, NaN ratio).
    """
    try:
        df = csv_results[model + '_' + data]
        acc_rec = df[df[key] == True]['mean_acc'].max()
        acc_normal = df[df[key] == False]['mean_acc'].max()
        return acc_normal / acc_rec < threshold
    except Exception:
        # Best-effort: any lookup failure counts as "not stronger".
        return False
def is_weaker_key_on_data(model, data, key='recurrent', threshold=0.96):
    """True if enabling boolean hyperparameter `key` hurts `model` on `data`.

    Mirror image of is_stronger_key_on_data: "hurts" means the key-on accuracy
    falls below `threshold` of the key-off accuracy (globals: `csv_results`).

    BUG FIX: `threshold` used to be accepted but ignored (0.96 hardcoded);
    it is now used, with the default set to 0.96 to preserve behaviour.

    Returns False on any failure (missing table, missing column, NaN ratio).
    """
    try:
        df = csv_results[model + '_' + data]
        acc_rec = df[df[key] == True]['mean_acc'].max()
        acc_normal = df[df[key] == False]['mean_acc'].max()
        return acc_rec / acc_normal < threshold
    except Exception:
        # Best-effort: any lookup failure counts as "not weaker".
        return False
def is_stronger_recurrent(model, datasets):
# Number of datasets where enabling recurrence helps `model`.
return sum(is_stronger_key_on_data(model, d) for d in datasets)
def is_weaer_recurrent(model, datasets):
# Number of datasets where recurrence hurts `model`.
# NOTE(review): name has a typo ("weaer" -> "weaker"); kept as-is because
# the `predicates` registry below refers to it.
return sum(is_weaker_key_on_data(model, d) for d in datasets)
def is_stronger_scale(model, datasets):
# Number of datasets where the `scale` option helps `model`.
return sum(is_stronger_key_on_data(model, d, key='scale') for d in datasets)
def is_stronger_use_prev(model, datasets):
# Number of datasets where the `use_prev` option helps `model`.
return sum(is_stronger_key_on_data(model, d, key='use_prev') for d in datasets)
# Predicate registry consumed by get_model_char.  NOTE(review): the key
# "str_use_preV" has a stray capital V; kept since it is a runtime key.
predicates = {"str_than_11lin":model_stronger_than_11lin, \
"str_than_09rbf":model_stronger_than_09rbf, \
"str_rec":is_stronger_recurrent,\
"str_scale":is_stronger_scale, \
"str_use_preV":is_stronger_use_prev, \
"weak_rec": is_weaer_recurrent}
In [323]:
# Spot-check: does the `scale` option help r2svm on glass?
is_stronger_key_on_data("test_r2svm", "glass", key="scale")
Out[323]:
In [314]:
from itertools import product
def get_model_char(model=["test_r2svm"]):
results = {k:{} for k in datasets_groups.keys()}
for pred_key,d_group_key in product(predicates.keys(), datasets_groups):
try:
d_group = datasets_groups[d_group_key]
pred = predicates[pred_key]
sum_len, sum_count = 0, 0
for m in model:
sum_len += len(d_group)
sum_count += pred(m, d_group)
results[d_group_key][pred_key] = "%d"%(100.*sum_count/float(sum_len))+" "+ str(sum_count) + "/" + str(sum_len)
except:
pass
return pd.DataFrame.from_dict(results)
t = get_model_char()
In [423]:
get_model_char(["test_r2elm", "test_r2svm"])
Out[423]:
In [328]:
# NOTE(review): get_model_char expects a LIST of model names; a bare string
# is iterated character by character, so every predicate lookup fails and is
# silently swallowed — the table comes out empty.  Wrap in a list.
get_model_char("test_r2svm")
Out[328]:
In [309]:
# NOTE(review): same string-instead-of-list issue as above.
get_model_char("triple_fixed_r2svm")
Out[309]:
In [ ]:
In [242]:
# r2svm/svm accuracy ratio vs class disbalance per dataset.
tb, result_dict = get_accuracy_table()
datasets=['glass', 'australian', 'bank','breast_cancer', 'crashes', 'liver', 'segment', 'satimage', 'heart', 'vowel',
'diabetes', 'fourclass', 'german', 'indian', 'ionosphere', 'sonar', 'splice', 'iris', 'wine', 'pendigits',
'vehicle', 'svmguide2', 'svmguide4']
# NOTE(review): `disbalance` is defined in a later cell (In [247]) — relies
# on out-of-order execution.
points = [(result_dict[d]["test_r2svm"]/result_dict[d]["test_svm"], disbalance(d)) for d in datasets]
%matplotlib inline
X = [p[1] for p in points]
Y = [p[0] for p in points]
plt.scatter(X, Y)
# NOTE(review): `scipy` is never imported explicitly in this notebook.
print(scipy.stats.spearmanr(X, Y))
In [227]:
from collections import Counter
In [228]:
glass =
In [ ]:
Counter(glass.target)
In [247]:
def disbalance(name="glass"):
    """Class-imbalance score of a UCI dataset.

    Loads the dataset via fetch_uci_datasets and returns
    (largest class count - smallest class count) / total samples:
    0 for a perfectly balanced set, approaching 1 when one class dominates.
    """
    dataset = fetch_uci_datasets([name])[0]
    class_counts = Counter(dataset.target)
    spread = max(class_counts.values()) - min(class_counts.values())
    return spread / float(len(dataset.target))
In [250]:
# Spot-check the imbalance measure on a binary dataset.
disbalance('australian')
Out[250]:
In [240]:
Out[240]: