In [1]:
%load_ext autoreload
%autoreload 2
In [130]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import glob
import json  # json and sys are used by RayTuneExperimentBrowser below
import tabulate
import pprint
import click
import numpy as np
import pandas as pd
from ray.tune.commands import *
In [132]:
browser = RayTuneExperimentBrowser(os.path.expanduser("~/nta/results/VGG19SparseFull"))
In [133]:
df = browser.best_experiments(min_test_accuracy=0.0, min_noise_accuracy=0.0, sort_by="test_accuracy")
In [134]:
df.head(5)
Out[134]:
In [136]:
df.columns
Out[136]:
In [137]:
df.iloc[0]
Out[137]:
In [155]:
len(df[df['epochs']==164])
Out[155]:
In [156]:
df[df['epochs']==164][['test_accuracy_max', 'noise_accuracy_max']].corr()
Out[156]:
In [157]:
df[df['epochs']==164][['test_accuracy_max', 'noise_accuracy_max']].min()
Out[157]:
In [158]:
df[df['epochs']==164][['test_accuracy_max', 'noise_accuracy_max']].mean()
Out[158]:
In [159]:
df[df['epochs']==164][['test_accuracy_max', 'noise_accuracy_max']].max()
Out[159]:
In [160]:
len(df[df['epochs']==90])
Out[160]:
In [161]:
df[df['epochs']==90][['test_accuracy_max', 'noise_accuracy_max']].corr()
Out[161]:
In [162]:
df[df['epochs']==90][['test_accuracy_max', 'noise_accuracy_max']].min()
Out[162]:
In [163]:
df[df['epochs']==90][['test_accuracy_max', 'noise_accuracy_max']].mean()
Out[163]:
In [164]:
df[df['epochs']==90][['test_accuracy_max', 'noise_accuracy_max']].max()
Out[164]:
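A compact alternative (an optional addition, not an output of the original run): the same min / mean / max summaries computed in one table with a groupby over the epoch count.
In [ ]:
# Min / mean / max of the best accuracies, grouped by how long the run trained.
summary = df[df['epochs'].isin([90, 164])][
    ['epochs', 'test_accuracy_max', 'noise_accuracy_max']].astype(np.float32)
summary.groupby('epochs').agg(['min', 'mean', 'max'])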
In [185]:
df[df['epochs']>=30][['epochs', 'test_accuracy']].astype(np.float32).corr()
Out[185]:
In [186]:
df[df['epochs']>=30][['epochs', 'noise_accuracy']].astype(np.float32).corr()
Out[186]:
In [207]:
tunable_params_general = ['learning_rate', 'learning_rate_gamma', 'weight_decay', 'momentum', 'batch_size', 'batches_in_epoch']
tunable_params_sparsity = ['boost_strength', 'boost_strength_factor', 'k_inference_factor', 'cnn_percent_on', 'cnn_weight_sparsity']
tunable_params = tunable_params_general + tunable_params_sparsity
performance_metrics = ['noise_accuracy_max', 'test_accuracy_max']
corr_params = tunable_params + performance_metrics
df[corr_params].astype(np.float32).corr()
Out[207]:
In [208]:
df[corr_params].astype(np.float32).corr() > 0.3
Out[208]:
In [209]:
df[corr_params].astype(np.float32).corr() < -0.3
Out[209]:
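As an optional follow-up (not part of the original run), the two boolean masks above can be reduced to just the parameter/metric pairs whose absolute correlation exceeds 0.3:
In [ ]:
# Correlations of each tunable parameter against the two performance metrics,
# keeping only entries with |correlation| > 0.3.
corr = df[corr_params].astype(np.float32).corr()
strong = corr.loc[tunable_params, performance_metrics]
strong[strong.abs() > 0.3].dropna(how='all')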
In [234]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from pprint import pprint
In [237]:
# Including all experiments with 30 or more epochs
df_inf = df[df['epochs']>=30]
y1 = df_inf['noise_accuracy_max']
y2 = df_inf['test_accuracy_max']
X = df_inf[tunable_params].astype(np.float32)
# Standardize all features (zero mean, unit variance) so the coefficients are comparable
scaler = StandardScaler()
X = scaler.fit_transform(X)
model_noise = LinearRegression()
model_noise.fit(X, y1)
print("\n Noise accuracy")
pprint(list(zip(tunable_params, model_noise.coef_)))
model_test = LinearRegression()
model_test.fit(X, y2)
print("\n Test accuracy")
pprint(list(zip(tunable_params, model_test.coef_)))
In [238]:
# Including all experiments with 90 or more epochs ("completed")
df_inf = df[df['epochs']>=90]
y1 = df_inf['noise_accuracy_max']
y2 = df_inf['test_accuracy_max']
X = df_inf[tunable_params].astype(np.float32)
# Standardize all features (zero mean, unit variance) so the coefficients are comparable
scaler = StandardScaler()
X = scaler.fit_transform(X)
model_noise = LinearRegression()
model_noise.fit(X, y1)
print("\n Noise accuracy")
pprint(list(zip(tunable_params, model_noise.coef_)))
model_test = LinearRegression()
model_test.fit(X, y2)
print("\n Test accuracy")
pprint(list(zip(tunable_params, model_test.coef_)))
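An optional convenience cell (added here, not in the original run): rank the standardized coefficients of the 90+ epoch fits by absolute magnitude, so the most influential hyperparameters are listed first.
In [ ]:
# Sort the standardized coefficients of the last fits by |coefficient|.
# Larger magnitude = stronger linear association with the metric.
coef_noise = sorted(zip(tunable_params, model_noise.coef_),
                    key=lambda t: abs(t[1]), reverse=True)
coef_test = sorted(zip(tunable_params, model_test.coef_),
                   key=lambda t: abs(t[1]), reverse=True)
print("Noise accuracy (most influential first)")
pprint(coef_noise)
print("Test accuracy (most influential first)")
pprint(coef_test)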
In [254]:
# Only include completed experiments (90 or more epochs)
df_inf = df[df['epochs']>=90][corr_params]
In [273]:
def stats(arr):
    """Return [min, mean, max] of arr, rounded to 4 decimal places."""
    return [round(v, 4) for v in [np.min(arr), np.mean(arr), np.max(arr)]]
In [277]:
df_inf.sort_values('test_accuracy_max', ascending=False)[tunable_params].head(5).apply(stats)
Out[277]:
In [278]:
df_inf.sort_values('test_accuracy_max', ascending=True)[tunable_params].head(5).apply(stats)
Out[278]:
In [279]:
df_inf.sort_values('noise_accuracy_max', ascending=False)[tunable_params].head(5).apply(stats)
Out[279]:
In [280]:
df_inf.sort_values('noise_accuracy_max', ascending=True)[tunable_params].head(5).apply(stats)
Out[280]:
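A hedged convenience sketch (not part of the original analysis): put the per-parameter means of the best and worst five runs side by side, so the four tables above are easier to compare.
In [ ]:
def compare_extremes(metric, n=5):
    # Mean of each tunable parameter for the top-n and bottom-n runs by `metric`.
    ranked = df_inf.sort_values(metric, ascending=False)[tunable_params].astype(np.float32)
    return pd.DataFrame({
        'top_{}'.format(n): ranked.head(n).mean(),
        'bottom_{}'.format(n): ranked.tail(n).mean(),
    })

compare_extremes('test_accuracy_max')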
In [131]:
class RayTuneExperimentBrowser(object):
    """
    Class for browsing and manipulating experiment results directories created
    by Ray Tune.
    """

    def __init__(self, experiment_path):
        self.experiment_path = os.path.abspath(experiment_path)
        self.experiment_states = self._get_experiment_states(
            self.experiment_path, exit_on_fail=True)
        self.progress = {}
        self.exp_directories = {}
        self.checkpoint_directories = {}
        self.params = {}
        for experiment_state in self.experiment_states:
            self._read_experiment(experiment_state)

    def _read_experiment(self, experiment_state):
        checkpoint_dicts = experiment_state["checkpoints"]
        checkpoint_dicts = [flatten_dict(g) for g in checkpoint_dicts]
        for exp in checkpoint_dicts:
            if exp.get("logdir", None) is None:
                continue
            exp_dir = os.path.basename(exp["logdir"])
            csv = os.path.join(self.experiment_path, exp_dir, "progress.csv")
            self.progress[exp["experiment_tag"]] = pd.read_csv(csv)
            self.exp_directories[exp["experiment_tag"]] = os.path.abspath(
                os.path.join(self.experiment_path, exp_dir))

            # Figure out the checkpoint file (.pt or .pth) if it exists. For some
            # reason we need to switch to the directory in order for glob to work.
            ed = os.path.abspath(os.path.join(self.experiment_path, exp_dir))
            os.chdir(ed)
            cds = glob.glob("checkpoint*")
            if len(cds) > 0:
                cd = max(cds)
                cf = glob.glob(os.path.join(cd, "*.pt"))
                cf += glob.glob(os.path.join(cd, "*.pth"))
                if len(cf) > 0:
                    self.checkpoint_directories[exp["experiment_tag"]] = os.path.join(
                        ed, cf[0])
                else:
                    self.checkpoint_directories[exp["experiment_tag"]] = ""
            else:
                self.checkpoint_directories[exp["experiment_tag"]] = ""

            # Read in the configs for this experiment
            params_file = os.path.join(self.experiment_path, exp_dir, "params.json")
            with open(params_file) as f:
                self.params[exp["experiment_tag"]] = json.load(f)

    def get_value(self, exp_substring="",
                  tags=["test_accuracy", "noise_accuracy"],
                  which="max"):
        """
        For every experiment whose name matches exp_substring, scan the history
        and return the appropriate value associated with each tag.

        'which' can be one of the following:
            last:   the last value
            min:    the minimum value
            max:    the maximum value
            median: the median value

        Returns a pandas DataFrame with one row per matching experiment,
        containing the experiment name, the requested tag values, the number
        of epochs, and the experiment parameters.
        """
        # Collect experiment names that contain exp_substring
        exps = [e for e in self.progress if exp_substring in e]

        # Column names: last value and requested value for each tag, plus the
        # epoch at which it occurred for min/max
        columns = ["Experiment Name"]
        for tag in tags:
            columns.append(tag)
            columns.append(tag + "_" + which)
            if which in ["max", "min"]:
                columns.append("epoch_" + str(tag))
        # add training iterations
        columns.append("epochs")
        # add the remaining variables
        columns.extend(self.params[exps[0]].keys())

        all_values = []
        for e in exps:
            # values for the experiment name
            values = [e]
            # values for the main tags
            for tag in tags:
                values.append(self.progress[e][tag].iloc[-1])
                if which == "max":
                    values.append(self.progress[e][tag].max())
                    values.append(self.progress[e][tag].idxmax())
                elif which == "min":
                    values.append(self.progress[e][tag].min())
                    values.append(self.progress[e][tag].idxmin())
                elif which == "median":
                    values.append(self.progress[e][tag].median())
                elif which == "last":
                    values.append(self.progress[e][tag].iloc[-1])
                else:
                    raise RuntimeError("Invalid value for which='{}'".format(which))
            # add number of epochs
            values.append(self.progress[e]["training_iteration"].iloc[-1])
            # remaining values come from the experiment parameters
            for v in self.params[e].values():
                if isinstance(v, list):
                    values.append(np.mean(v))
                else:
                    values.append(v)
            all_values.append(values)

        return pd.DataFrame(all_values, columns=columns)

    def get_checkpoint_file(self, exp_substring=""):
        """
        For every experiment whose name matches exp_substring, return the
        full path to the checkpoint file. Returns a list of paths.
        """
        # Collect experiment names that contain exp_substring
        exps = [e for e in self.progress if exp_substring in e]
        return [self.checkpoint_directories[e] for e in exps]

    def _get_experiment_states(self, experiment_path, exit_on_fail=False):
        """
        Return every experiment state JSON file in the path as a list of dicts.
        The list is sorted such that newer experiments appear later.
        """
        experiment_path = os.path.expanduser(experiment_path)
        experiment_state_paths = glob.glob(
            os.path.join(experiment_path, "experiment_state*.json"))
        if not experiment_state_paths:
            if exit_on_fail:
                print("No experiment state found!")
                sys.exit(0)
            else:
                return

        experiment_state_paths = sorted(experiment_state_paths)
        experiment_states = []
        for experiment_filename in experiment_state_paths:
            with open(experiment_filename) as f:
                experiment_states.append(json.load(f))
        return experiment_states

    def get_parameters(self, sorted_experiments):
        """
        Print the first cnn_percent_on value for each experiment, followed by
        the test and noise accuracies, in the order given by sorted_experiments.
        """
        for _, e in sorted_experiments.iterrows():
            if e["Experiment Name"] in self.params:
                params = self.params[e["Experiment Name"]]
                print(params["cnn_percent_on"][0])
        print("test_accuracy")
        for _, e in sorted_experiments.iterrows():
            print(e["test_accuracy"])
        print("noise_accuracy")
        for _, e in sorted_experiments.iterrows():
            print(e["noise_accuracy"])

    def best_experiments(self, min_test_accuracy=0.86, min_noise_accuracy=0.785,
                         sort_by="noise_accuracy"):
        """
        Return a DataFrame containing all experiments whose best test_accuracy
        and noise_accuracy are above the specified thresholds, sorted by sort_by.
        """
        best_accuracies = self.get_value()
        best_accuracies.sort_values(sort_by, axis=0, ascending=False,
                                    inplace=True, na_position="last")
        best_experiments = pd.DataFrame(columns=best_accuracies.columns)
        for _, row in best_accuracies.iterrows():
            if ((row["test_accuracy"] > min_test_accuracy)
                    and (row["noise_accuracy"] > min_noise_accuracy)):
                best_experiments = best_experiments.append(row)
        return best_experiments

    def prune_checkpoints(self, max_test_accuracy=0.86, max_noise_accuracy=0.785):
        """
        TODO: delete the checkpoints for all models whose best test_accuracy and
        noise_accuracy are below the specified thresholds.
        """
        pass
In [ ]:
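For reference, a minimal usage sketch of the browser (not executed in the original notebook): fetch the checkpoint paths of the runs that cleared the accuracy thresholds, e.g. to reload the models later.
In [ ]:
# Hypothetical follow-up: checkpoint files for the thresholded runs.
best = browser.best_experiments(min_test_accuracy=0.86, min_noise_accuracy=0.785)
checkpoints = [browser.checkpoint_directories.get(name, "")
               for name in best["Experiment Name"]]
[c for c in checkpoints if c]  # drop runs that never wrote a checkpoint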