In [7]:
import pandas as pd
import os
import pickle
In [2]:
# Sweep configuration: every combination of the four lists below is substituted
# into the result-file name templates by the extraction loops.
datasets = [
    "production",
    "insurance",
    "sepsis_cases",
    "bpic2011",
    "bpic2015",
    "bpic2012_declined",
    "bpic2012_accepted",
    "bpic2012_cancelled",
    "bpic2017_refused",
    "bpic2017_accepted",
    "bpic2017_cancelled",
    "traffic_fines_1",
    "hospital_billing_2",
    "hospital_billing_3",
]

# Bucketing method component of the file names.
bucket_methods = [
    "single",
    "prefix",
    "state",
    "cluster",
    "knn",
]

# Encoding component of the file names.
cls_encodings = [
    "laststate",
    "agg",
    "index",
]

# Classifier component of the file names.
cls_methods = [
    "rf",
    "xgboost",
    "logit",
    "svm",
]
In [26]:
def _best_params_overall(trials):
    """Return the parameters of the best-scoring trial as a ``{param: value}`` dict.

    ``trials`` is the frame read from a ``param_optim_all_trials_*.csv`` file and
    must provide ``param``, ``value`` and ``score`` columns.

    Rows whose ``param`` is ``"processing_time"`` are dropped before anything
    else, so they neither influence the maximum nor end up in the result.
    When several trials share the maximum score and an ``iter`` column is
    present, only the rows of the lowest ``iter`` are kept so that the returned
    values all come from a single trial.
    """
    candidates = trials[trials.param != "processing_time"]
    best_rows = candidates[candidates.score == candidates.score.max()]

    if "iter" in best_rows.columns:
        tied_iters = best_rows["iter"].dropna()
        if not tied_iters.empty:
            best_rows = best_rows[best_rows["iter"] == tied_iters.min()]

    # Read the columns by label: positional access such as row[0] on a
    # label-indexed Series is deprecated in recent pandas releases.
    return dict(zip(best_rows["param"], best_rows["value"]))


# Trials files of at most this many bytes are reported and skipped.
# NOTE(review): 52 looks like the size of a header-only CSV - confirm against
# the code that writes these files.
EMPTY_TRIALS_MAX_BYTES = 52

for dataset_name in datasets:
    for bucket_method in bucket_methods:
        for cls_encoding in cls_encodings:
            for cls_method in cls_methods:
                run_id = (cls_method, dataset_name, bucket_method, cls_encoding)
                optimal_params_filename = os.path.join("cv_results_revision", "optimal_params_%s_%s_%s_%s.pickle" % run_id)

                # A non-empty pickle from an earlier run is left untouched.
                if os.path.isfile(optimal_params_filename) and os.path.getsize(optimal_params_filename) > 0:
                    continue

                file = 'cv_results_revision/param_optim_all_trials_%s_%s_%s_%s.csv' % run_id
                if not os.path.isfile(file) or os.path.getsize(file) <= EMPTY_TRIALS_MAX_BYTES:
                    # Print the path of the missing / too-small input so it can be followed up.
                    print(file)
                    continue

                data = pd.read_csv(file, sep=";")
                best_params = _best_params_overall(data)

                # write to file
                with open(optimal_params_filename, "wb") as fout:
                    pickle.dump(best_params, fout)
In [ ]:
In [17]:
# Narrowed sweep for the cell below. These assignments replace any lists of the
# same names that were defined earlier in the kernel session.
cls_methods = [
    "rf",
]
cls_encodings = [
    "index",
    "laststate",
    "agg",
]
bucket_methods = [
    "prefix",
]
datasets = [
    "hospital_billing_2",
]
In [18]:
def _best_params_by_nr_events(trials):
    """Return ``{nr_events: {param: value}}`` holding the best trial per ``nr_events``.

    ``trials`` must provide ``nr_events``, ``iter``, ``param``, ``value`` and
    ``score`` columns. Within each ``nr_events`` group the rows with the maximum
    score are selected; if several ``iter`` values tie, the first group yielded
    by ``groupby("iter")`` (the lowest ``iter``) is used so that all values come
    from one trial. Groups without any row matching the maximum (e.g. all
    scores missing) are reported and left out of the result.
    """
    best_by_nr_events = {}
    for nr_events, group in trials.groupby("nr_events"):
        top_rows = group[group.score == group.score.max()]
        # Only the first tied trial is needed, so do not materialise every group.
        first_trial = next(iter(top_rows.groupby("iter")), None)
        if first_trial is None:
            print("no scored trials for nr_events=%s" % nr_events)
            continue
        trial_rows = first_trial[1]
        # Read the columns by label: positional access such as row[0] on a
        # label-indexed Series is deprecated in recent pandas releases.
        best_by_nr_events[nr_events] = dict(zip(trial_rows["param"], trial_rows["value"]))
    return best_by_nr_events


for dataset_name in datasets:
    for bucket_method in bucket_methods:
        for cls_encoding in cls_encodings:
            for cls_method in cls_methods:
                run_id = (cls_method, dataset_name, bucket_method, cls_encoding)
                optimal_params_filename = os.path.join("cv_results_revision_prefix", "optimal_params_%s_%s_%s_%s.pickle" % run_id)
                file = 'cv_results_revision/param_optim_all_trials_%s_%s_%s_%s.csv' % run_id
                print(file)

                # Nothing to extract from a missing or zero-byte trials file.
                if not os.path.isfile(file) or os.path.getsize(file) <= 0:
                    continue

                data = pd.read_csv(file, sep=";")
                # Timing rows are not parameters; drop them before ranking trials.
                data = data[data.param != "processing_time"]
                print(data.columns)

                # Without an nr_events column there is nothing to group by and
                # an empty dict is stored.
                best_params = {}
                if "nr_events" in data.columns:
                    best_params = _best_params_by_nr_events(data)

                # write to file (the output directory differs from the input one
                # and is created on first use)
                output_dir = os.path.dirname(optimal_params_filename)
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir)
                with open(optimal_params_filename, "wb") as fout:
                    pickle.dump(best_params, fout)
In [ ]: