In [7]:
import pandas as pd
import os
import pickle

In [2]:
datasets = ["production", "insurance", "sepsis_cases", "bpic2011", "bpic2015",
            "bpic2012_declined", "bpic2012_accepted", "bpic2012_cancelled",
            "bpic2017_refused", "bpic2017_accepted", "bpic2017_cancelled",
            "traffic_fines_1", "hospital_billing_2", "hospital_billing_3"]

bucket_methods = ["single", "prefix", "state", "cluster", "knn"]
cls_encodings = ["laststate", "agg", "index"]
cls_methods = ["rf", "xgboost", "logit", "svm"]

In [26]:
for dataset_name in datasets:
    for bucket_method in bucket_methods:
        for cls_encoding in cls_encodings:
            for cls_method in cls_methods:
                optimal_params_filename = os.path.join("cv_results_revision", "optimal_params_%s_%s_%s_%s.pickle" % (cls_method, dataset_name, bucket_method, cls_encoding))
                # skip combinations whose optimal params have already been extracted
                if not os.path.isfile(optimal_params_filename) or os.path.getsize(optimal_params_filename) <= 0:
                    trials_filename = 'cv_results_revision/param_optim_all_trials_%s_%s_%s_%s.csv' % (cls_method, dataset_name, bucket_method, cls_encoding)
                    # report missing or (near-)empty trial files and move on
                    if not os.path.isfile(trials_filename) or os.path.getsize(trials_filename) <= 52:
                        print(trials_filename)
                        continue
                    data = pd.read_csv(trials_filename, sep=";")
                    # best score over the actual hyperparameter rows (ignoring the bookkeeping "processing_time" rows)
                    best_score = data[data.param != "processing_time"].score.max()
                    # collect the param -> value pairs of the best-scoring trial
                    best_params = {row["param"]: row["value"]
                                   for _, row in data[data.score == best_score][["param", "value"]].iterrows()}
                    # write the optimal params to a pickle file
                    with open(optimal_params_filename, "wb") as fout:
                        pickle.dump(best_params, fout)

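The pickles written above can be loaded back for a quick sanity check. The cell below is a minimal sketch, not part of the original pipeline; it assumes the rf / production / single / laststate combination was extracted successfully (any other combination from the lists above works the same way).

In [ ]:
import os
import pickle

check_filename = os.path.join("cv_results_revision", "optimal_params_rf_production_single_laststate.pickle")
if os.path.isfile(check_filename):
    with open(check_filename, "rb") as fin:
        best_params = pickle.load(fin)
    # flat dict: hyperparameter name -> optimized value
    print(best_params)
else:
    print("%s not found" % check_filename)
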
In [ ]:

Extract the best parameters for prefix-based bucketing, one parameter set per prefix length


In [17]:
datasets = ["hospital_billing_2"]

bucket_methods = ["prefix"]
cls_encodings = ["index", "laststate", "agg"]
cls_methods = ["rf"]

In [18]:
for dataset_name in datasets:
    for bucket_method in bucket_methods:
        for cls_encoding in cls_encodings:
            for cls_method in cls_methods:
                optimal_params_filename = os.path.join("cv_results_revision_prefix", "optimal_params_%s_%s_%s_%s.pickle" % (cls_method, dataset_name, bucket_method, cls_encoding))
                trials_filename = 'cv_results_revision/param_optim_all_trials_%s_%s_%s_%s.csv' % (cls_method, dataset_name, bucket_method, cls_encoding)
                print(trials_filename)
                if os.path.isfile(trials_filename) and os.path.getsize(trials_filename) > 0:
                    data = pd.read_csv(trials_filename, sep=";")
                    # drop the bookkeeping "processing_time" rows
                    data = data[data.param != "processing_time"]
                    best_params = {}
                    print(data.columns)
                    if "nr_events" in data.columns:
                        # with prefix-based bucketing, a separate model is tuned per prefix length (nr_events)
                        for nr_events, group in data.groupby("nr_events"):
                            # keep the best-scoring rows, then take the first optimization iteration among them
                            best_rows = group[group.score == group.score.max()]
                            first_best_iter = list(best_rows.groupby("iter"))[0][1]
                            best_params[nr_events] = {row["param"]: row["value"]
                                                      for _, row in first_best_iter[["param", "value"]].iterrows()}
                        # write the per-prefix-length optimal params to a pickle file
                        with open(optimal_params_filename, "wb") as fout:
                            pickle.dump(best_params, fout)


cv_results_revision/param_optim_all_trials_rf_hospital_billing_2_prefix_index.csv
Index(['iter', 'dataset', 'cls', 'method', 'nr_events', 'param', 'value',
       'score'],
      dtype='object')
cv_results_revision/param_optim_all_trials_rf_hospital_billing_2_prefix_laststate.csv
Index(['iter', 'dataset', 'cls', 'method', 'nr_events', 'param', 'value',
       'score'],
      dtype='object')
cv_results_revision/param_optim_all_trials_rf_hospital_billing_2_prefix_agg.csv
Index(['iter', 'dataset', 'cls', 'method', 'nr_events', 'param', 'value',
       'score'],
      dtype='object')
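
The per-prefix-length pickles can be inspected in the same way. The cell below is a minimal sketch, not part of the original pipeline; it assumes the rf / hospital_billing_2 / prefix / index pickle was written to cv_results_revision_prefix by the cell above.

In [ ]:
import os
import pickle

check_filename = os.path.join("cv_results_revision_prefix", "optimal_params_rf_hospital_billing_2_prefix_index.pickle")
if os.path.isfile(check_filename):
    with open(check_filename, "rb") as fin:
        best_params = pickle.load(fin)
    # nested dict: prefix length (nr_events) -> {hyperparameter name -> optimized value}
    for nr_events, params in sorted(best_params.items()):
        print(nr_events, params)
else:
    print("%s not found" % check_filename)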

In [ ]: