In [3]:
import csv
import imp
import re
import sys

import numpy as np
import pandas as pd
import yaml

from rf import *
# Load read_model_yaml.py (path relative to the working directory) as the
# module object `modl`; run_model() uses modl.read_model_yaml() to parse the
# model specification.
modl = imp.load_source('read_model_yaml', 'read_model_yaml.py')
In [4]:
# Path of the yaml model specification to run
inp_yaml = "model/spec/SS/SS_RF_1.yaml"
# Alternative when run as a script: take the path from the first command-line argument
#inp_yaml = sys.argv[1]
In [5]:
# Load the cleaned test and train sets (read as gzip-compressed csv)
# NOTE(review): the files are named *.tar.gz but are read as plain gzip csv -
# confirm they are not actual tar archives.
df_test = pd.read_csv("data/output/model_clean_data/test.tar.gz",
                      compression='gzip',
                      index_col=None)
df_train = pd.read_csv("data/output/model_clean_data/train.tar.gz",
                       compression='gzip',
                       index_col=None)

# Split each set into a feature matrix (every column except 'labels')
# and a 1-D vector holding the 'labels' column
X_test, Y_test = np.array(df_test.drop('labels', axis=1)), np.array(df_test['labels'])
X_train, Y_train = np.array(df_train.drop('labels', axis=1)), np.array(df_train['labels'])
In [18]:
def write(filename,results,labels):
    """
    Write results into csv file.

    The values are written as one comma-separated line; a multi-dimensional
    array is flattened in row-major order.

    Parameters
    ----------
    filename : string
        filename to output the result
    results : list or numpy array
        results of some simulation
    labels : list
        labels for the results, i.e. names of parameters and metrics
        (currently unused, see the TODO below)
    """
    # TODO: labels as header
    # Coerce to an array first: a plain list has no ``tofile`` method even
    # though the documented contract accepts one.
    np.asarray(results).tofile(filename,sep=',')
def run_model(inp_yaml,X_train,Y_train,X_test,Y_test):
    """Run the model described by a yaml specification and save its results.

    The specification is read with ``modl.read_model_yaml``. Only the "RF"
    model type is handled; for any other model type nothing is run and
    nothing is written.

    Parameters
    ----------
    inp_yaml : string
        path of the yaml file with the model specification; its parent
        folder and its file name (without extension) define the output path
    X_train, Y_train : training set, passed through to the model function
    X_test, Y_test : test set, passed through to the model function

    Returns
    -------
    None
        The results are written to 'data/output/<folder>/<file>.csv'.
    """
    # Define output file name based on input, e.g.
    # "model/spec/SS/SS_RF_1.yaml" -> "data/output/SS/SS_RF_1.csv"
    path_parts = inp_yaml.split("/")
    folder_name = path_parts[-2]
    file_name = path_parts[-1].rsplit(".", 1)[0]
    output = 'data/output/'+folder_name+'/'+file_name+'.csv'

    yaml_params = modl.read_model_yaml(inp_yaml)

    if yaml_params["model_type"] == "RF":
        rf_params = yaml_params["parameters"]
        n_estimators = rf_params["n_estimators"]
        criterion = rf_params["criterion"]
        max_features = rf_params["max_features"]
        max_depth = rf_params["max_depth"]
        # NOTE(review): n_jobs is read from the spec but is not forwarded to
        # rf / rf_simulation - confirm whether those functions accept it.
        n_jobs = rf_params["n_jobs"]

        # Define labels of output
        labels = ["logloss",
                  "miss_err",
                  "prec",
                  "recall",
                  "f1",
                  "n_estimators",
                  "criterion",
                  "max_features",
                  "max_depth"]

        # Keyword arguments shared by both ways of running the model
        model_args = dict(X_train = X_train,
                          Y_train = Y_train,
                          X_test = X_test,
                          Y_test = Y_test,
                          n_estimators = n_estimators,
                          criterion = criterion,
                          max_features = max_features,
                          max_depth = max_depth)

        if yaml_params["simulations"]:
            # Run many simulations
            result = rf_simulation(**model_args)
        else:
            # Run a single simulation and wrap its result in a
            # one-element array before writing
            labels.append("n_jobs")
            result = np.array([rf(**model_args)])

        # Write into csv
        write(output,result,labels)
In [19]:
# Execute the model described by `inp_yaml` on the loaded train/test sets
run_model(inp_yaml=inp_yaml,
          X_train=X_train,
          Y_train=Y_train,
          X_test=X_test,
          Y_test=Y_test)
In [ ]: