# In [ ]:
import csv
import imp
import os
import re
import sys

import numpy as np
import pandas as pd
import yaml

from rf import *
from svm import *
# Load the local 'read_model_yaml.py' file as a module; execute_model() calls
# modl.read_model_yaml() to parse the model specification.
modl = imp.load_source('read_model_yaml', 'read_model_yaml.py')
# Take the YAML file location from the first command-line argument.
# NOTE(review): this runs at module level and raises IndexError when no
# argument is supplied.
inp_yaml = sys.argv[1]
def write_results_txt(filename, result):
    """
    Write each item of ``result`` to a plain-text file.

    Every item is formatted with ``%s`` and followed by an empty line. An
    existing file at ``filename`` is overwritten, and a missing parent
    directory is created before the file is opened.

    Parameters
    ----------
    filename : string
        Path of the text file to write.
    result : iterable
        Items to write, in order; each one becomes one entry in the file.
    """
    # Create the destination folder so that the first run for a new output
    # folder does not fail on a missing directory. A bare file name has no
    # directory part, in which case there is nothing to create.
    out_dir = os.path.dirname(filename)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(filename, "w") as fp:
        # Wrap each item in a 1-tuple so that tuple-valued items are written
        # as a whole instead of being unpacked as format arguments.
        fp.writelines("%s\n\n" % (item,) for item in result)
def _load_features_and_labels(data_source, source_col):
    """Read one gzip-compressed CSV and split it into features and labels."""
    frame = pd.read_csv("data/output/model_clean_data/" + data_source,
                        compression='gzip', index_col=None)
    # Features are every column except 'labels', the per-set marker column
    # ('train.csv' or 'test.csv'), 'index' and 'Time'.
    features = np.array(
        frame.drop(['labels', source_col, 'index', 'Time'], axis=1))
    labels = np.array(frame[['labels']])[:, 0]
    return features, labels


def execute_model(inp_yaml):
    """Run the model described by a YAML specification and save its result.

    The specification is parsed with ``modl.read_model_yaml``. The train and
    test sets it names are read from ``data/output/model_clean_data/``, then
    ``rf`` (``model_type == "RF"``) or ``svm`` (``model_type == "SVM"``) is
    called and its result is written with ``write_results_txt``.

    Parameters
    ----------
    inp_yaml : string
        Path to the YAML file with the model specifications. The path is
        split on "/" and components 2 and 3 are used as the output folder
        and file name, so it needs at least four components
        (``<a>/<b>/<folder>/<file>.yaml``); shorter paths raise IndexError.

    Returns
    -------
    None
        The result is written to ``data/output/<folder>/<file>.txt``.
    """
    # Read in and parse all parameters from the YAML file
    yaml_params = modl.read_model_yaml(inp_yaml)

    # Define output file name based on input
    path_parts = inp_yaml.split("/")
    folder_name = path_parts[2]
    # Strip only the final extension so ".yaml" and ".yml" are both handled.
    file_name = path_parts[3].rsplit(".", 1)[0]
    output_txt_file = 'data/output/' + folder_name + '/' + file_name + '.txt'

    #-------------------------------------------------
    # Create Train and Test Datasets
    #-------------------------------------------------
    train_data_source = yaml_params["train_data_source"]
    test_data_source = yaml_params["test_data_source"]
    X_train, Y_train = _load_features_and_labels(train_data_source,
                                                 'train.csv')
    X_test, Y_test = _load_features_and_labels(test_data_source, 'test.csv')

    model_type = yaml_params["model_type"]

    #-------------------------------------------------
    # Run RF (RANDOM FOREST)
    #-------------------------------------------------
    if model_type == "RF":
        # Extract the RF model variables from the YAML file
        # NOTE(review): an 'n_jobs' entry in the YAML is not forwarded to
        # rf(); confirm whether rf() should receive it.
        rf_params = yaml_params["parameters"]
        n_estimators = rf_params["n_estimators"]
        criterion = rf_params["criterion"]
        max_features = rf_params["max_features"]
        max_depth = rf_params["max_depth"]

        print("running RF WITHOUT simulation...")
        result = rf(X_train=X_train,
                    Y_train=Y_train,
                    X_test=X_test,
                    Y_test=Y_test,
                    n_estimators=n_estimators,
                    criterion=criterion,
                    max_features=max_features,
                    max_depth=max_depth)
        print("finished - rf without simulation")

        # Write into text file
        write_results_txt(output_txt_file, result)

    #-------------------------------------------------
    # Run SVM (SUPPORT VECTOR MACHINE)
    #-------------------------------------------------
    elif model_type == "SVM":
        # Extract the SVM model variables from the YAML file
        svm_params = yaml_params["parameters"]
        kernel = svm_params["kernel"]
        degree = svm_params["degree"]
        gamma = svm_params["gamma"]
        tol = svm_params["tol"]

        print("running SVM WITHOUT simulation...")
        # Run a single simulation
        result = svm(X_train=X_train,
                     Y_train=Y_train,
                     X_test=X_test,
                     Y_test=Y_test,
                     kernel=kernel,
                     C=1.0,
                     degree=degree,
                     gamma=gamma,
                     tol=tol,
                     decision_function_shape='ovr')

        # Write into text file
        write_results_txt(output_txt_file, result)
        print("finished - SVM without simulation")

    else:
        # Make it visible that nothing was run instead of exiting silently.
        print("model_type %r is not handled (expected 'RF' or 'SVM'); "
              "no model was run" % (model_type,))
# Run the model for the YAML file given on the command line.
# NOTE(review): this call is not guarded by `if __name__ == "__main__":`, so
# it also runs whenever this file is imported.
execute_model(inp_yaml)