In [3]:
import sys
import imp
import yaml
import csv
import pandas as pd
import re
from rf import *
# Load the YAML spec reader as `modl`; run_model() calls modl.read_model_yaml().
# NOTE(review): the file-path argument is an empty string here, so there is no
# module file for load_source to read — fill in the path to the source file
# that defines read_model_yaml before running this cell.
modl = imp.load_source('read_model_yaml', '')

In [4]:
# Model specification file to run.
# Hard-coded for interactive notebook use; the commented-out alternative below
# takes the path from the first command-line argument instead.
inp_yaml = "model/spec/SS/SS_RF_1.yaml"
#inp_yaml = sys.argv[1]

In [5]:
# Read the cleaned test and train tables from disk.
# NOTE(review): the files are named *.tar.gz but are read with
# compression='gzip' — confirm they are plain gzip-compressed CSVs and not
# tar archives.
df_test = pd.read_csv(
    "data/output/model_clean_data/test.tar.gz",
    compression='gzip',
    index_col=None,
)
df_train = pd.read_csv(
    "data/output/model_clean_data/train.tar.gz",
    compression='gzip',
    index_col=None,
)


def _split_features_labels(frame):
    """Return (features, labels) arrays for a frame with a 'labels' column.

    features : every column except 'labels', as a 2-D array
    labels   : the 'labels' column, as a 1-D array
    """
    features = np.array(frame.drop(['labels'], axis=1))
    targets = np.array(frame[['labels']])[:, 0]
    return features, targets


# Split each table into the feature matrix and the label vector
X_test, Y_test = _split_features_labels(df_test)
X_train, Y_train = _split_features_labels(df_train)

In [18]:
def write(filename,results,labels):
    """
    Write results into csv file.

    The first line of the file is the header built from `labels`; every
    following line is one row of `results`.

    Parameters
    ----------
    filename : string
        filename to output the result
    results : list or numpy array
        results of some simulation. A 2-D input (list of rows / 2-D array)
        is written one row per line; a flat 1-D input is written as a
        single row.
    labels : list
        labels for the results, i.e. names of parameters and metrics
    """
    rows = list(results)

    # A flat sequence of scalars is one result row, not one row per value.
    # Strings are iterable but are still single cells, so exclude them.
    def _is_row(item):
        return hasattr(item, "__iter__") and not isinstance(item, (str, bytes))

    if rows and not all(_is_row(item) for item in rows):
        rows = [rows]

    # newline="" is what the csv module expects so that it controls line
    # endings itself (avoids blank lines between rows on Windows).
    with open(filename, "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(labels)
        writer.writerows(rows)

def run_model(inp_yaml,X_train,Y_train,X_test,Y_test):
    """Run the model described by a yaml specification and write the results to csv.

    The specification is read with modl.read_model_yaml. Only the "RF"
    model type is handled here; any other model type does nothing.

    Parameters
    ----------
    inp_yaml : string
        Path to a yaml file with model specifications. Its last two path
        components name the output file, e.g. "model/spec/SS/SS_RF_1.yaml"
        is written to "data/output/SS/SS_RF_1.csv".
    X_train, Y_train, X_test, Y_test :
        Training and test data, passed unchanged to rf / rf_simulation.

    The specification must provide "model_type", "simulations" and, under
    "parameters": "n_estimators", "criterion", "max_features", "max_depth"
    and "n_jobs".
    """
    # Define output file name based on input.
    # Derived from inp_yaml itself rather than a fixed example path, so every
    # specification gets its own output file.
    path_parts = inp_yaml.split("/")
    folder_name = path_parts[-2]
    file_name = path_parts[-1].rsplit(".", 1)[0]
    output = 'data/output/'+folder_name+'/'+file_name+'.csv'

    yaml_params = modl.read_model_yaml(inp_yaml)

    if yaml_params["model_type"] == "RF":
        parameters    = yaml_params["parameters"]
        n_estimators  = parameters["n_estimators"]
        criterion     = parameters["criterion"]
        max_features  = parameters["max_features"]
        max_depth     = parameters["max_depth"]
        # NOTE(review): n_jobs is read from the spec but neither call below
        # receives it — confirm whether rf_simulation should take it.
        n_jobs        = parameters["n_jobs"]

        # Define labels of output
        # NOTE(review): only the first label survives in the source; append
        # the remaining metric/parameter names in the order that
        # rf / rf_simulation return them.
        labels = ["logloss"]

        # Run many simulations in parallel using as many cores as necessary
        if yaml_params["simulations"]:
            # Run simulation
            result = rf_simulation(X_train      = X_train,
                                   Y_train      = Y_train,
                                   X_test       = X_test,
                                   Y_test       = Y_test,
                                   n_estimators = n_estimators,
                                   criterion    = criterion,
                                   max_features = max_features,
                                   max_depth    = max_depth)

            # Write into csv
            write(output, result, labels)
        # Run a single simulation
        else:
            # Run simulation
            result = rf(X_train      = X_train,
                        Y_train      = Y_train,
                        X_test       = X_test,
                        Y_test       = Y_test,
                        n_estimators = n_estimators,
                        criterion    = criterion,
                        max_features = max_features,
                        max_depth    = max_depth)
            # Wrap the single result so it is written as one row
            result = np.array([result])
            # Write into csv
            write(output, result, labels)

In [19]:
# Run the model

In [ ]: