In [3]:
import csv
import imp
import os
import re
import sys

import numpy as np
import pandas as pd
import yaml

from rf import *
# Load the project-local YAML spec reader from its source file; it is used
# below as modl.read_model_yaml(...).
# NOTE(review): the `imp` module is deprecated since Python 3.4 and removed in
# Python 3.12 (importlib is the replacement) — confirm the target Python version.
modl = imp.load_source('read_model_yaml', 'read_model_yaml.py')

In [4]:
# Path to the YAML model specification that run_model() will read.
inp_yaml = "model/spec/SS/SS_RF_1.yaml"
# Alternative when running as a script: take the path from the command line.
#inp_yaml = sys.argv[1]

In [5]:
# Read the cleaned test and train splits from their gzip-compressed files
df_test = pd.read_csv(
    "data/output/model_clean_data/test.tar.gz",
    compression='gzip',
    index_col=None,
)
df_train = pd.read_csv(
    "data/output/model_clean_data/train.tar.gz",
    compression='gzip',
    index_col=None,
)


def _split_features_labels(frame):
    """Return (features, targets) for one split.

    features : every column except 'labels', as a 2-D array
    targets  : the 'labels' column, as a 1-D array
    """
    features = np.array(frame.drop(['labels'], axis=1))
    targets = np.array(frame[['labels']])[:, 0]
    return features, targets


# Feature matrices and label vectors for both splits
X_test, Y_test = _split_features_labels(df_test)
X_train, Y_train = _split_features_labels(df_train)

In [18]:
def write(filename,results,labels):
    """
    Write results into a csv file: a header line followed by one row per result.

    Parameters
    ----------
    filename : string
        filename to output the result
    results : list or numpy array
        results of some simulation. A 1-D input is written as a single row,
        a 2-D input as one row per entry; higher-dimensional input is
        flattened to one row per leading index.
    labels : list
        labels for the results, i.e. names of parameters and metrics. They are
        written comma-separated as the first line. If empty or None, no
        header line is written.
    """
    # np.asarray accepts plain lists (which have no .tofile), and atleast_2d
    # turns a single 1-D result into one csv row instead of one long line.
    table = np.atleast_2d(np.asarray(results))
    if table.ndim > 2:
        # savetxt only handles up to 2-D; keep one row per leading index.
        table = table.reshape(table.shape[0], -1)

    header = ','.join(str(label) for label in labels) if labels else ''

    # fmt='%s' keeps each value's plain string form and also works for
    # string/object arrays; comments='' stops savetxt from prefixing the
    # header with '# '.
    np.savetxt(filename, table, fmt='%s', delimiter=',',
               header=header, comments='')


def _output_csv_path(inp_yaml):
    """Map a spec path to its results path, e.g.
    'model/spec/SS/SS_RF_1.yaml' -> 'data/output/SS/SS_RF_1.csv'."""
    folder_name = os.path.basename(os.path.dirname(inp_yaml))
    file_name = os.path.splitext(os.path.basename(inp_yaml))[0]
    return 'data/output/' + folder_name + '/' + file_name + '.csv'


def run_model(inp_yaml,X_train,Y_train,X_test,Y_test):
    """Run the model described by a YAML specification and write the results
    to a csv file.

    The output file is 'data/output/<spec folder>/<spec name>.csv', where the
    folder and name are taken from ``inp_yaml``.

    Parameters
    ----------
    inp_yaml : string
        path to a yaml file with model specifications; it is read with
        ``modl.read_model_yaml`` and must provide the keys "model_type",
        "simulations" and "parameters"
    X_train, Y_train : training features and labels, passed to the model
    X_test, Y_test : test features and labels, passed to the model

    Returns
    -------
    None. The results are written to the csv file via ``write``.

    Raises
    ------
    ValueError
        if the specification's "model_type" is not "RF" (the only model type
        handled here)
    """
    # Derive the output file name from the spec that was actually passed in.
    output = _output_csv_path(inp_yaml)

    yaml_params = modl.read_model_yaml(inp_yaml)

    model_type = yaml_params["model_type"]
    if model_type != "RF":
        # Fail loudly instead of returning without writing any output.
        raise ValueError("Unsupported model_type %r in %s (expected 'RF')"
                         % (model_type, inp_yaml))

    parameters = yaml_params["parameters"]

    # Arguments shared by the simulation and single-run entry points.
    # NOTE(review): the spec's "n_jobs" parameter is not forwarded to either
    # call, although "n_jobs" is added as a label for single runs below —
    # confirm against rf() / rf_simulation().
    model_kwargs = dict(X_train      = X_train,
                        Y_train      = Y_train,
                        X_test       = X_test,
                        Y_test       = Y_test,
                        n_estimators = parameters["n_estimators"],
                        criterion    = parameters["criterion"],
                        max_features = parameters["max_features"],
                        max_depth    = parameters["max_depth"])

    # Define labels of output
    labels = ["logloss",
              "miss_err",
              "prec",
              "recall",
              "f1",
              "n_estimators",
              "criterion",
              "max_features",
              "max_depth"]

    if yaml_params["simulations"]:
        # Run many simulations
        result = rf_simulation(**model_kwargs)
    else:
        # Run a single simulation; wrap its result so it forms one row
        labels.append("n_jobs")
        result = np.array([rf(**model_kwargs)])

    # Write into csv
    write(output,result,labels)

In [19]:
# Fit and evaluate the model described by the spec, writing its results to csv
run_model(inp_yaml=inp_yaml,
          X_train=X_train,
          Y_train=Y_train,
          X_test=X_test,
          Y_test=Y_test)

In [ ]: