In [1]:
import sys
import imp
import yaml
import csv
import pandas as pd
import re
from rf import *
from svm import *
#from nnet import *
modl = imp.load_source('read_model_yaml', 'read_model_yaml.py')

In [2]:
# Path of the model-spec YAML file, taken from the first command-line argument.
# NOTE(review): presumably this is meant to be executed as a script; inside an
# interactive kernel sys.argv[1] may not be a spec path -- confirm.
# Example value for interactive use:
#inp_yaml = "model/spec/SS/SS_RF_1.yaml"
inp_yaml = sys.argv[1]

In [3]:
# Load the cleaned test and train tables.
# NOTE(review): the files are named *.tar.gz but are read as gzip-compressed
# CSV -- confirm they are plain gzipped CSVs and not real tar archives.
df_test = pd.read_csv(
    "data/output/model_clean_data/test.tar.gz",
    compression='gzip',
    index_col=None,
)
df_train = pd.read_csv(
    "data/output/model_clean_data/train.tar.gz",
    compression='gzip',
    index_col=None,
)

# Split each table into a feature matrix (every column except 'labels')
# and a 1-D vector holding the 'labels' column.
X_test = np.array(df_test.drop(['labels'], axis=1))
Y_test = np.array(df_test['labels'])
X_train = np.array(df_train.drop(['labels'], axis=1))
Y_train = np.array(df_train['labels'])

In [4]:
# Print the source code of the imported `rf` function for reference.
import inspect
print(inspect.getsource(rf))


def rf(X_train,
       Y_train,
       X_test,
       Y_test,
       n_estimators=10,
       criterion="gini",
       max_features="auto",
       max_depth=-1,
       n_jobs=1):
    """
    Fit a random forest on the training set and evaluate it on the test set.

    Features are min-max scaled before fitting. The scaler is fitted on the
    training features only and then applied to the test features, so both
    sets share the same scaling.

    Parameters
    ----------
    X_train       : array-like
        features for the training set
    Y_train       : array-like
        labels for the training set
    X_test        : array-like
        features for the test set
    Y_test        : array-like
        labels for the test set
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.
    criterion : string, optional (default="gini")
        The function to measure the quality of a split.
        Supported criteria are "gini" for the Gini impurity and "entropy"
        for the information gain.
    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:
        If int, then consider max_features features at each split.
        If float, then max_features is a percentage and int(max_features * n_features)
        features are considered at each split.
        If "auto", then max_features=sqrt(n_features).
        If "sqrt", then max_features=sqrt(n_features) (same as "auto").
        If "log2", then max_features=log2(n_features).
        If None, then max_features=n_features.
    max_depth : integer or None, optional (default=-1)
        The maximum depth of the tree. The value -1 is a sentinel that is
        converted to None before being handed to the classifier.
        If None, then nodes are expanded until all leaves are pure or
        until all leaves contain less than min_samples_split samples.
        Ignored if max_leaf_nodes is not None.
    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both fit and predict.
        If -1, then the number of jobs is set to the number of cores.

    Returns
    -------
    list
        Flat list of alternating names and values, in this order:
        "confusion_matrix"      , confusion matrix over the training labels
        "classification_report" , text classification report
        "logloss"               , logarithmic loss
        "miss_err"              , misclassification error rate (1 - accuracy)
    """
    # -1 stands for "no depth limit".
    if max_depth == -1:
        max_depth = None

    labels = np.unique(Y_train)

    ## # Scale Data
    # Fit the scaler on the training data only, then reuse it for the test
    # data; fitting it again on the test set would scale the two differently.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    ## # Run rf
    # Define classifier (named `clf` so it does not shadow this function)
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_features=max_features,
                                 max_depth=max_depth,
                                 n_jobs=n_jobs)
    # Fit
    clf.fit(X_train, Y_train)
    # Predict
    Y_hat = clf.predict(X_test)
    Y_probs = clf.predict_proba(X_test)

    ## # Misclassification error rate
    miss_err = 1 - accuracy_score(Y_test, Y_hat)
    ## # Log Loss
    # `10^(-15)` is bitwise XOR in Python (it evaluates to -5); use a float literal.
    eps = 1e-15
    logloss = log_loss(Y_test, Y_probs, eps=eps)

    ## confusion_matrix
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat,
                                         labels=labels)

    # classification_report
    classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

    # Output results as a flat [name, value, name, value, ...] list
    result = [
        "confusion_matrix", confusion_matrix1,
        "classification_report", classification_report1,
        "logloss", logloss,
        "miss_err", miss_err,
    ]
    return result


In [5]:
def write_results_txt(filename, result):
    """
    Dump every entry of ``result`` into a plain-text file.

    Parameters
    ----------
    filename : string
        path of the file to write; it is opened in "w" mode, so any
        existing content is replaced
    result : iterable
        entries to write; each one is formatted with ``%s`` and followed
        by an empty line
    """
    with open(filename, "w") as out_file:
        out_file.writelines("%s\n\n" % entry for entry in result)


def run_model(inp_yaml,X_train,Y_train,X_test,Y_test):
    """Run the model described by a YAML spec and write its results to disk.

        The spec is parsed with ``modl.read_model_yaml``. Its "model_type"
        entry selects the random forest ("RF") or support vector machine
        ("SVM") branch, and its "simulations" entry selects between the
        ``*_simulation`` function and the single-run function. Any other
        model type is silently ignored.

        Parameters
        ----------
        inp_yaml : string
            path of the YAML file with the model specifications; its parent
            directory name and base name (without extension) determine the
            output file ``data/output/<folder>/<name>.txt``
        X_train, Y_train : features and labels of the training set,
            passed through unchanged to the model function
        X_test, Y_test : features and labels of the test set,
            passed through unchanged to the model function

        Returns
        -------
        None
            Results are written to the output file as a side effect.
        """
    # Local import keeps this cell self-contained.
    import os

    # Define output file name based on the input spec path, e.g.
    # "model/spec/SS/SS_RF_1.yaml" -> "data/output/SS/SS_RF_1.txt".
    # (Previously the example path was hard-coded, so every spec wrote to
    # the same file.)
    folder_name = os.path.basename(os.path.dirname(inp_yaml))
    file_name   = os.path.splitext(os.path.basename(inp_yaml))[0]
    output      = 'data/output/'+folder_name+'/'+file_name+'.txt'

    # Make sure the destination directory exists before anything is written.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    # Read in and parse all parameters from the YAML file
    yaml_params = modl.read_model_yaml(inp_yaml)
    model_type  = yaml_params["model_type"]

    #-------------------------------------------------
    # Run RF (RANDOM FOREST)
    #-------------------------------------------------

    if model_type == "RF":

        # Extract the RF model variables from the YAML file
        n_estimators  = yaml_params["parameters"]["n_estimators"]
        criterion     = yaml_params["parameters"]["criterion"]
        max_features  = yaml_params["parameters"]["max_features"]
        max_depth     = yaml_params["parameters"]["max_depth"]
        n_jobs        = yaml_params["parameters"]["n_jobs"]

        # Run many simulations in parallel using as many cores as necessary
        if yaml_params["simulations"]:
            print("running RF WITH simulation...")
            # Run simulation
            # NOTE(review): n_jobs is not forwarded here because the
            # signature of rf_simulation is not visible -- confirm whether
            # it accepts n_jobs.
            result = rf_simulation(X_train        = X_train
                                   , Y_train      = Y_train
                                   , X_test       = X_test
                                   , Y_test       = Y_test
                                   , n_estimators = n_estimators
                                   , criterion    = criterion
                                   , max_features = max_features
                                   , max_depth    = max_depth)

            print("finished - RF WITH simulation")
            # Write into txt
            write_results_txt(output, result)

        # Run a single simulation
        else:
            print("running RF WITHOUT simulation...")
            # Run simulation (n_jobs from the spec is now actually used)
            result = rf(X_train        = X_train
                        , Y_train      = Y_train
                        , X_test       = X_test
                        , Y_test       = Y_test
                        , n_estimators = n_estimators
                        , criterion    = criterion
                        , max_features = max_features
                        , max_depth    = max_depth
                        , n_jobs       = n_jobs)

            print("finished - rf without simulation")
            # Write into txt
            write_results_txt(output, result)

    #-------------------------------------------------
    # Run SVM (SUPPORT VECTOR MACHINE)
    #-------------------------------------------------

    elif model_type == "SVM":
        # Extract the SVM model variables from the YAML file
        kernel  = yaml_params["parameters"]["kernel"]
        degree  = yaml_params["parameters"]["degree"]
        gamma   = yaml_params["parameters"]["gamma"]
        tol     = yaml_params["parameters"]["tol"]

        # Define labels of output
        labels = ["logloss"
                  , "miss_err"
                  , "prec"
                  , "recall"
                  , "f1"
                  , "C"
                  , "kernel"
                  , "degree"
                  , "gamma"
                  , "tol"
                  , "decision_function_shape"]

        # NOTE(review): write_results_csv is not defined in this notebook;
        # presumably it is provided by one of the star imports -- confirm,
        # otherwise this branch raises NameError after the model has run.

        # Run many simulations in parallel using as many cores as necessary
        if yaml_params["simulations"]:
            # Run simulation
            result = svm_simulation(X_train                   = X_train
                                    , Y_train                 = Y_train
                                    , X_test                  = X_test
                                    , Y_test                  = Y_test
                                    , kernel                  = kernel
                                    , C                       = 1.0
                                    , degree                  = degree
                                    , gamma                   = gamma
                                    , tol                     = tol
                                    , decision_function_shape ='ovr')

            # Write into csv
            write_results_csv(output, result, labels)

        # Run a single simulation
        else:
            # Run simulation
            result = svm(X_train        = X_train
                         , Y_train      = Y_train
                         , X_test       = X_test
                         , Y_test       = Y_test
                         , kernel       = kernel
                         , C            = 1.0
                         , degree       = degree
                         , gamma        = gamma
                         , tol          = tol
                         , decision_function_shape='ovr')

            # Write into csv
            write_results_csv(output, result, labels)

In [6]:
# Run the model described by the spec at `inp_yaml` on the train/test arrays
# loaded above; run_model writes the results to an output file.
run_model(inp_yaml,X_train,Y_train,X_test,Y_test)


running RF WITHOUT simulation...
finished - rf without simulation
/Users/shamindra/anaconda3/envs/py3_stat222_finance/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [ ]: