The best model parameters are given by

author : SHAMINDRA
data_source_dir : SC_shuffle
test_type : validation
model_type : RF
RF:
    n_estimators : 100
    criterion : 'gini'
    max_features : 'auto'
    max_depth : 20
    n_jobs : 1
SVM:
    kernel : 'rbf'
    degree : 3
    gamma : 'auto'
    tol : 0.001
NNET:
    method1 : 'Tanh'
    neurons1 : 24
    method2 : 'Tanh'
    neurons2 : 39
    decay : 0.0001
    learning_rate : 0.001
    n_iter : 25
    random_state : 1

In [1]:
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import imp
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

from  sklearn.ensemble import RandomForestClassifier

import pandas as pd
import yaml
import re

We looked at the top features from the best performing random forest. They are as below:


In [2]:
# The top variables are:
var_importance = [(1, 'P_1_bid', 0.020001165389254737)
                  , (2, 'V_1_bid', 0.018358575666246449)
                  , (3, 'P_1_ask', 0.017058479215839299)
                  , (4, 'V_1_ask', 0.016953559068869958)
                  , (5, 'P_2_bid', 0.016908649059514971)
                  , (6, 'V_2_bid', 0.016219220215427665)
                  , (7, 'P_2_ask', 0.015039647893425838)
                  , (8, 'V_2_ask', 0.014497773408233052)
                  , (9, 'P_3_bid', 0.014321084019596746)
                  , (10, 'V_3_bid', 0.014158850118003859)
                  , (11, 'P_3_ask', 0.014101386932514923)
                  , (12, 'V_3_ask', 0.013911823640617986)
                  , (13, 'P_4_bid', 0.013838322603744435)
                  , (14, 'V_4_bid', 0.013668619218980316)
                  , (15, 'P_4_ask', 0.013413471959983998)]

In [3]:
# Train test datasets
train_ds_ref      = "data/output/model_clean_data/SC_shuffle/train_test_validation.tar.gz"
test_ds_ref       = "data/output/model_clean_data/SC_shuffle/strategy_validation.tar.gz"

# Open test and train sets
df_train = pd.read_csv(train_ds_ref
                       , compression='gzip', index_col = None)
df_test  = pd.read_csv(test_ds_ref
                       , compression='gzip', index_col = None)

# Drop the first columns - they are not useful
df_train_clean = df_train.iloc[:,1:]
df_test_clean  = df_test.iloc[:,1:]

In [4]:
def predict_simple_linear(df_train_clean, df_test_clean):
    X_train_cols  =  list(df_train_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                          , 'V_2_ask']].columns.values)

    X_train  =  np.array(df_train_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                              , 'V_2_ask']])
    Y_train  =  np.array(df_train_clean[['labels']])[:,0]

    X_test  =  np.array(df_test_clean[['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask', 'P_2_bid', 'V_2_bid', 'P_2_ask'
                                       , 'V_2_ask']])
    Y_test  =  np.array(df_test_clean[['labels']])[:,0]
    
    # Define the labels
    labels = np.unique(Y_train)

    ## # Scale Data
    scaler = MinMaxScaler()
    X_test = scaler.fit_transform(X_test)
    X_train = scaler.fit_transform(X_train)

    # Set up the data
    logreg = linear_model.LogisticRegression(C=1e5)

    # Fit
    logreg.fit(X_train, Y_train)

    # Predict
    Y_hat   = logreg.predict(X_test)
    Y_probs = logreg.predict_proba(X_test)

    ## # Misclassification error rate
    miss_err = 1-accuracy_score(Y_test, Y_hat)
    ## # Log Loss
    eps = 10^(-15)
    logloss = log_loss(Y_test, Y_probs, eps = eps)

    ##confusion_matrix
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                         , labels=labels)

    # classification_report
    classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

    # Output results in a list format
    result = []
    result.append("confusion_matrix")
    result.append(confusion_matrix1)
    result.append("classification_report")
    result.append(classification_report1)
    result.append("logloss")
    result.append(logloss)
    result.append("miss_err")
    result.append(miss_err)
    result.append("Y_hat")
    result.append(Y_hat)
    
    return result

In [5]:
linear_simple_predict = predict_simple_linear(df_train_clean = df_train_clean
                                              , df_test_clean = df_test_clean)

In [6]:
print(linear_simple_predict[3])


             precision    recall  f1-score   support

         -1       0.38      0.26      0.31     25829
          0       0.45      0.70      0.55     31757
          1       0.31      0.15      0.20     18721

avg / total       0.39      0.42      0.38     76307


In [6]:
# Get the predicted outcomes
linear_simple_predict_vals = linear_simple_predict[len(linear_simple_predict) -1]
#len(list(linear_simple_predict_vals))
len(list(linear_simple_predict_vals))-len(df_test_clean)


Out[6]:
0

In [7]:
def rf(X_train_cols,
       X_train,
       Y_train,
       X_test,
       Y_test,
       n_estimators=10,
       criterion="gini",
       max_features="auto",
       max_depth=-1,
       n_jobs=1):
    """
    Parameters
    ----------
    X_train_cols  : list of feature column names
        from the training set
    X_train       : pandas data frame
        data frame of features for the training set
    Y_train       : pandas data frame
        data frame of labels for the training set
    X_test        : pandas data frame
        data frame of features for the test set
    Y_test        : pandas data frame
        data frame of labels for the test set
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.
    criterion : string, optional (default=”gini”)
        The function to measure the quality of a split.
        Supported criteria are “gini” for the Gini impurity and “entropy”
        for the information gain.
    max_features : int, float, string or None, optional (default=”auto”)
        The number of features to consider when looking for the best split:
        If int, then consider max_features features at each split.
        If float, then max_features is a percentage and int(max_features * n_features)
        features are considered at each split.
        If “auto”, then max_features=sqrt(n_features).
        If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
        If “log2”, then max_features=log2(n_features).
        If None, then max_features=n_features.
    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree.
        If None, then nodes are expanded until all leaves are pure or
        until all leaves contain less than min_samples_split samples.
        Ignored if max_leaf_nodes is not None.
    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both fit and predict.
        If -1, then the number of jobs is set to the number of cores.

    Result:
    -------
    numpy array
        logloss    : averaged logarithmic loss
        miss_err   : missclassification error rate
        prec       : precision
        recall     : recall
        f1         : f1 score
        parameters : previous parameters in the order previously specified
    """
    if max_depth==-1:
        max_depth = None

    labels = np.unique(Y_train)

    ## # Run rf
    # Define classifier
    rf = RandomForestClassifier(n_estimators = n_estimators,
                                criterion    = criterion,
                                max_features = max_features,
                                max_depth    = max_depth,
                                n_jobs       = n_jobs)
    # Fit
    rf.fit(X_train, Y_train)

    # Predict
    Y_hat   = rf.predict(X_test)
    Y_probs = rf.predict_proba(X_test)

    ## # Misclassification error rate
    miss_err = 1-accuracy_score(Y_test, Y_hat)
    ## # Log Loss
    eps = 10^(-15)
    logloss = log_loss(Y_test, Y_probs, eps = eps)

    ##confusion_matrix
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat
                                         , labels=labels)

    # classification_report
    classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

    # Variable importance
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Return tuple of (rank, feature name, variable importance)
    var_importance = [(f+1, X_train_cols[f], importances[indices[f]]) for f in range(X_train.shape[1])]

    # Output results in a list format
    result = []
    result.append("confusion_matrix")
    result.append(confusion_matrix1)
    result.append("classification_report")
    result.append(classification_report1)
    result.append("number of trees")
    result.append(n_estimators)
    result.append("max depth")
    result.append(max_depth)
    result.append("logloss")
    result.append(logloss)
    result.append("miss_err")
    result.append(miss_err)
    result.append("var_importance")
    result.append(var_importance)
    result.append("Y_hat")
    result.append(Y_hat)    
    return result

In [8]:
def execute_model(inp_yaml):
    """Apply trees in the forest to X, return leaf indices.
        Parameters
        ----------
        inp_yaml : A yaml file with model specifications

        Returns
        -------
        parameters_dict : A python dictionary with the model specifications
                          to be used to encode metadata for the model
                          and pass into specific model functions e.g. random
                          forest
        """

    # Read in and parse all parameters from the YAML file
    yaml_params = read_model_yaml(inp_yaml)

    # Define output file name based on input
    folder_name = re.split("/", inp_yaml)[2]
    file_name   = re.split("/", inp_yaml)[3][:-5]
    output_txt_file = 'data/output/' + folder_name + '/' + file_name + '.txt'

    #-------------------------------------------------
    # Create Train and Test Datasets
    #-------------------------------------------------

    data_source_dir = yaml_params["data_source_dir"]
    test_type       = yaml_params["test_type"]

    print('data source dir is: %s' % (data_source_dir))
    print('test type is: %s' % (test_type))

    if test_type == "test":
        train_ds_name = "train.tar.gz"
        test_ds_name  = "test.tar.gz"
    elif test_type == "validation":
        train_ds_name = "train_test.tar.gz"
        test_ds_name  = "validation.tar.gz"
    else:
        #train_ds_name = "train_test_validation.tar.gz"
        #TODO - delete
        train_ds_name = "train_test.tar.gz"
        test_ds_name  = "strategy_validation.tar.gz"

    train_ds_ref      = "data/output/model_clean_data/" + data_source_dir + "/" + train_ds_name
    test_ds_ref       = "data/output/model_clean_data/" + data_source_dir + "/" + test_ds_name

    print('training dataset is: %s' % (train_ds_ref))
    print('test dataset is: %s' % (test_ds_ref))

    # Open test and train sets
    df_train = pd.read_csv(train_ds_ref
                           , compression='gzip', index_col = None)
    df_test  = pd.read_csv(test_ds_ref
                           , compression='gzip', index_col = None)

    # Drop the first columns - they are not useful
    df_train_clean = df_train.iloc[:,1:]
    df_test_clean  = df_test.iloc[:,1:]

    # Traning data column names - used for variale importance
    X_train_cols  =  list(df_train_clean.drop(['labels', 'index', 'Time'], axis=1).columns.values)

    # Define test/training set
    X_train  =  np.array(df_train_clean.drop(['labels', 'index', 'Time'], axis = 1))
    Y_train  =  np.array(df_train_clean[['labels']])[:,0]
    X_test   =  np.array(df_test_clean.drop(['labels', 'index', 'Time'], axis = 1))
    Y_test   =  np.array(df_test_clean[['labels']])[:,0]


    #-------------------------------------------------
    # Run RF (RANDOM FOREST)
    #-------------------------------------------------

    if yaml_params["model_type"] == "RF":

        # Extract the RF model variables from the YAML file
        n_estimators  = yaml_params["parameters"]["n_estimators"]
        criterion     = yaml_params["parameters"]["criterion"]
        max_features  = yaml_params["parameters"]["max_features"]
        max_depth     = yaml_params["parameters"]["max_depth"]
        n_jobs        = yaml_params["parameters"]["n_jobs"]

        print('number of trees is: %d' % (n_estimators))
        print('max depth is: %d' % (max_depth))

        print("running RF WITHOUT simulation...")

        # Run simulation
        result = rf(X_train_cols   = X_train_cols
                    , X_train      = X_train
                    , Y_train      = Y_train
                    , X_test       = X_test
                    , Y_test       = Y_test
                    , n_estimators = n_estimators
                    , criterion    = criterion
                    , max_features = max_features
                    , max_depth    = max_depth)

        print("finished - rf without simulation")

        return result
        # Write into text file
        #write_results_txt(output_txt_file, result)

    #-------------------------------------------------
    # Run SVM (SUPPORT VECTOR MACHINE)
    #-------------------------------------------------

    # Extract the SVM model variables from the YAML file
    if yaml_params["model_type"] == "SVM":
        kernel  = yaml_params["parameters"]["kernel"]
        degree  = yaml_params["parameters"]["degree"]
        gamma   = yaml_params["parameters"]["gamma"]
        tol     = yaml_params["parameters"]["tol"]
        C       = yaml_params["parameters"]["C"]

        print('The value of C is: %.2f' % (C))

        print("running SVM WITHOUT simulation...")

        # Run a single simulation
        result = svm(X_train        = X_train
                     , Y_train      = Y_train
                     , X_test       = X_test
                     , Y_test       = Y_test
                     , kernel       = kernel
                     , C            = C
                     , degree       = degree
                     , gamma        = gamma
                     , tol          = tol
                     , decision_function_shape='ovr')

        # Write into text file
        #write_results_txt(output_txt_file, result)
        return result
        print("finished - SVM without simulation")

In [9]:
def read_model_yaml(inp_yaml):
    """Apply trees in the forest to X, return leaf indices.
        Parameters
        ----------
        inp_yaml : A yaml file with model specifications

        Returns
        -------
        parameters_dict : A python dictionary with the model specifications
                          to be used to encode metadata for the model
                          and pass into specific model functions e.g. random
                          forest
        """
    # with open("../model/spec/SS/SS_RF_1.yaml") as stream:
    with open(inp_yaml) as stream:
        data = yaml.load(stream)
        parameters_dict = {
        "author"                   : data["author"]
        , "data_source_dir"        : data["data_source_dir"]
        , "model_type"             : data["model_type"]
        , "test_type"              : data["test_type"]        
        , "parameters"             : data[data["model_type"]]
        }
    return parameters_dict

In [10]:
result_best_RF = execute_model(inp_yaml = "./model/spec/TB/TB_RF_SC_shuffle_strategy_4.yaml")


data source dir is: SC_shuffle
test type is: strategy_validation
training dataset is: data/output/model_clean_data/SC_shuffle/train_test.tar.gz
test dataset is: data/output/model_clean_data/SC_shuffle/strategy_validation.tar.gz
number of trees is: 100
max depth is: 20
running RF WITHOUT simulation...
finished - rf without simulation

In [11]:
print(result_best_RF[3])


             precision    recall  f1-score   support

         -1       0.29      0.35      0.31     25829
          0       0.45      0.26      0.33     31757
          1       0.27      0.37      0.31     18721

avg / total       0.35      0.32      0.32     76307


In [12]:
test_ds_ref       = "data/output/model_clean_data/SC_shuffle/strategy_validation.tar.gz"
df_test           = pd.read_csv(test_ds_ref, compression='gzip', index_col = None)
df_test2          = df_test
df_test2["predicted"] = list(result_best_RF[len(result_best_RF) - 1])

In [13]:
# Direct Python to plot all figures inline (i.e., not in a separate window)
%matplotlib inline

# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Division of two integers in Python 2.7 does not return a floating point result. The default is to round down 
# to the nearest integer. The following piece of code changes the default.
from __future__ import division

def profit_calculator(data, delta_t = 30, simple = False):
    """Calculate the profit of trading strategy based on precisely the prediction of the model
        Parameters
        ----------
        data    : a data frame with "predicted" "P_1_bid" "P_1_ask"
        delta_t : time gap between 
        simple  : a dummy, True, means we make transection decisions only every delta_t period. False, means we track the current 
                  hand every period, only if we don't have anything at hand, we make new transactions

        Returns
        -------
        profit        : a numeric, the net profit at the end
        
        """    
    if simple == True:
        data_effective = data.loc[np.arange(len(data)) % delta_t == 0]
        bid            = data_effective['P_1_bid']
        ask            = data_effective['P_1_ask']
        trade_decision = data_effective['predicted'][:-1]
        buy_profit     = np.array(bid[1:]) - np.array(ask[:-1])
        sell_profit    = np.array(bid[:-1]) - np.array(ask[1:])
        profit         = sum((np.array(trade_decision) > 0) * buy_profit + (np.array(trade_decision) < 0) * sell_profit)
        return profit
    else:
        buy_profit           = np.array(data['P_1_bid'][delta_t:]) - np.array(data['P_1_ask'][:(-1 * delta_t)])
        sell_profit           = np.array(data['P_1_bid'][:(-1 * delta_t)]) - np.array(data['P_1_ask'][delta_t:])
        trade_decision_draft = data['predicted'][:(-1 * delta_t)]
        T                    = len(buy_profit)
        current_state        = [0] * T
        trade_decision       = [0] * T
        profit               = 0
        for i in range(T):
            if current_state[i] == 1:
                trade_decision[i] = 0
            else:
                trade_decision[i] = trade_decision_draft[i]
                    
            if i < T-1:
                current_state[i+1] = int(sum(abs(np.array(trade_decision[max(0, i + 1 - delta_t):i + 1]))) != 0)
        profit = sum((np.array(trade_decision) > 0) * buy_profit + (np.array(trade_decision) < 0) * sell_profit)
        return profit

In [14]:
profit_calculator(data = df_test2, delta_t = 1000, simple = False)


Out[14]:
-13.950000000000159

In [15]:
profit_calculator(data = df_test2, delta_t = 1000, simple = True)


Out[15]:
-11.989999999999782