In [1]:
import sys
import imp
import yaml
import csv
import pandas as pd
import re
import numpy as np
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
def label_percent(v):
    """Print the fraction of positive, negative, and zero entries in ``v``.

    Parameters
    ----------
    v : sequence of numbers
        Typically a 1-D array/list of class labels in {-1, 0, 1}.

    Raises
    ------
    ValueError
        If ``v`` is empty (the original crashed with ZeroDivisionError).
    """
    n = len(v)
    if n == 0:
        raise ValueError("label_percent() requires a non-empty sequence")
    pos = sum(x > 0 for x in v) / n
    neg = sum(x < 0 for x in v) / n
    zero = sum(x == 0 for x in v) / n
    # Single format string instead of concatenating four .format() results;
    # output text is kept byte-identical to the original.
    print("Pos:{0:.2f}; Neg:{1:.2f}; Zero:{2:.2f};".format(pos, neg, zero))

In [3]:
# Open test and train sets (pandas decompresses gzip transparently).
# NOTE(review): the files are named .tar.gz but are read with plain gzip
# decompression — presumably they are gzipped CSVs rather than real tar
# archives; confirm, since actual tar headers would corrupt the parse.
df_train = pd.read_csv("data/output/model_clean_data/train1.tar.gz", compression='gzip', index_col = None)
df_test  = pd.read_csv("data/output/model_clean_data/test1.tar.gz" , compression='gzip', index_col = None)

In [4]:
# Preview the first rows of the training set (130 columns: order-book
# price/volume levels, their derivatives, bookkeeping columns, and labels).
df_train.head()


Out[4]:
train.csv index Time P_1_bid V_1_bid P_1_ask V_1_ask P_2_bid V_2_bid P_2_ask ... V_bid_8_deriv P_ask_9_deriv P_bid_9_deriv V_ask_9_deriv V_bid_9_deriv P_ask_10_deriv P_bid_10_deriv V_ask_10_deriv V_bid_10_deriv labels
0 30 31 0.286 569.61 100 569.95 100 569.58 8 570.0 ... -3.333333 -0.016667 0.026667 -0.666667 -6.633333 -0.019000 0.028000 -6.333333 1.500000 1
1 31 32 0.308 569.61 100 569.95 100 569.58 8 570.0 ... -3.333333 -0.019667 0.026667 -3.333333 -6.633333 -0.016667 0.028000 -0.666667 1.500000 1
2 32 33 0.308 569.61 100 569.95 100 569.58 8 570.0 ... -3.333333 -0.005667 0.026667 3.000000 -6.633333 -0.010667 0.028000 -6.333333 1.500000 1
3 33 34 0.335 569.64 8 569.95 100 569.61 100 570.0 ... 0.000000 -0.005667 0.040667 3.000000 -3.333333 -0.010667 0.026667 -6.333333 -6.633333 1
4 34 35 0.335 569.64 8 569.95 100 569.61 100 570.0 ... 0.000000 -0.001667 0.025000 3.066667 -6.633333 0.000000 0.023333 0.000000 -3.166667 1

5 rows × 130 columns


In [5]:
# Column layout of the test set; note the stray 'test.csv' and 'index'
# bookkeeping columns carried over from the CSV export.
df_test.columns


Out[5]:
Index(['test.csv', 'index', 'Time', 'P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask',
       'P_2_bid', 'V_2_bid', 'P_2_ask',
       ...
       'V_bid_8_deriv', 'P_ask_9_deriv', 'P_bid_9_deriv', 'V_ask_9_deriv',
       'V_bid_9_deriv', 'P_ask_10_deriv', 'P_bid_10_deriv', 'V_ask_10_deriv',
       'V_bid_10_deriv', 'labels'],
      dtype='object', length=130)

In [6]:
# Feature-only view of the test set: strip the label and the two
# bookkeeping columns left over from the CSV export.
x = df_test.drop(columns=['labels', 'test.csv', 'index'])

In [7]:
# Sanity-check the feature frame after the drop (127 columns remain).
x.head()


Out[7]:
Time P_1_bid V_1_bid P_1_ask V_1_ask P_2_bid V_2_bid P_2_ask V_2_ask P_3_bid ... V_ask_8_deriv V_bid_8_deriv P_ask_9_deriv P_bid_9_deriv V_ask_9_deriv V_bid_9_deriv P_ask_10_deriv P_bid_10_deriv V_ask_10_deriv V_bid_10_deriv
0 2332.252 570.69 100 570.85 100 570.68 100 570.88 750 570.67 ... 136.2 6.366667 0.001333 0.001667 -136.0 -6.366667 0.001333 0.001333 -3.333333 -3.033333
1 2332.337 570.69 100 570.85 200 570.68 100 570.88 750 570.67 ... 136.2 6.366667 0.001333 0.001667 -136.0 -6.366667 0.001333 0.001333 -3.333333 -3.033333
2 2332.510 570.69 100 570.85 200 570.67 20 570.88 750 570.64 ... 136.2 -3.033333 0.001333 0.000000 -136.0 0.000000 0.001333 0.000000 -3.333333 0.000000
3 2332.650 570.67 20 570.85 200 570.64 100 570.88 750 570.52 ... 136.2 -3.033333 0.001333 -0.000333 -136.0 6.366667 0.001333 -0.001000 -3.333333 -3.333333
4 2332.651 570.67 20 570.85 200 570.64 100 570.88 750 570.52 ... 136.2 -3.033333 0.001000 -0.000333 -136.0 6.366667 0.001000 -0.001000 -5.166667 -3.333333

5 rows × 127 columns


In [8]:
# Build the model matrices as plain numpy arrays.  Features exclude the
# label, the CSV bookkeeping columns, and the raw Time column; labels are
# pulled out as 1-D vectors.
X_test  = df_test.drop(['labels', 'test.csv', 'index', 'Time'], axis=1).values
Y_test  = df_test['labels'].values
X_train = df_train.drop(['labels', 'train.csv', 'index', 'Time'], axis=1).values
Y_train = df_train['labels'].values

In [9]:
# Class balance of the training labels, then the sample count.
label_percent(Y_train)
Y_train.size


Pos:0.38; Neg:0.39; Zero:0.23;
Out[9]:
101660

In [10]:
# Class balance of the test labels, then the sample count.
label_percent(Y_test)
Y_test.size


Pos:0.34; Neg:0.40; Zero:0.26;
Out[10]:
50830

In [12]:
# Create the random forest object which will include all the parameters
# for the fit.  Fix random_state so the stochastic fit is reproducible
# across kernel restarts, and use all cores for the 250-tree ensemble.
forest = RandomForestClassifier(n_estimators=250, max_depth=15,
                                random_state=42, n_jobs=-1)

# Fit the forest to the training features and price-movement labels
# (-1 = down, 0 = flat, 1 = up).  (The previous "Survived labels" comment
# was a leftover from a Titanic tutorial.)
forest = forest.fit(X_train, Y_train)

# Take the same decision trees and run them on the held-out test data.
output = forest.predict(X_test)

# Per-class precision / recall / F1 on the test set.
classification_report1 = classification_report(y_true=Y_test, y_pred=output)
print(classification_report1)


             precision    recall  f1-score   support

         -1       0.46      0.41      0.43     20200
          0       0.39      0.00      0.00     13307
          1       0.38      0.71      0.49     17323

avg / total       0.41      0.40      0.34     50830


In [23]:
# Spot-check the raw prediction vector (entries in {-1, 0, 1}).
print(output)


[-1 -1 -1 ...,  1  1  1]

In [24]:
# Class balance of the predictions — the forest almost never predicts the
# zero ("flat") class, consistent with its 0.00 recall in the report above.
label_percent(output)


Pos:0.64; Neg:0.36; Zero:0.00;

In [13]:
from sklearn.cross_validation import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)
len(sss)


Out[13]:
3

In [14]:
# Rich repr of the splitter's configuration.
sss


Out[14]:
StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, test_size=0.5, random_state=0)

In [15]:
# Plain-text repr of the splitter (identical to the rich repr here).
print(sss)


StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, test_size=0.5, random_state=0)

In [ ]: