In [1]:
import sys
import imp
import yaml
import csv
import pandas as pd
import re
import numpy as np
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
def label_percent(v):
    l=len(v)
    pos=sum(x > 0 for x in v)/l
    neg=sum(x < 0 for x in v)/l
    zero=sum(x == 0 for x in v)/l
    print('Pos:'+str("{0:.2f}".format(pos))+"; Neg:"+str("{0:.2f}".format(neg))+'; Zero:'+str("{0:.2f}".format(zero))+';')

In [3]:
# Open test and train sets
df_train = pd.read_csv("data/output/model_clean_data/train_2.tar.gz", compression='gzip', index_col = None)
df_test  = pd.read_csv("data/output/model_clean_data/test_2.tar.gz" , compression='gzip', index_col = None)

In [4]:
# df_train.head()

In [5]:
# df_test.columns

In [6]:
#X_test_new = df_test.drop(df_test.columns[['labels', 'train.csv', 'index']], axis = 1)
x = df_test.drop(['labels', 'test.csv', 'index'], axis=1)

In [7]:
x.head()


Out[7]:
Time P_1_bid V_1_bid P_1_ask V_1_ask P_2_bid V_2_bid P_2_ask V_2_ask P_3_bid ... V_ask_8_deriv V_bid_8_deriv P_ask_9_deriv P_bid_9_deriv V_ask_9_deriv V_bid_9_deriv P_ask_10_deriv P_bid_10_deriv V_ask_10_deriv V_bid_10_deriv
0 1449.383 571.01 14 571.31 200 571.00 145 571.36 42 570.97 ... 6.666667 -30.000000 -0.001667 -0.002000 0.000000 0.000000 -0.003667 -0.004667 -2.433333 0.000000
1 4533.713 569.32 300 569.69 100 569.30 8 569.76 60 569.29 ... -6.600000 0.433333 0.002333 0.000667 5.000000 2.900000 0.002333 0.000333 -1.666667 -3.333333
2 3003.506 568.71 91 568.92 112 568.70 210 568.93 100 568.62 ... -10.000000 -1.266667 -0.000667 0.001000 -1.433333 0.000000 -0.001333 0.000333 -9.800000 -1.666667
3 4852.284 567.01 100 567.28 800 567.00 1278 567.46 400 566.95 ... 0.000000 -3.333333 0.001333 0.000000 3.333333 0.000000 0.002333 -0.001667 -5.000000 -6.666667
4 4676.789 567.77 200 568.00 150 567.76 100 568.02 43 567.74 ... 0.000000 3.333333 -0.000667 0.000667 4.966667 -3.333333 -0.000667 0.000333 1.233333 -61.400000

5 rows × 127 columns


In [8]:
# Define test/training set
X_test   =  np.array(df_test.drop(['labels', 'test.csv', 'index', 'Time'], axis = 1))
Y_test   =  np.array(df_test[['labels']])[:,0]
X_train  =  np.array(df_train.drop(['labels', 'train.csv', 'index', 'Time'], axis = 1))
Y_train  =  np.array(df_train[['labels']])[:,0]

In [9]:
X_train_cols  =  list(df_train.drop(['labels', 'train.csv', 'index', 'Time'], axis=1).columns.values)
X_train_cols


Out[9]:
['P_1_bid',
 'V_1_bid',
 'P_1_ask',
 'V_1_ask',
 'P_2_bid',
 'V_2_bid',
 'P_2_ask',
 'V_2_ask',
 'P_3_bid',
 'V_3_bid',
 'P_3_ask',
 'V_3_ask',
 'P_4_bid',
 'V_4_bid',
 'P_4_ask',
 'V_4_ask',
 'P_5_bid',
 'V_5_bid',
 'P_5_ask',
 'V_5_ask',
 'P_6_bid',
 'V_6_bid',
 'P_6_ask',
 'V_6_ask',
 'P_7_bid',
 'V_7_bid',
 'P_7_ask',
 'V_7_ask',
 'P_8_bid',
 'V_8_bid',
 'P_8_ask',
 'V_8_ask',
 'P_9_bid',
 'V_9_bid',
 'P_9_ask',
 'V_9_ask',
 'P_10_bid',
 'V_10_bid',
 'P_10_ask',
 'V_10_ask',
 'spreads_1',
 'mid_price_1',
 'spreads_2',
 'mid_price_2',
 'spreads_3',
 'mid_price_3',
 'spreads_4',
 'mid_price_4',
 'spreads_5',
 'mid_price_5',
 'spreads_6',
 'mid_price_6',
 'spreads_7',
 'mid_price_7',
 'spreads_8',
 'mid_price_8',
 'spreads_9',
 'mid_price_9',
 'spreads_10',
 'mid_price_10',
 'P_diff_ask_10_1',
 'P_diff_bid_1_10',
 'P_diff_ask_1_2',
 'P_diff_bid_1_2',
 'P_diff_ask_2_3',
 'P_diff_bid_2_3',
 'P_diff_ask_3_4',
 'P_diff_bid_3_4',
 'P_diff_ask_4_5',
 'P_diff_bid_4_5',
 'P_diff_ask_5_6',
 'P_diff_bid_5_6',
 'P_diff_ask_6_7',
 'P_diff_bid_6_7',
 'P_diff_ask_7_8',
 'P_diff_bid_7_8',
 'P_diff_ask_8_9',
 'P_diff_bid_8_9',
 'P_diff_ask_9_10',
 'P_diff_bid_9_10',
 'Mean_ask_price',
 'Mean_bid_price',
 'Mean_ask_volumn',
 'Mean_bid_volumn',
 'Accum_diff_price',
 'Accum_diff_volumn',
 'P_ask_1_deriv',
 'P_bid_1_deriv',
 'V_ask_1_deriv',
 'V_bid_1_deriv',
 'P_ask_2_deriv',
 'P_bid_2_deriv',
 'V_ask_2_deriv',
 'V_bid_2_deriv',
 'P_ask_3_deriv',
 'P_bid_3_deriv',
 'V_ask_3_deriv',
 'V_bid_3_deriv',
 'P_ask_4_deriv',
 'P_bid_4_deriv',
 'V_ask_4_deriv',
 'V_bid_4_deriv',
 'P_ask_5_deriv',
 'P_bid_5_deriv',
 'V_ask_5_deriv',
 'V_bid_5_deriv',
 'P_ask_6_deriv',
 'P_bid_6_deriv',
 'V_ask_6_deriv',
 'V_bid_6_deriv',
 'P_ask_7_deriv',
 'P_bid_7_deriv',
 'V_ask_7_deriv',
 'V_bid_7_deriv',
 'P_ask_8_deriv',
 'P_bid_8_deriv',
 'V_ask_8_deriv',
 'V_bid_8_deriv',
 'P_ask_9_deriv',
 'P_bid_9_deriv',
 'V_ask_9_deriv',
 'V_bid_9_deriv',
 'P_ask_10_deriv',
 'P_bid_10_deriv',
 'V_ask_10_deriv',
 'V_bid_10_deriv']

In [10]:
label_percent(Y_train)
Y_train.size


Pos:0.36; Neg:0.39; Zero:0.25;
Out[10]:
101660

In [11]:
label_percent(Y_test)
Y_test.size


Pos:0.36; Neg:0.39; Zero:0.25;
Out[11]:
50830

In [12]:
# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 100, max_depth=10)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(X_train, Y_train)

# Take the same decision trees and run it on the test data
output = forest.predict(X_test)

classification_report1 = classification_report(y_true=Y_test, y_pred=output)
print(classification_report1)


             precision    recall  f1-score   support

         -1       0.55      0.77      0.64     19726
          0       0.89      0.10      0.18     12698
          1       0.57      0.68      0.62     18406

avg / total       0.64      0.57      0.52     50830


In [13]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [16]:
# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    #print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    print("%d. feature %s (%f)" % (f + 1, X_train_cols[f], importances[indices[f]]))


Feature ranking:
1. feature P_1_bid (0.038041)
2. feature V_1_bid (0.023346)
3. feature P_1_ask (0.023254)
4. feature V_1_ask (0.019272)
5. feature P_2_bid (0.019214)
6. feature V_2_bid (0.017794)
7. feature P_2_ask (0.017557)
8. feature V_2_ask (0.016292)
9. feature P_3_bid (0.015924)
10. feature V_3_bid (0.015410)
11. feature P_3_ask (0.014684)
12. feature V_3_ask (0.014511)
13. feature P_4_bid (0.014487)
14. feature V_4_bid (0.014132)
15. feature P_4_ask (0.014011)
16. feature V_4_ask (0.013961)
17. feature P_5_bid (0.013046)
18. feature V_5_bid (0.012974)
19. feature P_5_ask (0.012429)
20. feature V_5_ask (0.012217)
21. feature P_6_bid (0.011674)
22. feature V_6_bid (0.011509)
23. feature P_6_ask (0.011460)
24. feature V_6_ask (0.010520)
25. feature P_7_bid (0.010049)
26. feature V_7_bid (0.009809)
27. feature P_7_ask (0.009640)
28. feature V_7_ask (0.009042)
29. feature P_8_bid (0.009036)
30. feature V_8_bid (0.008851)
31. feature P_8_ask (0.008837)
32. feature V_8_ask (0.008774)
33. feature P_9_bid (0.008647)
34. feature V_9_bid (0.008490)
35. feature P_9_ask (0.008316)
36. feature V_9_ask (0.008141)
37. feature P_10_bid (0.008083)
38. feature V_10_bid (0.008077)
39. feature P_10_ask (0.007916)
40. feature V_10_ask (0.007827)
41. feature spreads_1 (0.007681)
42. feature mid_price_1 (0.007546)
43. feature spreads_2 (0.007522)
44. feature mid_price_2 (0.007429)
45. feature spreads_3 (0.007426)
46. feature mid_price_3 (0.007326)
47. feature spreads_4 (0.007311)
48. feature mid_price_4 (0.007252)
49. feature spreads_5 (0.007174)
50. feature mid_price_5 (0.007086)
51. feature spreads_6 (0.006878)
52. feature mid_price_6 (0.006813)
53. feature spreads_7 (0.006801)
54. feature mid_price_7 (0.006794)
55. feature spreads_8 (0.006754)
56. feature mid_price_8 (0.006627)
57. feature spreads_9 (0.006579)
58. feature mid_price_9 (0.006537)
59. feature spreads_10 (0.006521)
60. feature mid_price_10 (0.006491)
61. feature P_diff_ask_10_1 (0.006466)
62. feature P_diff_bid_1_10 (0.006411)
63. feature P_diff_ask_1_2 (0.006328)
64. feature P_diff_bid_1_2 (0.006300)
65. feature P_diff_ask_2_3 (0.006299)
66. feature P_diff_bid_2_3 (0.006282)
67. feature P_diff_ask_3_4 (0.006244)
68. feature P_diff_bid_3_4 (0.006169)
69. feature P_diff_ask_4_5 (0.006157)
70. feature P_diff_bid_4_5 (0.006135)
71. feature P_diff_ask_5_6 (0.006083)
72. feature P_diff_bid_5_6 (0.005976)
73. feature P_diff_ask_6_7 (0.005928)
74. feature P_diff_bid_6_7 (0.005915)
75. feature P_diff_ask_7_8 (0.005879)
76. feature P_diff_bid_7_8 (0.005878)
77. feature P_diff_ask_8_9 (0.005875)
78. feature P_diff_bid_8_9 (0.005858)
79. feature P_diff_ask_9_10 (0.005798)
80. feature P_diff_bid_9_10 (0.005715)
81. feature Mean_ask_price (0.005709)
82. feature Mean_bid_price (0.005689)
83. feature Mean_ask_volumn (0.005645)
84. feature Mean_bid_volumn (0.005625)
85. feature Accum_diff_price (0.005565)
86. feature Accum_diff_volumn (0.005530)
87. feature P_ask_1_deriv (0.005529)
88. feature P_bid_1_deriv (0.005526)
89. feature V_ask_1_deriv (0.005394)
90. feature V_bid_1_deriv (0.005381)
91. feature P_ask_2_deriv (0.005348)
92. feature P_bid_2_deriv (0.005245)
93. feature V_ask_2_deriv (0.005220)
94. feature V_bid_2_deriv (0.005196)
95. feature P_ask_3_deriv (0.005162)
96. feature P_bid_3_deriv (0.005129)
97. feature V_ask_3_deriv (0.005122)
98. feature V_bid_3_deriv (0.005041)
99. feature P_ask_4_deriv (0.004935)
100. feature P_bid_4_deriv (0.004860)
101. feature V_ask_4_deriv (0.004857)
102. feature V_bid_4_deriv (0.004741)
103. feature P_ask_5_deriv (0.004736)
104. feature P_bid_5_deriv (0.004720)
105. feature V_ask_5_deriv (0.004691)
106. feature V_bid_5_deriv (0.004496)
107. feature P_ask_6_deriv (0.004461)
108. feature P_bid_6_deriv (0.004444)
109. feature V_ask_6_deriv (0.004434)
110. feature V_bid_6_deriv (0.004389)
111. feature P_ask_7_deriv (0.004355)
112. feature P_bid_7_deriv (0.004322)
113. feature V_ask_7_deriv (0.004307)
114. feature V_bid_7_deriv (0.004276)
115. feature P_ask_8_deriv (0.004209)
116. feature P_bid_8_deriv (0.004059)
117. feature V_ask_8_deriv (0.004029)
118. feature V_bid_8_deriv (0.003980)
119. feature P_ask_9_deriv (0.003979)
120. feature P_bid_9_deriv (0.003737)
121. feature V_ask_9_deriv (0.003668)
122. feature V_bid_9_deriv (0.003504)
123. feature P_ask_10_deriv (0.003240)
124. feature P_bid_10_deriv (0.003167)
125. feature V_ask_10_deriv (0.002854)
126. feature V_bid_10_deriv (0.000000)

In [18]:
var_importance = [(f+1, X_train_cols[f], importances[indices[f]]) for f in range(X_train.shape[1])]

In [20]:
print("\n".join(var_importance))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-20-39fcb83360c1> in <module>()
----> 1 print("\n".join(var_importance))

TypeError: sequence item 0: expected str instance, tuple found

In [ ]:
print(output)

In [ ]:
label_percent(output)

In [ ]:
from sklearn.cross_validation import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)
len(sss)

In [ ]:
sss

In [ ]:
print(sss)

In [ ]: