In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

In [2]:
data = pd.read_csv('SBERP_prepared.csv').dropna()

In [3]:
data.head()


Out[3]:
DATE TIME1 OPEN1 HIGH1 LOW1 CLOSE1 VOL1 TIME2 OPEN2 HIGH2 ... HIGH8 LOW8 CLOSE8 VOL8 TIME9 OPEN9 HIGH9 LOW9 CLOSE9 VOL9
0 20120301 110000 75.38 75.45 74.91 74.95 2334800 120000 74.95 75.21 ... 75.30 75.12 75.16 754600 190000.0 75.14 75.61 75.12 75.58 2269200.0
1 20120302 110000 76.01 76.35 75.79 76.13 3527600 120000 76.14 76.30 ... 76.72 76.03 76.72 4305900 190000.0 76.72 77.48 76.68 77.09 6021500.0
2 20120305 110000 77.09 78.59 77.09 78.39 5464300 120000 78.38 78.40 ... 78.11 77.72 78.10 1385100 190000.0 78.10 78.11 77.53 77.59 2659900.0
3 20120306 110000 77.15 77.20 76.71 77.03 2307800 120000 77.03 77.11 ... 75.83 75.03 75.07 1457600 190000.0 75.07 75.11 73.80 73.81 5285200.0
4 20120307 110000 74.00 74.20 73.23 73.86 3703100 120000 73.86 74.37 ... 74.56 73.90 74.40 2827700 190000.0 74.40 74.78 74.21 74.53 5004100.0

5 rows × 55 columns


In [4]:
data = shuffle(data)

Задача на регрессию:

Будет ли цена закрытия больше цены открытия ?


In [5]:
data['TARGET'] = data['CLOSE9'] > data['OPEN9']
data.head()


Out[5]:
DATE TIME1 OPEN1 HIGH1 LOW1 CLOSE1 VOL1 TIME2 OPEN2 HIGH2 ... LOW8 CLOSE8 VOL8 TIME9 OPEN9 HIGH9 LOW9 CLOSE9 VOL9 TARGET
157 20121010 110000 66.96 67.31 66.82 67.10 1717500 120000 67.08 67.47 ... 67.04 67.08 517700 190000.0 67.12 67.27 66.95 67.24 1065900.0 True
1435 20171115 110000 190.20 194.49 190.12 190.46 1399100 120000 190.31 191.31 ... 189.41 189.72 1431600 190000.0 189.61 191.31 189.06 190.00 755700.0 True
609 20140805 110000 57.04 57.25 56.40 56.56 2222700 120000 56.55 56.57 ... 55.94 56.10 528300 190000.0 56.08 56.31 56.00 56.25 1176700.0 True
252 20130301 110000 74.01 74.08 73.06 73.06 3996700 120000 73.05 73.10 ... 73.42 73.79 870400 190000.0 73.80 74.09 73.36 73.40 1277200.0 False
1231 20170124 110000 125.00 126.81 125.00 126.81 903100 120000 126.81 127.10 ... 127.10 127.68 234400 190000.0 127.74 128.16 127.70 127.70 385300.0 False

5 rows × 56 columns


In [6]:
train_data = data[:1000]
test_data = data[1000:]

In [7]:
train_features = train_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                                 ], axis=1)
train_target = train_data["TARGET"]

test_features = test_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                               ], axis=1)
test_target = test_data["TARGET"]

In [8]:
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

In [9]:
acs = []
pss = []
rss = []
f1s = []
ras = []
result = []

for i in range(1, 1000, 1):    
    train_features_scaled = shuffle(train_features_scaled)
    lr = LogisticRegression()
    lr.fit(train_features_scaled, train_target)

    test_features_scaled = shuffle(test_features_scaled)
    predictions = lr.predict(test_features_scaled)

    acs.append(accuracy_score(test_target, predictions))
    pss.append(precision_score(test_target, predictions))
    rss.append(recall_score(test_target, predictions))
    f1s.append(f1_score(test_target, predictions))
    ras.append(roc_auc_score(test_target, predictions))
    
    res = pd.DataFrame(test_data["CLOSE9"].tolist(), columns = ["ACTUAL_CLOSE"])
    res["OPEN"] = pd.Series(test_features["OPEN9"].tolist())
    res["PREDICTED_CLOSE"] = pd.Series(predictions)
    res["RESULT"] = np.where(res['PREDICTED_CLOSE'] == True, res['ACTUAL_CLOSE'] - res['OPEN'], 0)
    result.append(np.sum(res['RESULT']))

print("Доля правильных ответов:\t{}\nТочность\t\t\t{}\nПолнота\t\t\t{}\nF1 score\t\t{}\nROC\t\t\t{}\nResult:\t\t\t{}"
      .format(np.mean(acs), np.mean(pss), np.mean(rss), np.mean(f1s), np.mean(ras), np.mean(result)))


Доля правильных ответов:	0.503692789894
Точность			0.504432691076
Полнота			0.889555095636
F1 score		0.641002165785
ROC			0.500377821631
Result:			5.83966966967