notebook.community



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle



In [2]:

    
data = pd.read_csv('SBERP_prepared.csv').dropna()



In [3]:

    
data.head()









    Out[3]:







  
    
      
      DATE
      TIME1
      OPEN1
      HIGH1
      LOW1
      CLOSE1
      VOL1
      TIME2
      OPEN2
      HIGH2
      ...
      HIGH8
      LOW8
      CLOSE8
      VOL8
      TIME9
      OPEN9
      HIGH9
      LOW9
      CLOSE9
      VOL9
    
  
  
    
      0
      20120301
      110000
      75.38
      75.45
      74.91
      74.95
      2334800
      120000
      74.95
      75.21
      ...
      75.30
      75.12
      75.16
      754600
      190000.0
      75.14
      75.61
      75.12
      75.58
      2269200.0
    
    
      1
      20120302
      110000
      76.01
      76.35
      75.79
      76.13
      3527600
      120000
      76.14
      76.30
      ...
      76.72
      76.03
      76.72
      4305900
      190000.0
      76.72
      77.48
      76.68
      77.09
      6021500.0
    
    
      2
      20120305
      110000
      77.09
      78.59
      77.09
      78.39
      5464300
      120000
      78.38
      78.40
      ...
      78.11
      77.72
      78.10
      1385100
      190000.0
      78.10
      78.11
      77.53
      77.59
      2659900.0
    
    
      3
      20120306
      110000
      77.15
      77.20
      76.71
      77.03
      2307800
      120000
      77.03
      77.11
      ...
      75.83
      75.03
      75.07
      1457600
      190000.0
      75.07
      75.11
      73.80
      73.81
      5285200.0
    
    
      4
      20120307
      110000
      74.00
      74.20
      73.23
      73.86
      3703100
      120000
      73.86
      74.37
      ...
      74.56
      73.90
      74.40
      2827700
      190000.0
      74.40
      74.78
      74.21
      74.53
      5004100.0
    
  

5 rows × 55 columns



In [4]:

    
data = shuffle(data)

Задача на регрессию:

Будет ли цена закрытия больше цены открытия ?



In [5]:

    
data['TARGET'] = data['CLOSE9'] > data['OPEN9']
data.head()









    Out[5]:







  
    
      
      DATE
      TIME1
      OPEN1
      HIGH1
      LOW1
      CLOSE1
      VOL1
      TIME2
      OPEN2
      HIGH2
      ...
      LOW8
      CLOSE8
      VOL8
      TIME9
      OPEN9
      HIGH9
      LOW9
      CLOSE9
      VOL9
      TARGET
    
  
  
    
      157
      20121010
      110000
      66.96
      67.31
      66.82
      67.10
      1717500
      120000
      67.08
      67.47
      ...
      67.04
      67.08
      517700
      190000.0
      67.12
      67.27
      66.95
      67.24
      1065900.0
      True
    
    
      1435
      20171115
      110000
      190.20
      194.49
      190.12
      190.46
      1399100
      120000
      190.31
      191.31
      ...
      189.41
      189.72
      1431600
      190000.0
      189.61
      191.31
      189.06
      190.00
      755700.0
      True
    
    
      609
      20140805
      110000
      57.04
      57.25
      56.40
      56.56
      2222700
      120000
      56.55
      56.57
      ...
      55.94
      56.10
      528300
      190000.0
      56.08
      56.31
      56.00
      56.25
      1176700.0
      True
    
    
      252
      20130301
      110000
      74.01
      74.08
      73.06
      73.06
      3996700
      120000
      73.05
      73.10
      ...
      73.42
      73.79
      870400
      190000.0
      73.80
      74.09
      73.36
      73.40
      1277200.0
      False
    
    
      1231
      20170124
      110000
      125.00
      126.81
      125.00
      126.81
      903100
      120000
      126.81
      127.10
      ...
      127.10
      127.68
      234400
      190000.0
      127.74
      128.16
      127.70
      127.70
      385300.0
      False
    
  

5 rows × 56 columns



In [6]:

    
train_data = data[:1000]
test_data = data[1000:]



In [7]:

    
train_features = train_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                                 ], axis=1)
train_target = train_data["TARGET"]

test_features = test_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9",
                                 "TIME1", "TIME2", "TIME3", "TIME4", "TIME5", "TIME6", "TIME7","TIME8","TIME9",
                                 "VOL1", "VOL2", "VOL3", "VOL4", "VOL5", "VOL6", "VOL7","VOL8","VOL9"
                               ], axis=1)
test_target = test_data["TARGET"]



In [8]:

    
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)



In [9]:

    
acs = []
pss = []
rss = []
f1s = []
ras = []
result = []

for i in range(1, 1000, 1):    
    train_features_scaled = shuffle(train_features_scaled)
    lr = LogisticRegression()
    lr.fit(train_features_scaled, train_target)

    test_features_scaled = shuffle(test_features_scaled)
    predictions = lr.predict(test_features_scaled)

    acs.append(accuracy_score(test_target, predictions))
    pss.append(precision_score(test_target, predictions))
    rss.append(recall_score(test_target, predictions))
    f1s.append(f1_score(test_target, predictions))
    ras.append(roc_auc_score(test_target, predictions))
    
    res = pd.DataFrame(test_data["CLOSE9"].tolist(), columns = ["ACTUAL_CLOSE"])
    res["OPEN"] = pd.Series(test_features["OPEN9"].tolist())
    res["PREDICTED_CLOSE"] = pd.Series(predictions)
    res["RESULT"] = np.where(res['PREDICTED_CLOSE'] == True, res['ACTUAL_CLOSE'] - res['OPEN'], 0)
    result.append(np.sum(res['RESULT']))

print("Доля правильных ответов:\t{}\nТочность\t\t\t{}\nПолнота\t\t\t{}\nF1 score\t\t{}\nROC\t\t\t{}\nResult:\t\t\t{}"
      .format(np.mean(acs), np.mean(pss), np.mean(rss), np.mean(f1s), np.mean(ras), np.mean(result)))









    



Доля правильных ответов:	0.503692789894
Точность			0.504432691076
Полнота			0.889555095636
F1 score		0.641002165785
ROC			0.500377821631
Result:			5.83966966967

	DATE	TIME1	OPEN1	HIGH1	LOW1	CLOSE1	VOL1	TIME2	OPEN2	HIGH2	...	HIGH8	LOW8	CLOSE8	VOL8	TIME9	OPEN9	HIGH9	LOW9	CLOSE9	VOL9
0	20120301	110000	75.38	75.45	74.91	74.95	2334800	120000	74.95	75.21	...	75.30	75.12	75.16	754600	190000.0	75.14	75.61	75.12	75.58	2269200.0
1	20120302	110000	76.01	76.35	75.79	76.13	3527600	120000	76.14	76.30	...	76.72	76.03	76.72	4305900	190000.0	76.72	77.48	76.68	77.09	6021500.0
2	20120305	110000	77.09	78.59	77.09	78.39	5464300	120000	78.38	78.40	...	78.11	77.72	78.10	1385100	190000.0	78.10	78.11	77.53	77.59	2659900.0
3	20120306	110000	77.15	77.20	76.71	77.03	2307800	120000	77.03	77.11	...	75.83	75.03	75.07	1457600	190000.0	75.07	75.11	73.80	73.81	5285200.0
4	20120307	110000	74.00	74.20	73.23	73.86	3703100	120000	73.86	74.37	...	74.56	73.90	74.40	2827700	190000.0	74.40	74.78	74.21	74.53	5004100.0

	DATE	TIME1	OPEN1	HIGH1	LOW1	CLOSE1	VOL1	TIME2	OPEN2	HIGH2	...	LOW8	CLOSE8	VOL8	TIME9	OPEN9	HIGH9	LOW9	CLOSE9	VOL9	TARGET
157	20121010	110000	66.96	67.31	66.82	67.10	1717500	120000	67.08	67.47	...	67.04	67.08	517700	190000.0	67.12	67.27	66.95	67.24	1065900.0	True
1435	20171115	110000	190.20	194.49	190.12	190.46	1399100	120000	190.31	191.31	...	189.41	189.72	1431600	190000.0	189.61	191.31	189.06	190.00	755700.0	True
609	20140805	110000	57.04	57.25	56.40	56.56	2222700	120000	56.55	56.57	...	55.94	56.10	528300	190000.0	56.08	56.31	56.00	56.25	1176700.0	True
252	20130301	110000	74.01	74.08	73.06	73.06	3996700	120000	73.05	73.10	...	73.42	73.79	870400	190000.0	73.80	74.09	73.36	73.40	1277200.0	False
1231	20170124	110000	125.00	126.81	125.00	126.81	903100	120000	126.81	127.10	...	127.10	127.68	234400	190000.0	127.74	128.16	127.70	127.70	385300.0	False