knn-sberbank-stock-price-prediction


# Задача: Спрогнозировать цену закрытия последней свечи торгового дня

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('SBERP_prepared.csv')

In [4]:
print(data[:10])
print(len(data))


       DATE   TIME1  OPEN1  HIGH1   LOW1  CLOSE1     VOL1   TIME2  OPEN2  \
0  20120301  110000  75.38  75.45  74.91   74.95  2334800  120000  74.95   
1  20120302  110000  76.01  76.35  75.79   76.13  3527600  120000  76.14   
2  20120305  110000  77.09  78.59  77.09   78.39  5464300  120000  78.38   
3  20120306  110000  77.15  77.20  76.71   77.03  2307800  120000  77.03   
4  20120307  110000  74.00  74.20  73.23   73.86  3703100  120000  73.86   
5  20120311  110000  76.00  76.19  75.81   75.93  3123500  120000  75.89   
6  20120312  110000  75.78  75.79  75.30   75.78  2782200  120000  75.79   
7  20120313  110000  77.39  77.92  77.33   77.61  1430000  120000  77.62   
8  20120314  110000  80.05  80.60  80.00   80.25  5095200  120000  80.25   
9  20120315  110000  83.37  83.38  81.92   82.40  6368600  120000  82.40   

   HIGH2    ...      HIGH8   LOW8  CLOSE8     VOL8     TIME9  OPEN9  HIGH9  \
0  75.21    ...      75.30  75.12   75.16   754600  190000.0  75.14  75.61   
1  76.30    ...      76.72  76.03   76.72  4305900  190000.0  76.72  77.48   
2  78.40    ...      78.11  77.72   78.10  1385100  190000.0  78.10  78.11   
3  77.11    ...      75.83  75.03   75.07  1457600  190000.0  75.07  75.11   
4  74.37    ...      74.56  73.90   74.40  2827700  190000.0  74.40  74.78   
5  76.04    ...      75.94  75.90   75.94   318400  190000.0  75.94  76.25   
6  76.05    ...      77.19  76.91   77.00  1332100  190000.0  76.99  77.10   
7  77.88    ...      78.84  78.15   78.77  3240000  190000.0  78.77  79.45   
8  80.30    ...      83.19  82.60   83.09  3638200  190000.0  83.09  83.60   
9  82.56    ...      81.90  80.11   80.72  9559500  190000.0  80.75  81.08   

    LOW9  CLOSE9       VOL9  
0  75.12   75.58  2269200.0  
1  76.68   77.09  6021500.0  
2  77.53   77.59  2659900.0  
3  73.80   73.81  5285200.0  
4  74.21   74.53  5004100.0  
5  75.92   76.13  2409400.0  
6  76.90   77.08  1374200.0  
7  78.72   79.22  5444000.0  
8  83.01   83.24  8129900.0  
9  80.50   80.99  3224700.0  

[10 rows x 55 columns]
1590

In [5]:
train_data = data[:1000].dropna()
test_data = data[1000:].dropna()

In [7]:
train_features = train_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9","TIME1", "TIME2", "TIME3", "TIME4", 
                                  "TIME5", "TIME6", "TIME7", "TIME8", "TIME9"], axis=1)
train_target = train_data["CLOSE9"]

test_features = test_data.drop(["DATE", "CLOSE9", "LOW9", "HIGH9", "VOL9","TIME1", "TIME2", "TIME3", "TIME4", 
                                  "TIME5", "TIME6", "TIME7", "TIME8", "TIME9"], axis=1)
test_target = test_data["CLOSE9"]

In [8]:
train_features.head()


Out[8]:
OPEN1 HIGH1 LOW1 CLOSE1 VOL1 OPEN2 HIGH2 LOW2 CLOSE2 VOL2 ... HIGH7 LOW7 CLOSE7 VOL7 OPEN8 HIGH8 LOW8 CLOSE8 VOL8 OPEN9
0 75.38 75.45 74.91 74.95 2334800 74.95 75.21 74.83 74.90 1617800 ... 75.45 75.09 75.19 1137900 75.20 75.30 75.12 75.16 754600 75.14
1 76.01 76.35 75.79 76.13 3527600 76.14 76.30 75.95 76.22 2527000 ... 76.20 75.65 76.15 4247300 76.06 76.72 76.03 76.72 4305900 76.72
2 77.09 78.59 77.09 78.39 5464300 78.38 78.40 77.49 78.00 3856900 ... 77.75 77.42 77.73 287600 77.72 78.11 77.72 78.10 1385100 78.10
3 77.15 77.20 76.71 77.03 2307800 77.03 77.11 76.70 76.72 1415800 ... 75.90 75.34 75.71 2142600 75.71 75.83 75.03 75.07 1457600 75.07
4 74.00 74.20 73.23 73.86 3703100 73.86 74.37 73.86 74.15 1540400 ... 74.12 73.69 73.94 867300 73.91 74.56 73.90 74.40 2827700 74.40

5 rows × 41 columns


In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features.dropna())

In [23]:
test_features_scaled[:10]


Out[23]:
array([[ 0.68690351,  0.67092634,  0.63475935,  0.64903916, -0.73922116,
         0.64458445,  0.64056894,  0.62933942,  0.65490576, -0.7694075 ,
         0.65498757,  0.74830473,  0.67590819,  0.77136266,  0.07125648,
         0.77507511,  0.79106297,  0.7964571 ,  0.79594853,  0.89827193,
         0.79797789,  0.78361049,  0.79361029,  0.79056248, -0.33122932,
         0.79131609,  0.7720053 ,  0.77653772,  0.75879514, -0.65599519,
         0.75958877,  0.76165937,  0.77479855,  0.77911745, -0.92728655,
         0.77804419,  0.77221618,  0.75552748,  0.75389648, -0.73844219,
         0.74724916],
       [ 0.83643222,  0.85223209,  0.85737366,  0.87025839, -0.38284844,
         0.86673926,  0.88830257,  0.88885508,  0.90193789,  0.67112094,
         0.90017134,  0.92673952,  0.92753246,  0.95083171, -0.2916796 ,
         0.95081369,  0.94447161,  0.96096849,  0.96150053, -0.40409452,
         0.9616756 ,  0.9474978 ,  0.94980842,  0.94328789, -0.56637054,
         0.94123265,  0.91906104,  0.93186017,  0.93356215, -0.75090049,
         0.93433992,  0.94288998,  0.95396134,  0.9415165 , -0.62251011,
         0.94322007,  0.92741442,  0.93370737,  0.92219218, -0.82252115,
         0.91739687],
       [ 0.87172472,  0.85688096,  0.87128706,  0.88791875, -0.77702898,
         0.88625914,  0.91344846,  0.90461139,  0.9130822 , -0.57579492,
         0.91038734,  0.91001126,  0.9080339 ,  0.93595345, -0.80391277,
         0.93500652,  0.9556286 ,  0.95632127,  0.96708094, -0.50752941,
         0.96725621,  0.95215369,  0.95631667,  0.95446292, -0.90231808,
         0.95054424,  0.93209129,  0.93651055,  0.9214772 , -0.7555354 ,
         0.92411512,  0.94381937,  0.93261033,  0.95079644, -0.78132559,
         0.95342757,  0.95622368,  0.96061995,  0.96775289, -0.67383782,
         0.96667463],
       [ 1.08812341,  1.10884948,  1.09575649,  1.10913799, -0.33999285,
         1.10934347,  1.14628082,  1.1140776 ,  1.14154048, -0.15017111,
         1.14442639,  1.12097322,  1.09094896,  1.09589478, -0.37115379,
         1.09586775,  1.11926449,  1.10874991,  1.11217146, -0.51673526,
         1.11514221,  1.10672923,  1.09763879,  1.08949454, -0.80540872,
         1.09394269,  1.09683094,  1.07509166,  1.10368111, -0.60677685,
         1.09793675,  1.13434387,  1.11084481,  1.13175538, -0.59528065,
         1.13159481,  1.11420992,  1.10075101,  1.11745238, -0.63727922,
         1.11636743],
       [ 1.19678713,  1.16835496,  1.1736715 ,  1.14817668, -0.79939061,
         1.14094709,  1.12579157,  1.12334602,  1.11646579, -0.80277502,
         1.11192097,  1.08937539,  1.05659436,  1.05590945, -0.31521206,
         1.05216557,  1.05232253,  1.04554779,  1.06101775, -0.73483168,
         1.06119637,  1.05085856,  1.05394051,  1.06341947, -0.84018727,
         1.05855865,  1.06797824,  1.06486084,  1.0637078 , -0.82807542,
         1.07190998,  1.04976958,  1.0477201 ,  1.02874798, -0.70634091,
         1.02859188,  1.01477152,  1.00052482,  1.02075209, -0.43629434,
         1.01595238],
       [ 0.95716969,  0.96380487,  0.98166665,  0.97993852, -0.83361451,
         0.98385854,  1.0000621 ,  1.00749081,  0.98552019, -0.76067527,
         0.98189927,  0.99365256,  1.0055267 ,  1.01406433, -0.84484138,
         1.01404238,  1.01885156,  1.02974726,  1.03869613, -0.57923029,
         1.03887395,  1.01919851,  1.02604798,  1.01685684, -0.88187741,
         1.01572535,  1.00561916,  1.00905636,  1.01722721, -0.7739279 ,
         1.01706786,  1.02653488,  0.95303303,  0.93409254, -0.12567612,
         0.93208462,  0.93949572,  0.92349914,  0.92777104, -0.49285231,
         0.92948387],
       [ 1.00082093,  0.99541716,  0.99650761,  0.97622055, -0.66673515,
         0.97549287,  0.99819944,  0.99544187,  1.01245226, -0.89749945,
         1.01254724,  0.99272321,  1.0045982 ,  0.99453661, -0.91329403,
         0.99079654,  1.01048381,  1.01022896,  1.03032552, -0.74770491,
         1.02957295,  1.02757911,  1.03534549,  1.04106941, -0.54262113,
         1.04645359,  1.04843285,  1.05649017,  1.04976363, -0.54541947,
         1.05424896,  1.03954631,  1.04029366,  1.025964  , -0.8471186 ,
         1.02673597,  1.03057014,  1.04506979,  1.03655887, -0.64894065,
         1.03826684],
       [ 1.12248715,  1.1116388 ,  1.13100375,  1.11564444, -0.97157012,
         1.11492058,  1.13417354,  1.12890707,  1.13875441, -0.88159639,
         1.13885404,  1.13212539,  1.14573063,  1.13123066, -0.94749057,
         1.1302716 ,  1.13785947,  1.15057483,  1.15588463, -0.66410378,
         1.15792684,  1.14025164,  1.14784533,  1.14257593, -0.85415751,
         1.14236295,  1.13964464,  1.15321792,  1.14644325, -0.86890678,
         1.15184934,  1.13527326,  1.15447513,  1.13732335, -0.86080244,
         1.13716254,  1.12907921,  1.15457618,  1.13511878, -0.50381406,
         1.13589258],
       [ 1.1308459 ,  1.10420061,  1.09575649,  1.06173387, -0.71302323,
         1.06100854,  1.05314788,  1.0538329 ,  1.0561008 , -0.80832663,
         1.05898356,  1.06707104,  1.06866489,  1.04661053, -0.67326293,
         1.0503059 ,  1.03372755,  1.04740668,  1.03590593, -0.32550796,
         1.03608365,  1.02664793,  1.0502215 ,  1.04758817, -0.61144295,
         1.04459127,  1.03726406,  1.04532927,  1.02466411, -0.05595824,
         1.02450408,  1.06278101,  1.04400688,  1.02874798, -0.27474705,
         1.03601551,  1.02127684,  1.04506979,  1.03841849, -0.64322655,
         1.03547753],
       [ 1.03889967,  1.00843398,  1.03175487,  1.02548366, -0.92301055,
         1.02568685,  1.06059851,  1.05197921,  1.08303287, -0.40074556,
         1.08405918,  1.08008191,  1.08166393,  1.07543717, -0.26725351,
         1.07541141,  1.06533902,  1.04183002,  1.03404579, -0.69561325,
         1.03515355,  1.04992738,  1.05487026,  1.05131318, -0.23887868,
         1.05204054,  1.04005626,  1.05463002,  1.04046751, -0.51775715,
         1.03472888,  1.0479108 ,  1.05421823,  1.03895592, -0.09934509,
         1.03879938,  1.02592349,  0.98846056,  0.98262986,  0.07634245,
         0.98155093]])

In [130]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

knr = KNeighborsRegressor(n_neighbors=5, weights='distance')
knr.fit(train_features_scaled, train_target)


Out[130]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='distance')

In [145]:
knr_predictions = knr.predict(test_features_scaled)
plt.plot(knr_predictions[:200], c='blue')
plt.plot(test_target.values.tolist()[:200], c='green')
plt.show()



In [181]:
print("Правильность на обучающем наборе: %.4f" % knr.score(train_features_scaled, train_target))
print("Правильность на тестовом наборе: %.4f" % knr.score(test_features_scaled, test_target))


Правильность на обучающем наборе: 1.0000
Правильность на тестовом наборе: -1.5759

In [147]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(train_features_scaled, train_target)


Out[147]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [150]:
lr_predictions = lr.predict(test_features_scaled)
plt.plot(lr_predictions, c='blue')
plt.plot(test_target.values.tolist(), c='green')
plt.show()



In [180]:
print("Правильность на обучающем наборе: %.4f" % lr.score(train_features_scaled, train_target))
print("Правильность на тестовом наборе: %.4f" % lr.score(test_features_scaled, test_target))


Правильность на обучающем наборе: 0.9989
Правильность на тестовом наборе: 0.9998

In [18]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(train_features_scaled, train_target)


Out[18]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [24]:
p = ridge.predict([[0.68690351,  0.67092634,  0.63475935,  0.64903916, -0.73922116,
         0.64458445,  0.64056894,  0.62933942,  0.65490576, -0.7694075 ,
         0.65498757,  0.74830473,  0.67590819,  0.77136266,  0.07125648,
         0.77507511,  0.79106297,  0.7964571 ,  0.79594853,  0.89827193,
         0.79797789,  0.78361049,  0.79361029,  0.79056248, -0.33122932,
         0.79131609,  0.7720053 ,  0.77653772,  0.75879514, -0.65599519,
         0.75958877,  0.76165937,  0.77479855,  0.77911745, -0.92728655,
         0.77804419,  0.77221618,  0.75552748,  0.75389648, -0.73844219,
         0.74724916]])

In [26]:
print(p)

test_target


[73.62203801]
Out[26]:
1000     73.76
1001     75.21
1002     76.90
1003     77.31
1004     75.20
1005     75.80
1006     76.82
1007     77.82
1008     76.51
1009     75.56
1010     77.03
1011     76.80
1012     75.62
1013     77.00
1014     78.45
1015     79.70
1016     79.81
1017     80.03
1018     78.93
1019     78.75
1020     79.65
1021     78.65
1022     77.60
1023     79.09
1024     79.09
1025     77.29
1026     76.26
1027     75.13
1028     76.10
1029     76.80
         ...  
1560    197.62
1561    195.49
1562    198.68
1563    198.27
1564    195.29
1565    196.70
1566    196.03
1567    197.00
1568    193.80
1569    195.50
1570    196.00
1571    196.02
1572    195.39
1573    191.30
1574    193.10
1575    192.50
1576    189.32
1577    189.20
1578    188.00
1579    190.14
1580    189.26
1581    187.45
1582    185.00
1583    183.50
1584    188.44
1585    186.43
1586    191.50
1587    179.90
1588    178.27
1589    183.66
Name: CLOSE9, Length: 589, dtype: float64

In [182]:
ridge_predictions = ridge.predict(test_features_scaled)
plt.plot(ridge_predictions, c='blue')
plt.plot(test_target.values.tolist(), c='green')
plt.show()



In [183]:
print("Правильность на обучающем наборе: %.4f" % ridge.score(train_features_scaled, train_target))
print("Правильность на тестовом наборе: %.4f" % ridge.score(test_features_scaled, test_target))


Правильность на обучающем наборе: 0.9988
Правильность на тестовом наборе: 0.9997

In [184]:
from sklearn.linear_model import Lasso

lasso = Lasso()
lasso.fit(train_features_scaled, train_target)


/usr/local/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Out[184]:
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [185]:
lasso_predictions = lasso.predict(test_features_scaled)
plt.plot(lasso_predictions, c='blue')
plt.plot(test_target.values.tolist(), c='green')
plt.show()



In [192]:
print("Правильность на обучающем наборе: %.4f" % lasso.score(train_features_scaled, train_target))
print("Правильность на тестовом наборе: %.4f" % lasso.score(test_features_scaled, test_target))
print("Количество использованных признаков: %d" % np.sum(lasso.coef_ != 0))


Правильность на обучающем наборе: 0.9900
Правильность на тестовом наборе: 0.9657
Количество использованных признаков: 8

In [195]:
to_day_data = [[182.8000000,184.5000000,182.4000000,183.9400000,1073500,
    183.8500000,184.0200000,183.3800000,183.9000000,435300,
    183.8300000,184.4500000,183.1300000,183.3400000,603300,
    183.3400000,183.6900000,182.6400000,182.7500000,351600,
    182.7500000,183.8900000,182.7500000,183.0600000,248400,
    183.0600000,183.1800000,181.7800000,182.3800000,507800,
    182.2400000,183.4000000,182.0300000,183.2400000,525900,
    183.1400000,183.7500000,182.9000000,183.3000000,556000,
    183.3000000]]

print("Прогноз от knr: %.4f" % knr.predict(scaler.transform(to_day_data))[0])
print("Прогноз от lr: %.4f" % lr.predict(scaler.transform(to_day_data))[0])
print("Прогноз от ridge: %.4f" % ridge.predict(scaler.transform(to_day_data))[0])
print("Прогноз от lasso: %.4f" % lasso.predict(scaler.transform(to_day_data))[0])
print("Реальное значение: %.4f" % 182.9100000)


Прогноз от knr: 85.2905
Прогноз от lr: 183.4307
Прогноз от ridge: 183.2162
Прогноз от lasso: 172.3347
Реальное значение: 182.9100

In [1]:
res = pd.DataFrame(test_data["CLOSE9"].tolist(), columns = ["ACTUAL_CLOSE9"])
res["OPEN9"] = pd.Series(test_features["OPEN9"].tolist())
res["PREDICTED_CLOSE9"] = pd.Series(lr_predictions)
res["RESULT"] = np.where(res['PREDICTED_CLOSE9'] >= res['OPEN9'], res.ACTUAL_CLOSE9 - res.OPEN9, (res.OPEN9 - res.ACTUAL_CLOSE9))

print(np.sum(res["RESULT"]))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-0b80c4929cfe> in <module>()
----> 1 res = pd.DataFrame(test_data["CLOSE9"].tolist(), columns = ["ACTUAL_CLOSE9"])
      2 res["OPEN9"] = pd.Series(test_features["OPEN9"].tolist())
      3 res["PREDICTED_CLOSE9"] = pd.Series(lr_predictions)
      4 res["RESULT"] = np.where(res['PREDICTED_CLOSE9'] >= res['OPEN9'], res.ACTUAL_CLOSE9 - res.OPEN9, (res.OPEN9 - res.ACTUAL_CLOSE9))
      5 

NameError: name 'pd' is not defined

In [ ]: