In [20]:
import warnings
warnings.filterwarnings('ignore')

In [399]:
import numpy as np
from pprint import pprint
import pandas as pd
from scipy import stats
import quandl
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARMA
from matplotlib import pylab as plt

%matplotlib inline

In [7]:
def get_dataframe (name) :
    return quandl.get(name, start_date='2015-1-1', end_date='2017-11-1',
                      collapse='weekly')

In [9]:
dataframe1 = get_dataframe("NIKKEI/INDEX.4")
dataframe2 = get_dataframe("TSE/2502.4")
dataframe3 = get_dataframe("TSE/9020.4")
dataframe4 = get_dataframe("TSE/7203.4")
dataframe5 = get_dataframe("TSE/4503.4")
dataframe6 = get_dataframe("TSE/6758.4")

In [333]:
ts1 = dataframe1['Close Price'].rename('Close')
ts2 = dataframe2['Close'][:120]
ts3 = dataframe3['Close'][:120]
ts4 = dataframe4['Close'][:120]
ts5 = dataframe5['Close'][:120]
ts6 = dataframe6['Close'][:120]

In [436]:
pprint(sm.tsa.stattools.adfuller(ts2, regression='ctt', autolag = 'AIC'))
pprint(sm.tsa.stattools.adfuller(ts3, regression='ct', autolag = 'AIC'))
pprint(sm.tsa.stattools.adfuller(ts4, regression='ctt', autolag = 'AIC'))
pprint(sm.tsa.stattools.adfuller(ts5, regression='ct', autolag = 'AIC'))
pprint(sm.tsa.stattools.adfuller(ts6, regression='ctt', autolag = 'AIC'))


(-3.4243993894646616,
 0.13326170365782197,
 0,
 119,
 {'1%': -4.471237472469957,
  '10%': -3.584424431130831,
  '5%': -3.882969922547368},
 1307.5889250985836)
(-2.904793565308496,
 0.16067015581233307,
 0,
 119,
 {'1%': -4.036933565633866,
  '10%': -3.1490681814297643,
  '5%': -3.4480491338265407},
 1537.811468467928)
(-2.6037424086219305,
 0.5096036701299048,
 0,
 119,
 {'1%': -4.471237472469957,
  '10%': -3.584424431130831,
  '5%': -3.882969922547368},
 1449.4672932660785)
(-3.4174043810519117,
 0.04910564863187634,
 0,
 119,
 {'1%': -4.036933565633866,
  '10%': -3.1490681814297643,
  '5%': -3.4480491338265407},
 1143.8350636832258)
(-4.137776413232316,
 0.02100681603520161,
 1,
 118,
 {'1%': -4.472110860871852,
  '10%': -3.584692378310343,
  '5%': -3.883407308731662},
 1337.068933005826)

In [436]:


In [402]:


In [402]:


In [403]:


In [447]:
pprint(sm.tsa.arma_order_select_ic(ts2, ic='aic'))
pprint(sm.tsa.arma_order_select_ic(ts3, ic='aic'))
pprint(sm.tsa.arma_order_select_ic(ts4, ic='aic'))
pprint(sm.tsa.arma_order_select_ic(ts5, ic='aic'))
pprint(sm.tsa.arma_order_select_ic((ts6 - ts6.shift(1)).dropna(), ic='aic'))


{'aic':              0            1            2
0  1679.536657  1577.743635  1763.332522
1  1488.936634  1490.614433  1488.660757
2  1490.733981  1490.708809  1490.657758
3  1489.374765  1491.192556  1492.459610
4  1491.350072  1493.041001  1494.459551, 'aic_min_order': (1, 2)}
{'aic':              0            1            2
0  1979.585644  1869.081432  1809.600127
1  1737.428598  1738.866964  1739.981652
2  1738.990991  1740.494877  1741.980017
3  1739.657859  1741.646606  1735.021824
4  1741.609854  1743.301390          NaN,
 'aic_min_order': (3, 2)}
{'aic':              0            1            2
0  2002.003998  1868.465319  1964.683360
1  1640.157010  1641.641261  1643.141242
2  1641.711249  1642.628899  1644.606973
3  1643.150901  1644.607821  1647.420408
4  1645.148941  1646.595370  1648.894668,
 'aic_min_order': (1, 0)}
{'aic':              0            1            2
0  1545.805878  1436.081643  1388.521781
1  1302.593986  1303.591920  1304.847993
2  1303.801149  1305.297028  1306.731610
3  1304.443329  1306.432100  1292.806527
4  1306.403755  1310.297566  1294.654617,
 'aic_min_order': (3, 2)}
{'aic':              0            1            2
0  1525.435271  1522.249768  1523.248768
1  1521.388158  1522.796172  1524.758002
2  1522.815016  1524.769149          NaN
3  1524.725640  1526.723993          NaN
4  1526.710486  1528.435723          NaN,
 'aic_min_order': (1, 0)}

In [446]:


In [446]:


In [446]:


In [446]:


In [266]:
ts2_arima = ARMA(ts2, order = (1, 2)).fit()
ts3_arima = ARMA(ts3, order = (3, 2)).fit()
ts4_arima = ARMA(ts4, order = (1, 0)).fit()
ts5_arima = ARMA(ts5, order = (3, 2)).fit()
ts6_arima = ARIMA(ts6, order = (1, 1, 0)).fit()

In [448]:
r_forest = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', random_state = 1, n_jobs = 1)
r_forest.fit(np.vstack((ts2.values,ts3.values,ts4.values,ts5.values,ts6.values)).T,
             ts1[:120].values)


Out[448]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [448]:


In [338]:
ts2_predict = ts2_arima.predict('2015-1-18','2017-11-5')
ts3_predict = ts3_arima.predict('2015-1-18','2017-11-5')
ts4_predict = ts4_arima.predict('2015-1-18','2017-11-5')
ts5_predict = ts5_arima.predict('2015-1-18','2017-11-5')
ts6_predict = ts6_arima.predict('2015-1-18','2017-11-5')

In [449]:
ts1_pred_with_RF = r_forest.predict(
    np.vstack((ts2_predict, ts3_predict, ts4_predict, ts5_predict, ts6_predict)).T)
ts1_pred_with_RF_dataframe = \
    pd.DataFrame({'Close': ts1_pred_with_RF.tolist()},index =  pd.DatetimeIndex(periods = 147, freq = 'W', start = '2015-1-18'))
hold(True)
plt.plot(ts1_pred_with_RF_dataframe)
plt.plot(ts1)


Out[449]:
[<matplotlib.lines.Line2D at 0x7ffa86555c50>]

In [341]:


In [449]:


In [276]:
sm.tsa.stattools.adfuller(ts1[:120], regression='ctt', autolag = 'AIC')


Out[276]:
(-2.8812138487142485,
 0.3547039460728244,
 0,
 119,
 {'1%': -4.471237472469957,
  '10%': -3.584424431130831,
  '5%': -3.882969922547368},
 1624.5165654066914)

In [254]:
sm.tsa.arma_order_select_ic(ts1[:120], ic='aic', trend='nc')


Out[254]:
{'aic':              0            1            2
 0          NaN  2542.277892  2404.711719
 1  1844.871705  1846.440274  1848.143599
 2  1846.488236          NaN  1849.942657
 3  1848.083854          NaN          NaN
 4  1850.083283  1851.458068  1853.167285, 'aic_min_order': (1, 0)}

In [321]:
ts1_arima = ARMA(ts1[:120], order = (1, 0)).fit()
ts1_pred_with_ARMA_dataframe = ts1_arima.predict('2015-1-18','2017-11-5')
hold(True)
plt.plot(ts1_pred_with_ARMA_dataframe)
plt.plot(ts1)


Out[321]:
[<matplotlib.lines.Line2D at 0x7ffa85d25278>]

In [301]:
ts1_s1 = ts1.shift(1).dropna()[:120]
ts1_y = ts1[1:][:120]
r_forest2 = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', random_state = 1, n_jobs = 1)
r_forest.fit(np.vstack((ts1_s1.values)),ts1_y.values)
ts1_pred_with_RF2 = r_forest.predict(np.vstack((ts1.shift(1).dropna().values)))
ts1_pred_with_RF2_dataframe = \
    pd.DataFrame({'Close': ts1_pred_with_RF2.T.tolist()},
                 index =  pd.DatetimeIndex(periods = 147,
                                           freq = 'W', start = '2015-1-18'))
hold(True)
plt.plot(ts1_pred_with_RF2_dataframe)
plt.plot(ts1)


Out[301]:
[<matplotlib.lines.Line2D at 0x7ffa85aeb240>]

In [451]:
train_with_ARMA_resid = ts1[1:][:120] - ts1_pred_with_ARMA_dataframe.tolist()[:120]
train_with_RF2_resid = ts1[1:][:120] - ts1_pred_with_RF2[:120]
MSE_train_RF = np.array([(elem * elem) for elem in train_with_RF_resid]).mean()
MSE_train_ARMA = np.array([(elem * elem) for elem in train_with_ARMA_resid]).mean()
MSE_train_RF2 = np.array([(elem * elem) for elem in train_with_RF2_resid]).mean()
print('train: MSE')
print('RandomForest and ARIMA model: ', MSE_train_RF)
print('      AR(I)MA model         : ', MSE_train_ARMA)
print('      Random Forest         : ', MSE_train_RF2)


train: MSE
RandomForest and ARIMA model:  1216335.6349572965
      AR(I)MA model         :  247038.67430585716
      Random Forest         :  42615.21572554316

In [394]:
ts1_pred_with_ARMA_dataframe2 = ts1_arima.predict('2015-1-18','2017-11-5')
train_with_ARMA_resid2 = ts1[:120] - ts1_pred_with_ARMA_dataframe2.tolist()[:120]
MSE_train_ARMA2 = np.array([(elem * elem) for elem in train_with_ARMA_resid2]).mean()
print('train: MSE')
print('      AR(I)MA model?        : ', MSE_train_ARMA2)


train: MSE
      AR(I)MA model?        :  8580.270341196769

In [361]:
pred_with_RF_resid = ts1[1:][120:] - ts1_pred_with_RF[120:]
pred_with_ARMA_resid = ts1[1:][120:] - ts1_pred_with_ARMA_dataframe.tolist()[120:]
pred_with_RF2_resid = ts1[1:][120:] - ts1_pred_with_RF2[120:]
MSE_pred_RF = np.array([(elem * elem) for elem in pred_with_RF_resid]).mean()
MSE_pred_ARMA = np.array([(elem * elem) for elem in pred_with_ARMA_resid]).mean()
MSE_pred_RF2 = np.array([(elem * elem) for elem in pred_with_RF2_resid]).mean()
print('predict: MSE')
print('RandomForest and ARIMA model: ', MSE_pred_RF)
print('      AR(I)MA model         : ', MSE_pred_ARMA)
print('      Random Forest         : ', MSE_pred_RF2)

pred_with_ARMA_resid2 = ts1[1:][120:] - ts1_pred_with_ARMA_dataframe2.tolist()[120:]
MSE_pred_ARMA2 = np.array([(elem * elem) for elem in pred_with_ARMA_resid2]).mean()
print('      AR(I)MA model?        : ', MSE_pred_ARMA2)


predict: MSE
RandomForest and ARIMA model:  7279251.12718855
      AR(I)MA model         :  4002781.362739798
      Random Forest         :  415257.4516347963
      AR(I)MA model?        :  4002781.362739798

In [369]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig = plt.figure(figsize = (12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(ts1, lags=40, ax = ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(ts1, lags=40, ax = ax2)



In [469]:
ts1_SARIMA = sm.tsa.SARIMAX(ts1, order=(1,0,0), seasonal_order=(1,1,0,12)).fit()
ts1_pred_with_SARIMA_dataframe =  ts1_SARIMA.predict('2015-1-18','2017-11-5')

hold(True)
plt.plot(ts1_pred_with_SARIMA_dataframe)
plt.plot(ts1)


Out[469]:
[<matplotlib.lines.Line2D at 0x7ffa857bec88>]

In [382]:
plt.plot(ts1_pred_with_SARIMA_dataframe[15:])
plt.plot(ts1[15:])


Out[382]:
[<matplotlib.lines.Line2D at 0x7ffa8686da58>]

In [393]:
pred_with_SARIMA_resid2 = ts1[15:][1:][105:] - ts1_pred_with_SARIMA_dataframe.tolist()[15:][105:]
MSE_pred_SARIMA = np.array([(elem * elem) for elem in pred_with_SARIMA_resid2]).mean()
print('train: MSE')
print('      SARIMA model?        : ', MSE_pred_SARIMA)
pred_with_SARIMA_resid2 = ts1[15:][1:][:105] - ts1_pred_with_SARIMA_dataframe.tolist()[15:][:105]
MSE_pred_SARIMA = np.array([(elem * elem) for elem in pred_with_SARIMA_resid2]).mean()
print('predict: MSE')
print('      SARIMA model?        : ', MSE_pred_SARIMA)


train: MSE
      SARIMA model?        :  301597.867612935
predict: MSE
      SARIMA model?        :  568400.3322973659

In [327]:


In [395]:
ts2.tail()


Out[395]:
Date
2017-03-26    4267.0
2017-04-02    4208.0
2017-04-09    4305.0
2017-04-16    4287.0
2017-04-23    4172.0
Name: Close, dtype: float64

In [396]:
ts1[:120].tail()


Out[396]:
Date
2017-03-26    19262.53
2017-04-02    18909.26
2017-04-09    18664.63
2017-04-16    18335.63
2017-04-23    18620.75
Name: Close, dtype: float64

In [ ]:


In [387]: