In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/data/nifty50-index.csv")
df.head()


Out[2]:
Date Open High Low Close Shares Traded Turnover (Rs. Cr)
0 01-Jan-15 8272.80 8294.70 8248.75 8284.00 56560411 2321.88
1 02-Jan-15 8288.70 8410.60 8288.70 8395.45 101887024 4715.72
2 05-Jan-15 8407.95 8445.60 8363.90 8378.40 118160545 5525.52
3 06-Jan-15 8325.30 8327.85 8111.35 8127.35 172799618 8089.19
4 07-Jan-15 8118.65 8151.20 8065.45 8102.10 164075424 7464.33

In [3]:
df.index


Out[3]:
RangeIndex(start=0, stop=1233, step=1)

In [4]:
df.tail()


Out[4]:
Date Open High Low Close Shares Traded Turnover (Rs. Cr)
1228 24-Dec-19 12269.25 12283.70 12202.10 12214.55 470290298 13864.56
1229 26-Dec-19 12211.85 12221.55 12118.85 12126.55 520326632 16362.31
1230 27-Dec-19 12172.90 12258.45 12157.90 12245.80 383788556 13676.20
1231 30-Dec-19 12274.90 12286.45 12213.80 12255.85 411084614 14556.73
1232 31-Dec-19 12247.10 12247.10 12151.80 12168.45 426931711 14812.89

In [5]:
pd.to_datetime(df.Date)


Out[5]:
0      2015-01-01
1      2015-01-02
2      2015-01-05
3      2015-01-06
4      2015-01-07
          ...    
1228   2019-12-24
1229   2019-12-26
1230   2019-12-27
1231   2019-12-30
1232   2019-12-31
Name: Date, Length: 1233, dtype: datetime64[ns]

In [6]:
df.index = pd.to_datetime(df.Date)

In [7]:
df.head()


Out[7]:
Date Open High Low Close Shares Traded Turnover (Rs. Cr)
Date
2015-01-01 01-Jan-15 8272.80 8294.70 8248.75 8284.00 56560411 2321.88
2015-01-02 02-Jan-15 8288.70 8410.60 8288.70 8395.45 101887024 4715.72
2015-01-05 05-Jan-15 8407.95 8445.60 8363.90 8378.40 118160545 5525.52
2015-01-06 06-Jan-15 8325.30 8327.85 8111.35 8127.35 172799618 8089.19
2015-01-07 07-Jan-15 8118.65 8151.20 8065.45 8102.10 164075424 7464.33

In [8]:
price = df[["Close"]]

In [9]:
price.head(10)


Out[9]:
Close
Date
2015-01-01 8284.00
2015-01-02 8395.45
2015-01-05 8378.40
2015-01-06 8127.35
2015-01-07 8102.10
2015-01-08 8234.60
2015-01-09 8284.50
2015-01-12 8323.00
2015-01-13 8299.40
2015-01-14 8277.55

In [10]:
price.index


Out[10]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-05', '2015-01-06',
               '2015-01-07', '2015-01-08', '2015-01-09', '2015-01-12',
               '2015-01-13', '2015-01-14',
               ...
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=1233, freq=None)

In [11]:
price = price.asfreq("d", method="ffill")

In [12]:
price.index


Out[12]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10',
               ...
               '2019-12-22', '2019-12-23', '2019-12-24', '2019-12-25',
               '2019-12-26', '2019-12-27', '2019-12-28', '2019-12-29',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=1826, freq='D')

In [13]:
price


Out[13]:
Close
Date
2015-01-01 8284.00
2015-01-02 8395.45
2015-01-03 8395.45
2015-01-04 8395.45
2015-01-05 8378.40
... ...
2019-12-27 12245.80
2019-12-28 12245.80
2019-12-29 12245.80
2019-12-30 12255.85
2019-12-31 12168.45

1826 rows × 1 columns


In [14]:
price.asfreq("h", method="ffill").head(50)


Out[14]:
Close
Date
2015-01-01 00:00:00 8284.00
2015-01-01 01:00:00 8284.00
2015-01-01 02:00:00 8284.00
2015-01-01 03:00:00 8284.00
2015-01-01 04:00:00 8284.00
2015-01-01 05:00:00 8284.00
2015-01-01 06:00:00 8284.00
2015-01-01 07:00:00 8284.00
2015-01-01 08:00:00 8284.00
2015-01-01 09:00:00 8284.00
2015-01-01 10:00:00 8284.00
2015-01-01 11:00:00 8284.00
2015-01-01 12:00:00 8284.00
2015-01-01 13:00:00 8284.00
2015-01-01 14:00:00 8284.00
2015-01-01 15:00:00 8284.00
2015-01-01 16:00:00 8284.00
2015-01-01 17:00:00 8284.00
2015-01-01 18:00:00 8284.00
2015-01-01 19:00:00 8284.00
2015-01-01 20:00:00 8284.00
2015-01-01 21:00:00 8284.00
2015-01-01 22:00:00 8284.00
2015-01-01 23:00:00 8284.00
2015-01-02 00:00:00 8395.45
2015-01-02 01:00:00 8395.45
2015-01-02 02:00:00 8395.45
2015-01-02 03:00:00 8395.45
2015-01-02 04:00:00 8395.45
2015-01-02 05:00:00 8395.45
2015-01-02 06:00:00 8395.45
2015-01-02 07:00:00 8395.45
2015-01-02 08:00:00 8395.45
2015-01-02 09:00:00 8395.45
2015-01-02 10:00:00 8395.45
2015-01-02 11:00:00 8395.45
2015-01-02 12:00:00 8395.45
2015-01-02 13:00:00 8395.45
2015-01-02 14:00:00 8395.45
2015-01-02 15:00:00 8395.45
2015-01-02 16:00:00 8395.45
2015-01-02 17:00:00 8395.45
2015-01-02 18:00:00 8395.45
2015-01-02 19:00:00 8395.45
2015-01-02 20:00:00 8395.45
2015-01-02 21:00:00 8395.45
2015-01-02 22:00:00 8395.45
2015-01-02 23:00:00 8395.45
2015-01-03 00:00:00 8395.45
2015-01-03 01:00:00 8395.45

In [15]:
price.asfreq("y", method="ffill").head(50)


Out[15]:
Close
Date
2015-12-31 7896.25
2016-12-31 8185.80
2017-12-31 10530.70
2018-12-31 10862.55
2019-12-31 12168.45

In [16]:
price.resample("1m").mean()


Out[16]:
Close
Date
2015-01-31 8534.720968
2015-02-28 8758.116071
2015-03-31 8670.338710
2015-04-30 8550.611667
2015-05-31 8298.500000
2015-06-30 8190.355000
2015-07-31 8483.308065
2015-08-31 8333.683871
2015-09-30 7821.831667
2015-10-31 8163.330645
2015-11-30 7887.303333
2015-12-31 7790.312903
2016-01-31 7552.598387
2016-02-29 7193.734483
2016-03-31 7566.227419
2016-04-30 7788.623333
2016-05-31 7869.033871
2016-06-30 8183.521667
2016-07-31 8490.112903
2016-08-31 8643.579032
2016-09-30 8792.241667
2016-10-31 8661.932258
2016-11-30 8246.631667
2016-12-31 8117.480645
2017-01-31 8392.527419
2017-02-28 8820.869643
2017-03-31 9037.745161
2017-04-30 9204.785000
2017-05-31 9430.198387
2017-06-30 9609.683333
2017-07-31 9834.132258
2017-08-31 9887.720968
2017-09-30 9974.761667
2017-10-31 10120.348387
2017-11-30 10334.656667
2017-12-31 10336.356452
2018-01-31 10788.514516
2018-02-28 10535.233929
2018-03-31 10225.032258
2018-04-30 10472.746667
2018-05-31 10664.832258
2018-06-30 10750.785000
2018-07-31 10989.675806
2018-08-31 11487.404839
2018-09-30 11322.266667
2018-10-31 10379.559677
2018-11-30 10608.510000
2018-12-31 10781.030645
2019-01-31 10807.653226
2019-02-28 10835.121429
2019-03-31 11297.588710
2019-04-30 11686.363333
2019-05-31 11588.950000
2019-06-30 11840.750000
2019-07-31 11521.482258
2019-08-31 10989.051613
2019-09-30 11134.281667
2019-10-31 11466.019355
2019-11-30 11948.978333
2019-12-31 12108.280645

In [17]:
price.resample("3m").mean()


Out[17]:
Close
Date
2015-01-31 8534.720968
2015-04-30 8657.596629
2015-07-31 8325.507609
2015-10-31 8109.373913
2016-01-31 7741.840761
2016-04-30 7520.333889
2016-07-31 8180.860870
2016-10-31 8698.240217
2017-01-31 8252.273913
2017-04-30 9025.820225
2017-07-31 9624.834239
2017-10-31 9994.489130
2018-01-31 10488.159783
2018-04-30 10406.123034
2018-07-31 10802.318478
2018-10-31 11060.259783
2019-01-31 10733.744565
2019-04-30 11283.141011
2019-07-31 11648.325000
2019-10-31 11197.126630
2020-01-31 12029.935246

In [18]:
price["diff1"] = price.diff(1)
price.head(20)


Out[18]:
Close diff1
Date
2015-01-01 8284.00 NaN
2015-01-02 8395.45 111.45
2015-01-03 8395.45 0.00
2015-01-04 8395.45 0.00
2015-01-05 8378.40 -17.05
2015-01-06 8127.35 -251.05
2015-01-07 8102.10 -25.25
2015-01-08 8234.60 132.50
2015-01-09 8284.50 49.90
2015-01-10 8284.50 0.00
2015-01-11 8284.50 0.00
2015-01-12 8323.00 38.50
2015-01-13 8299.40 -23.60
2015-01-14 8277.55 -21.85
2015-01-15 8494.15 216.60
2015-01-16 8513.80 19.65
2015-01-17 8513.80 0.00
2015-01-18 8513.80 0.00
2015-01-19 8550.70 36.90
2015-01-20 8695.60 144.90

In [19]:
price = df[["Close"]]
price = price.asfreq("B")
price.head(20)


Out[19]:
Close
Date
2015-01-01 8284.00
2015-01-02 8395.45
2015-01-05 8378.40
2015-01-06 8127.35
2015-01-07 8102.10
2015-01-08 8234.60
2015-01-09 8284.50
2015-01-12 8323.00
2015-01-13 8299.40
2015-01-14 8277.55
2015-01-15 8494.15
2015-01-16 8513.80
2015-01-19 8550.70
2015-01-20 8695.60
2015-01-21 8729.50
2015-01-22 8761.40
2015-01-23 8835.60
2015-01-26 NaN
2015-01-27 8910.50
2015-01-28 8914.30
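
Note: reindexing to business-day frequency ("B") inserts NaN for exchange holidays such as 2015-01-26. The gaps are kept here and removed later with dropna(); if a filled series were preferred instead, a sketch:
price = df[["Close"]].asfreq("B", method="ffill")  # carry the last close across holidays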

In [20]:
price.index


Out[20]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-05', '2015-01-06',
               '2015-01-07', '2015-01-08', '2015-01-09', '2015-01-12',
               '2015-01-13', '2015-01-14',
               ...
               '2019-12-18', '2019-12-19', '2019-12-20', '2019-12-23',
               '2019-12-24', '2019-12-25', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=1304, freq='B')

In [21]:
price["diff1"] = price.diff(1)
price.head(20)


Out[21]:
Close diff1
Date
2015-01-01 8284.00 NaN
2015-01-02 8395.45 111.45
2015-01-05 8378.40 -17.05
2015-01-06 8127.35 -251.05
2015-01-07 8102.10 -25.25
2015-01-08 8234.60 132.50
2015-01-09 8284.50 49.90
2015-01-12 8323.00 38.50
2015-01-13 8299.40 -23.60
2015-01-14 8277.55 -21.85
2015-01-15 8494.15 216.60
2015-01-16 8513.80 19.65
2015-01-19 8550.70 36.90
2015-01-20 8695.60 144.90
2015-01-21 8729.50 33.90
2015-01-22 8761.40 31.90
2015-01-23 8835.60 74.20
2015-01-26 NaN NaN
2015-01-27 8910.50 NaN
2015-01-28 8914.30 3.80

In [22]:
price["pct1"] = price.Close.diff(1)/price.Close
price.head(20)


Out[22]:
Close diff1 pct1
Date
2015-01-01 8284.00 NaN NaN
2015-01-02 8395.45 111.45 0.013275
2015-01-05 8378.40 -17.05 -0.002035
2015-01-06 8127.35 -251.05 -0.030890
2015-01-07 8102.10 -25.25 -0.003116
2015-01-08 8234.60 132.50 0.016091
2015-01-09 8284.50 49.90 0.006023
2015-01-12 8323.00 38.50 0.004626
2015-01-13 8299.40 -23.60 -0.002844
2015-01-14 8277.55 -21.85 -0.002640
2015-01-15 8494.15 216.60 0.025500
2015-01-16 8513.80 19.65 0.002308
2015-01-19 8550.70 36.90 0.004315
2015-01-20 8695.60 144.90 0.016664
2015-01-21 8729.50 33.90 0.003883
2015-01-22 8761.40 31.90 0.003641
2015-01-23 8835.60 74.20 0.008398
2015-01-26 NaN NaN NaN
2015-01-27 8910.50 NaN NaN
2015-01-28 8914.30 3.80 0.000426

In [23]:
price["pct1"] = price.Close.pct_change(1)
price.head(20)


Out[23]:
Close diff1 pct1
Date
2015-01-01 8284.00 NaN NaN
2015-01-02 8395.45 111.45 0.013454
2015-01-05 8378.40 -17.05 -0.002031
2015-01-06 8127.35 -251.05 -0.029964
2015-01-07 8102.10 -25.25 -0.003107
2015-01-08 8234.60 132.50 0.016354
2015-01-09 8284.50 49.90 0.006060
2015-01-12 8323.00 38.50 0.004647
2015-01-13 8299.40 -23.60 -0.002836
2015-01-14 8277.55 -21.85 -0.002633
2015-01-15 8494.15 216.60 0.026167
2015-01-16 8513.80 19.65 0.002313
2015-01-19 8550.70 36.90 0.004334
2015-01-20 8695.60 144.90 0.016946
2015-01-21 8729.50 33.90 0.003899
2015-01-22 8761.40 31.90 0.003654
2015-01-23 8835.60 74.20 0.008469
2015-01-26 NaN NaN 0.000000
2015-01-27 8910.50 NaN 0.008477
2015-01-28 8914.30 3.80 0.000426
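
Note: pct_change(1) divides the one-day difference by the previous close (Close / Close.shift(1) - 1), whereas the cell above divided by the current close, which explains the small differences between the two pct1 columns. A quick check, as a sketch:
manual_pct = price.Close.diff(1) / price.Close.shift(1)      # same definition as pct_change(1)
(manual_pct - price.Close.pct_change(1)).abs().max()         # effectively zero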

In [24]:
price["lag1"] = price.Close.pct_change(1).shift(1) * 100
price["lag2"] = price.Close.pct_change(1).shift(2) * 100
price["lag3"] = price.Close.pct_change(1).shift(3) * 100
price["lag4"] = price.Close.pct_change(1).shift(4) * 100

price.head(20)


Out[24]:
Close diff1 pct1 lag1 lag2 lag3 lag4
Date
2015-01-01 8284.00 NaN NaN NaN NaN NaN NaN
2015-01-02 8395.45 111.45 0.013454 NaN NaN NaN NaN
2015-01-05 8378.40 -17.05 -0.002031 1.345365 NaN NaN NaN
2015-01-06 8127.35 -251.05 -0.029964 -0.203086 1.345365 NaN NaN
2015-01-07 8102.10 -25.25 -0.003107 -2.996395 -0.203086 1.345365 NaN
2015-01-08 8234.60 132.50 0.016354 -0.310679 -2.996395 -0.203086 1.345365
2015-01-09 8284.50 49.90 0.006060 1.635378 -0.310679 -2.996395 -0.203086
2015-01-12 8323.00 38.50 0.004647 0.605980 1.635378 -0.310679 -2.996395
2015-01-13 8299.40 -23.60 -0.002836 0.464723 0.605980 1.635378 -0.310679
2015-01-14 8277.55 -21.85 -0.002633 -0.283552 0.464723 0.605980 1.635378
2015-01-15 8494.15 216.60 0.026167 -0.263272 -0.283552 0.464723 0.605980
2015-01-16 8513.80 19.65 0.002313 2.616716 -0.263272 -0.283552 0.464723
2015-01-19 8550.70 36.90 0.004334 0.231336 2.616716 -0.263272 -0.283552
2015-01-20 8695.60 144.90 0.016946 0.433414 0.231336 2.616716 -0.263272
2015-01-21 8729.50 33.90 0.003899 1.694598 0.433414 0.231336 2.616716
2015-01-22 8761.40 31.90 0.003654 0.389852 1.694598 0.433414 0.231336
2015-01-23 8835.60 74.20 0.008469 0.365428 0.389852 1.694598 0.433414
2015-01-26 NaN NaN 0.000000 0.846897 0.365428 0.389852 1.694598
2015-01-27 8910.50 NaN 0.008477 0.000000 0.846897 0.365428 0.389852
2015-01-28 8914.30 3.80 0.000426 0.847707 0.000000 0.846897 0.365428
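
Note: the four shift calls above could equally be written as a loop; a sketch that produces the same columns:
for k in range(1, 5):
    price[f"lag{k}"] = price.Close.pct_change(1).shift(k) * 100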

In [25]:
price["lag1"] = price.Close.shift(1)
price["lag2"] = price.Close.shift(2)
price.head(20)


Out[25]:
Close diff1 pct1 lag1 lag2 lag3 lag4
Date
2015-01-01 8284.00 NaN NaN NaN NaN NaN NaN
2015-01-02 8395.45 111.45 0.013454 8284.00 NaN NaN NaN
2015-01-05 8378.40 -17.05 -0.002031 8395.45 8284.00 NaN NaN
2015-01-06 8127.35 -251.05 -0.029964 8378.40 8395.45 NaN NaN
2015-01-07 8102.10 -25.25 -0.003107 8127.35 8378.40 1.345365 NaN
2015-01-08 8234.60 132.50 0.016354 8102.10 8127.35 -0.203086 1.345365
2015-01-09 8284.50 49.90 0.006060 8234.60 8102.10 -2.996395 -0.203086
2015-01-12 8323.00 38.50 0.004647 8284.50 8234.60 -0.310679 -2.996395
2015-01-13 8299.40 -23.60 -0.002836 8323.00 8284.50 1.635378 -0.310679
2015-01-14 8277.55 -21.85 -0.002633 8299.40 8323.00 0.605980 1.635378
2015-01-15 8494.15 216.60 0.026167 8277.55 8299.40 0.464723 0.605980
2015-01-16 8513.80 19.65 0.002313 8494.15 8277.55 -0.283552 0.464723
2015-01-19 8550.70 36.90 0.004334 8513.80 8494.15 -0.263272 -0.283552
2015-01-20 8695.60 144.90 0.016946 8550.70 8513.80 2.616716 -0.263272
2015-01-21 8729.50 33.90 0.003899 8695.60 8550.70 0.231336 2.616716
2015-01-22 8761.40 31.90 0.003654 8729.50 8695.60 0.433414 0.231336
2015-01-23 8835.60 74.20 0.008469 8761.40 8729.50 1.694598 0.433414
2015-01-26 NaN NaN 0.000000 8835.60 8761.40 0.389852 1.694598
2015-01-27 8910.50 NaN 0.008477 NaN 8835.60 0.365428 0.389852
2015-01-28 8914.30 3.80 0.000426 8910.50 NaN 0.846897 0.365428

In [26]:
price.dropna(inplace=True)

In [27]:
price


Out[27]:
Close diff1 pct1 lag1 lag2 lag3 lag4
Date
2015-01-08 8234.60 132.50 0.016354 8102.10 8127.35 -0.203086 1.345365
2015-01-09 8284.50 49.90 0.006060 8234.60 8102.10 -2.996395 -0.203086
2015-01-12 8323.00 38.50 0.004647 8284.50 8234.60 -0.310679 -2.996395
2015-01-13 8299.40 -23.60 -0.002836 8323.00 8284.50 1.635378 -0.310679
2015-01-14 8277.55 -21.85 -0.002633 8299.40 8323.00 0.605980 1.635378
... ... ... ... ... ... ... ...
2019-12-20 12271.80 12.10 0.000987 12259.70 12221.65 0.921275 -0.270959
2019-12-23 12262.75 -9.05 -0.000737 12271.80 12259.70 0.465680 0.921275
2019-12-24 12214.55 -48.20 -0.003931 12262.75 12271.80 0.311333 0.465680
2019-12-30 12255.85 10.05 0.000821 12245.80 12126.55 0.000000 -0.393060
2019-12-31 12168.45 -87.40 -0.007131 12255.85 12245.80 -0.720452 0.000000

1090 rows × 7 columns


In [28]:
date_column = price.reset_index().Date
price["year"] = price.index.year
price["month"] = price.index.month
price["day"] = price.index.day 
price["weekday"] = price.index.weekday

price.head(20)


Out[28]:
Close diff1 pct1 lag1 lag2 lag3 lag4 year month day weekday
Date
2015-01-08 8234.60 132.50 0.016354 8102.10 8127.35 -0.203086 1.345365 2015 1 8 3
2015-01-09 8284.50 49.90 0.006060 8234.60 8102.10 -2.996395 -0.203086 2015 1 9 4
2015-01-12 8323.00 38.50 0.004647 8284.50 8234.60 -0.310679 -2.996395 2015 1 12 0
2015-01-13 8299.40 -23.60 -0.002836 8323.00 8284.50 1.635378 -0.310679 2015 1 13 1
2015-01-14 8277.55 -21.85 -0.002633 8299.40 8323.00 0.605980 1.635378 2015 1 14 2
2015-01-15 8494.15 216.60 0.026167 8277.55 8299.40 0.464723 0.605980 2015 1 15 3
2015-01-16 8513.80 19.65 0.002313 8494.15 8277.55 -0.283552 0.464723 2015 1 16 4
2015-01-19 8550.70 36.90 0.004334 8513.80 8494.15 -0.263272 -0.283552 2015 1 19 0
2015-01-20 8695.60 144.90 0.016946 8550.70 8513.80 2.616716 -0.263272 2015 1 20 1
2015-01-21 8729.50 33.90 0.003899 8695.60 8550.70 0.231336 2.616716 2015 1 21 2
2015-01-22 8761.40 31.90 0.003654 8729.50 8695.60 0.433414 0.231336 2015 1 22 3
2015-01-23 8835.60 74.20 0.008469 8761.40 8729.50 1.694598 0.433414 2015 1 23 4
2015-01-29 8952.35 38.05 0.004268 8914.30 8910.50 0.000000 0.846897 2015 1 29 3
2015-01-30 8808.90 -143.45 -0.016024 8952.35 8914.30 0.847707 0.000000 2015 1 30 4
2015-02-02 8797.40 -11.50 -0.001305 8808.90 8952.35 0.042646 0.847707 2015 2 2 0
2015-02-03 8756.55 -40.85 -0.004643 8797.40 8808.90 0.426842 0.042646 2015 2 3 1
2015-02-04 8723.70 -32.85 -0.003751 8756.55 8797.40 -1.602373 0.426842 2015 2 4 2
2015-02-05 8711.70 -12.00 -0.001376 8723.70 8756.55 -0.130550 -1.602373 2015 2 5 3
2015-02-06 8661.05 -50.65 -0.005814 8711.70 8723.70 -0.464342 -0.130550 2015 2 6 4
2015-02-09 8526.35 -134.70 -0.015552 8661.05 8711.70 -0.375148 -0.464342 2015 2 9 0

In [29]:
price.Close.plot()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x12156d7d0>

In [30]:
X = price[["lag1", "lag2", "lag3", "lag4", "year", "month", "day", "weekday"]]
X.head()


Out[30]:
lag1 lag2 lag3 lag4 year month day weekday
Date
2015-01-08 8102.1 8127.35 -0.203086 1.345365 2015 1 8 3
2015-01-09 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4
2015-01-12 8284.5 8234.60 -0.310679 -2.996395 2015 1 12 0
2015-01-13 8323.0 8284.50 1.635378 -0.310679 2015 1 13 1
2015-01-14 8299.4 8323.00 0.605980 1.635378 2015 1 14 2

In [31]:
import numpy as np

In [32]:
y = np.log(price.Close)
y


Out[32]:
Date
2015-01-08    9.016100
2015-01-09    9.022142
2015-01-12    9.026778
2015-01-13    9.023939
2015-01-14    9.021302
                ...   
2019-12-20    9.415059
2019-12-23    9.414321
2019-12-24    9.410383
2019-12-30    9.413759
2019-12-31    9.406602
Name: Close, Length: 1090, dtype: float64

In [33]:
from sklearn import *

In [34]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)
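
Note: train_test_split shuffles rows by default, so training dates are drawn from across the whole period. For a purely chronological hold-out one could pass shuffle=False; a sketch:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, shuffle=False)   # keep the last 30% of dates for testing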

In [35]:
import xgboost as xgb

In [36]:
est = xgb.XGBRegressor(objective='reg:squarederror')

In [37]:
est.fit(X_train, y_train)


Out[37]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [38]:
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)


train rmse:  0.006766157722792414
train r2:  0.9989144182983775
test rmse:  0.01010136099230877
test r2:  0.9976744472076405
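
Note: y is log(Close), so the RMSE above is in log units. To report the error on the price scale the predictions can be exponentiated; a sketch:
print("test rmse (price scale): ",
      metrics.mean_squared_error(np.exp(y_test), np.exp(y_test_pred)) ** 0.5)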

In [39]:
pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a258f0510>

In [40]:
pd.DataFrame({"feature": X.columns, "importance": est.feature_importances_})


Out[40]:
feature importance
0 lag1 0.933634
1 lag2 0.037965
2 lag3 0.001306
3 lag4 0.001685
4 year 0.020283
5 month 0.002423
6 day 0.001648
7 weekday 0.001055

In [41]:
X_train.head()


Out[41]:
lag1 lag2 lag3 lag4 year month day weekday
Date
2019-11-20 11940.1 11884.5 0.196680 0.267304 2019 11 20 2
2017-10-05 9914.9 9859.5 0.000000 0.201148 2017 10 5 3
2018-07-23 11010.2 10957.1 -0.250726 0.651010 2018 7 23 0
2016-07-12 8467.9 8323.2 0.023393 0.000000 2016 7 12 1
2019-01-31 10651.8 10652.2 -1.103840 -0.638261 2019 1 31 3

In [42]:
est = xgb.XGBRegressor(objective='reg:squarederror' , booster= "gblinear")
est.fit(X_train, y_train)

y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)

pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()


train rmse:  0.4054536260770504
train r2:  nan
test rmse:  0.4108711882306158
test r2:  nan
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: RuntimeWarning: invalid value encountered in double_scalars
  import sys
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: RuntimeWarning: invalid value encountered in double_scalars
  if __name__ == '__main__':
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a258dab50>

In [43]:
lasso = linear_model.Lasso(alpha=0.001)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=3, include_bias=False)),
    ("std", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(alpha=0.001))
])

pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)

pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()


train rmse:  0.012110449269312613
train r2:  0.9965180723383401
test rmse:  0.012708091035329811
test r2:  0.9963168254487239
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a25921f10>

In [48]:
import matplotlib.pyplot as plt

In [54]:
plt.rcParams["figure.figsize"] = 15, 8

In [55]:
y.plot()


Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a26cc8290>

In [56]:
import scipy.stats

In [59]:
plt.rcParams["figure.figsize"] = 8, 8
scipy.stats.probplot(y, plot = plt);



In [60]:
import numpy as np

In [63]:
wn = np.random.normal(loc = np.mean(y), scale = np.std(y), size = len(y))

In [64]:
scipy.stats.probplot(wn, plot = plt);



In [66]:
plt.hist(wn, bins = 50)


Out[66]:
(array([ 1.,  0.,  0.,  1.,  1.,  1.,  3.,  1.,  7.,  4.,  9., 11.,  7.,
        23., 16., 15., 24., 29., 28., 42., 44., 45., 60., 52., 52., 55.,
        76., 55., 54., 44., 52., 42., 54., 35., 30., 23., 19., 15., 20.,
        10.,  8.,  3.,  6.,  3.,  4.,  2.,  2.,  0.,  0.,  2.]),
 array([8.63860475, 8.65862461, 8.67864448, 8.69866435, 8.71868421,
        8.73870408, 8.75872394, 8.77874381, 8.79876368, 8.81878354,
        8.83880341, 8.85882327, 8.87884314, 8.89886301, 8.91888287,
        8.93890274, 8.9589226 , 8.97894247, 8.99896234, 9.0189822 ,
        9.03900207, 9.05902193, 9.0790418 , 9.09906167, 9.11908153,
        9.1391014 , 9.15912127, 9.17914113, 9.199161  , 9.21918086,
        9.23920073, 9.2592206 , 9.27924046, 9.29926033, 9.31928019,
        9.33930006, 9.35931993, 9.37933979, 9.39935966, 9.41937952,
        9.43939939, 9.45941926, 9.47943912, 9.49945899, 9.51947885,
        9.53949872, 9.55951859, 9.57953845, 9.59955832, 9.61957819,
        9.63959805]),
 <a list of 50 Patch objects>)

In [68]:
pd.Series(wn).plot.kde()


Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a25cc0a90>

In [72]:
price["wn"] = wn

price.head()


Out[72]:
Close diff1 pct1 lag1 lag2 lag3 lag4 year month day weekday wn
Date
2015-01-08 8234.60 132.50 0.016354 8102.1 8127.35 -0.203086 1.345365 2015 1 8 3 8.923274
2015-01-09 8284.50 49.90 0.006060 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4 9.168817
2015-01-12 8323.00 38.50 0.004647 8284.5 8234.60 -0.310679 -2.996395 2015 1 12 0 9.207621
2015-01-13 8299.40 -23.60 -0.002836 8323.0 8284.50 1.635378 -0.310679 2015 1 13 1 9.106359
2015-01-14 8277.55 -21.85 -0.002633 8299.4 8323.00 0.605980 1.635378 2015 1 14 2 9.225113

In [74]:
plt.rcParams["figure.figsize"] = 16, 8

In [75]:
price.wn.plot()


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a28685d50>

In [76]:
price.Close.plot()


Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a295d2650>

In [77]:
import statsmodels.tsa.stattools as sts

In [78]:
sts.adfuller(price.Close)  # p-value is 0.9, hence the series is non-stationary


Out[78]:
(-0.44553410431784485,
 0.9022141422868833,
 0,
 1089,
 {'1%': -3.436369082756128,
  '5%': -2.8641976875421524,
  '10%': -2.5681850407995137},
 12553.100879596106)

In [81]:
sts.adfuller(price.wn) #p-value is 0.0, hence the series is stationary


Out[81]:
(-33.036002048359606,
 0.0,
 0,
 1089,
 {'1%': -3.436369082756128,
  '5%': -2.8641976875421524,
  '10%': -2.5681850407995137},
 -1021.1168020780779)
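
Note: adfuller returns a plain tuple (test statistic, p-value, lags used, number of observations, critical values, information criterion). Unpacking it makes the result easier to read; a sketch:
adf_stat, p_value, used_lags, n_obs, crit_values, icbest = sts.adfuller(price.wn)
print("ADF statistic:", adf_stat, " p-value:", p_value)
print("critical values:", crit_values)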

In [88]:
airlines = pd.read_csv("/data/airline-passengers.csv")
airlines.index = pd.to_datetime(airlines.Month)
airlines = airlines[["Passengers"]]
airlines = airlines.asfreq("m", method= "ffill")
airlines.head()


Out[88]:
Passengers
Month
1949-01-31 112
1949-02-28 118
1949-03-31 132
1949-04-30 129
1949-05-31 121

In [89]:
airlines.plot()


Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2f4cddd0>

In [90]:
from statsmodels.tsa.seasonal import seasonal_decompose
s_decom_additive = seasonal_decompose(airlines.Passengers, model = "additive")
s_decom_additive.plot()


Out[90]:
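
Note: the decomposition result also exposes its components directly, so each can be inspected or plotted on its own; a sketch:
s_decom_additive.trend.plot()      # smoothed trend component
s_decom_additive.seasonal.plot()   # repeating seasonal pattern
s_decom_additive.resid.plot()      # remainder after removing trend and seasonality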

In [95]:
price2 = price.copy()
price2 = price2.asfreq("d", method = "ffill")
s_decom_additive = seasonal_decompose(price2.Close, model = "additive")
s_decom_additive.plot()


Out[95]:

In [96]:
price2 = price.copy()
price2 = price2.asfreq("d", method = "ffill")
s_decom_additive = seasonal_decompose(price2.wn, model = "additive")
s_decom_additive.plot()


Out[96]:

In [97]:
import statsmodels.graphics.tsaplots as sgt
sgt.plot_acf(price2.Close, lags=40, zero=False)
plt.title("ACF - Nifty 50 CLOSE")


Out[97]:
Text(0.5, 1.0, 'ACF - Nifty 50 CLOSE')

In [98]:
import statsmodels.graphics.tsaplots as sgt
sgt.plot_acf(price2.wn, lags=40, zero=False)
plt.title("ACF - Nifty 50 CLOSE")


Out[98]:
Text(0.5, 1.0, 'ACF - Nifty 50 CLOSE')

In [100]:
import statsmodels.graphics.tsaplots as sgt
sgt.plot_acf(airlines.Passengers, lags=40, zero=False)


Out[100]:

In [101]:
sgt.plot_pacf(price2.Close, lags=40, zero=False, method = ("ols"))


Out[101]:

In [102]:
sgt.plot_pacf(airlines.Passengers, lags=40, zero=False, method = ("ols"))


Out[102]:

In [103]:
sgt.plot_pacf(price2.wn, lags=40, zero=False, method = ("ols"))


Out[103]:

In [105]:
def to_float(v):
    # coerce to float; non-numeric entries become None (NaN) and are dropped below
    try:
        return float(v)
    except (ValueError, TypeError):
        return None
    
    
ftse = pd.read_csv("/data/FTSE.csv")
ftse.index = pd.to_datetime(ftse.Date)
ftse = ftse[["Adj Close"]]
ftse.columns = ["Close"]
ftse.Close = ftse.Close.apply(to_float)
ftse = ftse.dropna()
ftse = ftse.sort_index().asfreq(freq='B', method = "ffill")
ftse.head()


Out[105]:
Close
Date
2000-01-04 6665.9
2000-01-05 6535.9
2000-01-06 6447.2
2000-01-07 6504.8
2000-01-10 6607.7
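
Note: an equivalent, shorter way to coerce the non-numeric Close entries is pd.to_numeric; a sketch:
ftse["Close"] = pd.to_numeric(ftse["Close"], errors="coerce")   # unparseable values become NaN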

In [107]:
ftse.Close.plot()


Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a32d48b90>

In [108]:
sgt.plot_pacf(ftse.Close, lags=40, zero=False, method = ("ols"))


Out[108]:

In [109]:
sts.adfuller(price2.Close)


Out[109]:
(-0.4362621268410783,
 0.9038659836735699,
 5,
 1813,
 {'1%': -3.433962014811523,
  '5%': -2.86313550332995,
  '10%': -2.5676193929209554},
 20167.42749086323)

In [110]:
price2.head()


Out[110]:
Close diff1 pct1 lag1 lag2 lag3 lag4 year month day weekday wn
Date
2015-01-08 8234.6 132.5 0.016354 8102.1 8127.35 -0.203086 1.345365 2015 1 8 3 8.923274
2015-01-09 8284.5 49.9 0.006060 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4 9.168817
2015-01-10 8284.5 49.9 0.006060 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4 9.168817
2015-01-11 8284.5 49.9 0.006060 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4 9.168817
2015-01-12 8323.0 38.5 0.004647 8284.5 8234.60 -0.310679 -2.996395 2015 1 12 0 9.207621

In [115]:
price["returns"] = price.Close.pct_change(1).mul(100)
price = price.asfreq('b', method = "ffill")
price.head()


Out[115]:
Close diff1 pct1 lag1 lag2 lag3 lag4 year month day weekday wn returns
Date
2015-01-08 8234.60 132.50 0.016354 8102.1 8127.35 -0.203086 1.345365 2015 1 8 3 8.923274 NaN
2015-01-09 8284.50 49.90 0.006060 8234.6 8102.10 -2.996395 -0.203086 2015 1 9 4 9.168817 0.605980
2015-01-12 8323.00 38.50 0.004647 8284.5 8234.60 -0.310679 -2.996395 2015 1 12 0 9.207621 0.464723
2015-01-13 8299.40 -23.60 -0.002836 8323.0 8284.50 1.635378 -0.310679 2015 1 13 1 9.106359 -0.283552
2015-01-14 8277.55 -21.85 -0.002633 8299.4 8323.00 0.605980 1.635378 2015 1 14 2 9.225113 -0.263272

In [116]:
price.index


Out[116]:
DatetimeIndex(['2015-01-08', '2015-01-09', '2015-01-12', '2015-01-13',
               '2015-01-14', '2015-01-15', '2015-01-16', '2015-01-19',
               '2015-01-20', '2015-01-21',
               ...
               '2019-12-18', '2019-12-19', '2019-12-20', '2019-12-23',
               '2019-12-24', '2019-12-25', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', name='Date', length=1299, freq='B')

In [120]:
returns = price["returns"][1:]
sts.adfuller(returns)


Out[120]:
(-21.342406124427804,
 0.0,
 1,
 1296,
 {'1%': -3.435405786614854,
  '5%': -2.86377270896149,
  '10%': -2.567958709443111},
 3292.290317576155)

In [121]:
sgt.plot_acf(returns, lags=40, zero=False)


Out[121]:

In [122]:
sgt.plot_pacf(returns, lags=40, zero=False, method = ("ols"))


Out[122]:

In [123]:
from statsmodels.tsa.arima_model import ARMA

In [129]:
model1 = ARMA(returns, order=(1, 0))
fit1 = model1.fit()
print(fit1.summary())


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                returns   No. Observations:                 1298
Model:                     ARMA(1, 0)   Log Likelihood               -1680.621
Method:                       css-mle   S.D. of innovations              0.883
Date:                Mon, 06 Apr 2020   AIC                           3367.242
Time:                        15:18:48   BIC                           3382.748
Sample:                    01-09-2015   HQIC                          3373.060
                         - 12-31-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0557      0.029      1.913      0.056      -0.001       0.113
ar.L1.returns     0.1573      0.027      5.739      0.000       0.104       0.211
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            6.3568           +0.0000j            6.3568            0.0000
-----------------------------------------------------------------------------

In [130]:
model1 = ARMA(returns, order=(2, 0))
fit1 = model1.fit()
print(fit1.summary())


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                returns   No. Observations:                 1298
Model:                     ARMA(2, 0)   Log Likelihood               -1675.423
Method:                       css-mle   S.D. of innovations              0.880
Date:                Mon, 06 Apr 2020   AIC                           3358.846
Time:                        15:18:57   BIC                           3379.521
Sample:                    01-09-2015   HQIC                          3366.604
                         - 12-31-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0557      0.032      1.751      0.080      -0.007       0.118
ar.L1.returns     0.1432      0.028      5.182      0.000       0.089       0.197
ar.L2.returns     0.0893      0.028      3.231      0.001       0.035       0.143
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            2.6392           +0.0000j            2.6392            0.0000
AR.2           -4.2433           +0.0000j            4.2433            0.5000
-----------------------------------------------------------------------------

In [131]:
len(returns)


Out[131]:
1298

In [132]:
train = returns[:1250]
test = returns[1250:]

In [133]:
model1 = ARMA(train, order=(2, 0))
fit1 = model1.fit()
print(fit1.summary())


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                returns   No. Observations:                 1250
Model:                     ARMA(2, 0)   Log Likelihood               -1626.397
Method:                       css-mle   S.D. of innovations              0.889
Date:                Mon, 06 Apr 2020   AIC                           3260.793
Time:                        15:24:48   BIC                           3281.317
Sample:                    01-09-2015   HQIC                          3268.509
                         - 10-24-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0547      0.033      1.665      0.096      -0.010       0.119
ar.L1.returns     0.1466      0.028      5.205      0.000       0.091       0.202
ar.L2.returns     0.0880      0.028      3.124      0.002       0.033       0.143
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            2.6392           +0.0000j            2.6392            0.0000
AR.2           -4.3054           +0.0000j            4.3054            0.5000
-----------------------------------------------------------------------------

In [135]:
results = fit1.forecast(steps = 48)
results


Out[135]:
(array([-0.00045579, -0.018028  ,  0.03916037,  0.04599959,  0.05203529,
         0.05352221,  0.05427141,  0.05451212,  0.05461335,  0.05464938,
         0.05466357,  0.05466882,  0.05467084,  0.0546716 ,  0.05467188,
         0.05467199,  0.05467204,  0.05467205,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206,  0.05467206,  0.05467206,
         0.05467206,  0.05467206,  0.05467206]),
 array([0.88884016, 0.89834481, 0.90360232, 0.90396892, 0.90405314,
        0.90406233, 0.9040639 , 0.9040641 , 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413, 0.90406413, 0.90406413,
        0.90406413, 0.90406413, 0.90406413]),
 array([[-1.74255048,  1.74163891],
        [-1.77875147,  1.74269547],
        [-1.73186762,  1.81018837],
        [-1.72574693,  1.81774611],
        [-1.71987631,  1.8239469 ],
        [-1.7184074 ,  1.82545181],
        [-1.71766127,  1.82620408],
        [-1.71742095,  1.82644519],
        [-1.71731978,  1.82654648],
        [-1.71728377,  1.82658252],
        [-1.71726958,  1.82659671],
        [-1.71726432,  1.82660196],
        [-1.71726231,  1.82660398],
        [-1.71726155,  1.82660474],
        [-1.71726126,  1.82660503],
        [-1.71726115,  1.82660514],
        [-1.71726111,  1.82660518],
        [-1.71726109,  1.82660519],
        [-1.71726109,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ],
        [-1.71726108,  1.8266052 ]]))
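
Note: forecast(steps=48) returns three arrays: point forecasts, standard errors, and confidence intervals. Wrapping them in a DataFrame indexed by the hold-out dates makes them easier to compare with the actual returns; a sketch (assuming test keeps its DatetimeIndex):
fc, se, conf_int = results
forecast_df = pd.DataFrame({"forecast": fc,
                            "lower": conf_int[:, 0],
                            "upper": conf_int[:, 1]}, index=test.index)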

In [139]:
plt.plot(range(len(results[0])), results[0])
plt.fill_between(range(len(results[0])), results[0] + results[1], results[0] - results[1], alpha = 0.3)


Out[139]:
<matplotlib.collections.PolyCollection at 0x1a2c33ec90>

In [166]:
# walk-forward validation: refit ARMA(2,0) on all data seen so far,
# forecast one step ahead, then append the actual observation
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(2,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )

pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()


predicted=-0.000456, expected=0.011224
predicted=-0.016312, expected=0.011224
predicted=0.044510, expected=0.011224
predicted=0.044483, expected=0.011224
predicted=0.044456, expected=2.534121
predicted=0.416143, expected=0.110714
predicted=0.282853, expected=0.426387
predicted=0.115629, expected=-0.201821
predicted=0.051761, expected=0.409912
predicted=0.085515, expected=0.384421
predicted=0.136213, expected=-0.864965
predicted=-0.048621, expected=0.044507
predicted=-0.026526, expected=0.044507
predicted=0.053739, expected=0.044507
predicted=0.053732, expected=0.044507
predicted=0.053724, expected=-0.151090
predicted=0.025148, expected=-0.092052
predicted=0.016391, expected=0.467836
predicted=0.103290, expected=0.494133
predicted=0.156753, expected=-0.255853
predicted=0.049669, expected=-0.451188
predicted=-0.044985, expected=1.337457
predicted=0.198509, expected=-0.298582
predicted=0.117952, expected=0.523356
predicted=0.093120, expected=0.416918
predicted=0.150505, expected=-0.782642
predicted=-0.032357, expected=-0.065113
predicted=-0.034293, expected=-0.448200
predicted=-0.026897, expected=0.408531
predicted=0.062928, expected=-0.205925
predicted=0.049875, expected=-0.806264
predicted=-0.091228, expected=0.134211
predicted=-0.008484, expected=-0.676021
predicted=-0.042715, expected=0.449953
predicted=0.047459, expected=0.517626
predicted=0.157004, expected=0.959755
predicted=0.227369, expected=-0.270959
predicted=0.089195, expected=0.921275
predicted=0.151588, expected=0.465680
predicted=0.192735, expected=0.311333
predicted=0.130336, expected=0.098697
predicted=0.086146, expected=-0.073746
predicted=0.042438, expected=-0.393060
predicted=-0.018957, expected=-0.393060
predicted=-0.047677, expected=-0.393060
predicted=-0.048060, expected=-0.393060
predicted=-0.048442, expected=0.338121
predicted=0.056920, expected=-0.713129
RMSE: 0.5941342649382474
Out[166]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2c8fd5d0>

In [149]:
fit1.resid.plot()
plt.title("ARMA residual: mean: %.f, std: %.2f" % (np.mean(fit1.resid), np.std(fit1.resid)))


Out[149]:
Text(0.5, 1.0, 'ARMA residual: mean: -0, std: 0.89')

In [150]:
sts.adfuller(fit1.resid)


Out[150]:
(-35.293703121496016,
 0.0,
 0,
 1249,
 {'1%': -3.4355964295197743,
  '5%': -2.863856825923603,
  '10%': -2.5680035060041626},
 3192.2777559725373)

In [152]:
sgt.plot_acf(fit1.resid, lags=40, zero=False)


Out[152]:

In [151]:
sgt.plot_pacf(fit1.resid, lags=40, zero=False, method = ("ols"))


Out[151]:

In [153]:
sgt.plot_acf(train, lags=40, zero=False)


Out[153]:

In [158]:
model1 = ARMA(train, order=(0, 2))
fit1 = model1.fit()
print(fit1.summary())


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                returns   No. Observations:                 1250
Model:                     ARMA(0, 2)   Log Likelihood               -1626.924
Method:                       css-mle   S.D. of innovations              0.889
Date:                Mon, 06 Apr 2020   AIC                           3261.848
Time:                        16:42:38   BIC                           3282.372
Sample:                    01-09-2015   HQIC                          3269.563
                         - 10-24-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0546      0.031      1.742      0.082      -0.007       0.116
ma.L1.returns     0.1428      0.028      5.080      0.000       0.088       0.198
ma.L2.returns     0.1049      0.028      3.780      0.000       0.051       0.159
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
MA.1           -0.6803           -3.0110j            3.0869           -0.2854
MA.2           -0.6803           +3.0110j            3.0869            0.2854
-----------------------------------------------------------------------------

In [165]:
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(0,2))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )

pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()


predicted=-0.016787, expected=0.011224
predicted=-0.030987, expected=0.011224
predicted=0.063664, expected=0.011224
predicted=0.051608, expected=0.011224
predicted=0.043370, expected=2.534121
predicted=0.407694, expected=0.110714
predicted=0.275473, expected=0.426387
predicted=0.047242, expected=-0.201821
predicted=0.037161, expected=0.409912
predicted=0.083687, expected=0.384421
predicted=0.139139, expected=-0.864965
predicted=-0.054122, expected=0.044507
predicted=-0.034669, expected=0.044507
predicted=0.078040, expected=0.044507
predicted=0.060054, expected=0.044507
predicted=0.050804, expected=-0.151090
predicted=0.026148, expected=-0.092052
predicted=0.018368, expected=0.467836
predicted=0.107794, expected=0.494133
predicted=0.158629, expected=-0.255853
predicted=0.038325, expected=-0.451188
predicted=-0.056313, expected=1.337457
predicted=0.203202, expected=-0.298582
predicted=0.131131, expected=0.523356
predicted=0.060061, expected=0.416918
predicted=0.148399, expected=-0.782642
predicted=-0.036266, expected=-0.065113
predicted=-0.043685, expected=-0.448200
predicted=-0.003079, expected=0.408531
predicted=0.072218, expected=-0.205925
predicted=0.060591, expected=-0.806264
predicted=-0.094279, expected=0.134211
predicted=-0.002088, expected=-0.676021
predicted=-0.014754, expected=0.449953
predicted=0.050017, expected=0.517626
predicted=0.169618, expected=0.959755
predicted=0.216049, expected=-0.270959
predicted=0.071055, expected=0.921275
predicted=0.124192, expected=0.465680
predicted=0.193962, expected=0.311333
predicted=0.109759, expected=0.098697
predicted=0.068450, expected=-0.073746
predicted=0.036630, expected=-0.393060
predicted=-0.017539, expected=-0.393060
predicted=-0.040542, expected=-0.393060
predicted=-0.032078, expected=-0.393060
predicted=-0.031267, expected=0.338121
predicted=0.070079, expected=-0.713129
RMSE: 0.5945495867167389
Out[165]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a39feedd0>

In [163]:
model1 = ARMA(train, order=(1, 1))
fit1 = model1.fit()
print(fit1.summary())


                              ARMA Model Results                              
==============================================================================
Dep. Variable:                returns   No. Observations:                 1250
Model:                     ARMA(1, 1)   Log Likelihood               -1627.669
Method:                       css-mle   S.D. of innovations              0.890
Date:                Mon, 06 Apr 2020   AIC                           3263.339
Time:                        16:45:18   BIC                           3283.863
Sample:                    01-09-2015   HQIC                          3271.054
                         - 10-24-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0547      0.033      1.659      0.097      -0.010       0.119
ar.L1.returns     0.4951      0.101      4.891      0.000       0.297       0.694
ma.L1.returns    -0.3386      0.108     -3.127      0.002      -0.551      -0.126
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            2.0197           +0.0000j            2.0197            0.0000
MA.1            2.9530           +0.0000j            2.9530            0.0000
-----------------------------------------------------------------------------

In [167]:
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(1,1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )

pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()


predicted=-0.012180, expected=0.011224
predicted=0.025263, expected=0.011224
predicted=0.037920, expected=0.011224
predicted=0.042189, expected=0.011224
predicted=0.043618, expected=2.534121
predicted=0.440683, expected=0.110714
predicted=0.194827, expected=0.426387
predicted=0.161720, expected=-0.201821
predicted=0.051905, expected=0.409912
predicted=0.110389, expected=0.384421
predicted=0.126416, expected=-0.864965
predicted=-0.063234, expected=0.044507
predicted=0.013954, expected=0.044507
predicted=0.040184, expected=0.044507
predicted=0.049102, expected=0.044507
predicted=0.052131, expected=-0.151090
predicted=0.022640, expected=-0.092052
predicted=0.021730, expected=0.467836
predicted=0.108708, expected=0.494133
predicted=0.142614, expected=-0.255853
predicted=0.037233, expected=-0.451188
predicted=-0.029204, expected=1.337457
predicted=0.226368, expected=-0.298582
predicted=0.059726, expected=0.523356
predicted=0.130133, expected=0.416918
predicted=0.137987, expected=-0.782642
predicted=-0.044769, expected=-0.065113
predicted=0.003303, expected=-0.448200
predicted=-0.039566, expected=0.408531
predicted=0.078031, expected=-0.205925
predicted=0.023487, expected=-0.806264
predicted=-0.088064, expected=0.134211
predicted=0.018664, expected=-0.676021
predicted=-0.069903, expected=0.449953
predicted=0.073111, expected=0.517626
predicted=0.132930, expected=0.959755
predicted=0.222085, expected=-0.270959
predicted=0.062698, expected=0.921275
predicted=0.191894, expected=0.465680
predicted=0.166610, expected=0.311333
predicted=0.134276, expected=0.098697
predicted=0.090408, expected=-0.073746
predicted=0.048700, expected=-0.393060
predicted=-0.014974, expected=-0.393060
predicted=-0.037111, expected=-0.393060
predicted=-0.044973, expected=-0.393060
predicted=-0.047953, expected=0.338121
predicted=0.064060, expected=-0.713129
RMSE: 0.5966373532590054
Out[167]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3a065590>

In [171]:
from statsmodels.tsa.arima_model import ARIMA


model1 = ARIMA(price.Close, order=(1,1,2))
fit1 = model1.fit()
print(fit1.summary())


                             ARIMA Model Results                              
==============================================================================
Dep. Variable:                D.Close   No. Observations:                 1298
Model:                 ARIMA(1, 1, 2)   Log Likelihood               -7514.880
Method:                       css-mle   S.D. of innovations             79.094
Date:                Mon, 06 Apr 2020   AIC                          15039.761
Time:                        16:51:28   BIC                          15065.604
Sample:                    01-09-2015   HQIC                         15049.458
                         - 12-31-2019                                         
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             3.0289      2.343      1.293      0.196      -1.562       7.620
ar.L1.D.Close    -0.6264      0.300     -2.087      0.037      -1.215      -0.038
ma.L1.D.Close     0.6654      0.299      2.223      0.026       0.079       1.252
ma.L2.D.Close     0.0701      0.028      2.495      0.013       0.015       0.125
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1           -1.5965           +0.0000j            1.5965            0.5000
MA.1           -1.8724           +0.0000j            1.8724            0.5000
MA.2           -7.6152           +0.0000j            7.6152            0.5000
-----------------------------------------------------------------------------

In [172]:
len(price.Close)


Out[172]:
1299

In [175]:
close_train =  price.Close[:1250]
close_test = price.Close[1250:]

history = [x for x in close_train]
predictions = list()
for t in range(len(close_test)):
    model = ARIMA(history,  order=(1,1,2))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )

pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()


predicted=11664.296622, expected=0.011224
predicted=-518.530336, expected=0.011224
predicted=-590.251610, expected=0.011224
predicted=227.677055, expected=0.011224
predicted=142.303991, expected=2.534121
predicted=106.316212, expected=0.110714
predicted=80.812260, expected=0.426387
predicted=66.342098, expected=-0.201821
predicted=55.053808, expected=0.409912
predicted=47.716518, expected=0.384421
predicted=41.391650, expected=-0.864965
predicted=34.906359, expected=0.044507
predicted=31.558207, expected=0.044507
predicted=27.818015, expected=0.044507
predicted=24.557649, expected=0.044507
predicted=21.673433, expected=-0.151090
predicted=19.137642, expected=-0.092052
predicted=16.602576, expected=0.467836
predicted=14.876032, expected=0.494133
predicted=13.195641, expected=-0.255853
predicted=10.649479, expected=-0.451188
predicted=8.911916, expected=1.337457
predicted=8.055824, expected=-0.298582
predicted=6.396853, expected=0.523356
predicted=6.114061, expected=0.416918
predicted=5.027117, expected=-0.782642
predicted=2.827564, expected=-0.065113
predicted=2.717160, expected=-0.448200
predicted=1.620712, expected=0.408531
predicted=1.806155, expected=-0.205925
predicted=0.558672, expected=-0.806264
predicted=-0.588690, expected=0.134211
predicted=-0.147710, expected=-0.676021
predicted=-1.428701, expected=0.449953
predicted=-0.682052, expected=0.517626
predicted=-1.017643, expected=0.959755
predicted=-0.944090, expected=-0.270959
predicted=-2.488129, expected=0.921275
predicted=-1.590446, expected=0.465680
predicted=-2.326213, expected=0.311333
predicted=-2.743949, expected=0.098697
predicted=-3.185691, expected=-0.073746
predicted=-3.579437, expected=-0.393060
predicted=-4.114301, expected=-0.393060
predicted=-4.302213, expected=-0.393060
predicted=-4.481455, expected=-0.393060
predicted=-4.643392, expected=0.338121
predicted=-4.057625, expected=-0.713129
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   4403         try:
-> 4404             return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
   4405         except KeyError as e1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.DatetimeEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 48

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
<ipython-input-175-81a965c7d23a> in <module>
     10     yhat = output[0]
     11     predictions.append(yhat)
---> 12     obs = test[t]
     13     history.append(obs)
     14     print('predicted=%f, expected=%f' % (yhat, obs))

/anaconda3/lib/python3.7/site-packages/pandas/core/series.py in __getitem__(self, key)
    869         key = com.apply_if_callable(key, self)
    870         try:
--> 871             result = self.index.get_value(self, key)
    872 
    873             if not is_scalar(result):

/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/datetimes.py in get_value(self, series, key)
    649 
    650         try:
--> 651             value = Index.get_value(self, series, key)
    652         except KeyError:
    653             try:

/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   4408 
   4409             try:
-> 4410                 return libindex.get_value_at(s, key)
   4411             except IndexError:
   4412                 raise

pandas/_libs/index.pyx in pandas._libs.index.get_value_at()

pandas/_libs/index.pyx in pandas._libs.index.get_value_at()

pandas/_libs/util.pxd in pandas._libs.util.get_value_at()

pandas/_libs/util.pxd in pandas._libs.util.validate_indexer()

IndexError: index out of bounds
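
Note: the failure above comes from mixing two hold-out sets: the loop iterates over close_test (49 rows) but reads observations from test, the 48-row returns series, so the history gets polluted with returns (hence the huge forecasts against "expected" values near zero) and the final lookup runs out of bounds. A corrected sketch that stays on the Close series throughout:
history = [x for x in close_train]
predictions = list()
for t in range(len(close_test)):
    model = ARIMA(history, order=(1,1,2))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    obs = close_test.iloc[t]                 # take the actual Close, not the returns series
    history.append(obs)
error = metrics.mean_squared_error(close_test, predictions) ** 0.5
print(f"RMSE: {error}")
pd.DataFrame({"actual": close_test,
              "prediction": np.array(predictions).flatten()}).plot()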

In [ ]: