In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("/data/nifty50-index.csv")
df.head()
Out[2]:
In [3]:
df.index
Out[3]:
In [4]:
df.tail()
Out[4]:
In [5]:
pd.to_datetime(df.Date)
Out[5]:
In [6]:
df.index = pd.to_datetime(df.Date)
In [7]:
df.head()
Out[7]:
In [8]:
price = df[["Close"]]
In [9]:
price.head(10)
Out[9]:
In [10]:
price.index
Out[10]:
In [11]:
price = price.asfreq("d", method="ffill")
In [12]:
price.index
Out[12]:
In [13]:
price
Out[13]:
In [14]:
price.asfreq("h", method="ffill").head(50)
Out[14]:
In [15]:
price.asfreq("y", method="ffill").head(50)
Out[15]:
In [16]:
price.resample("1m").mean()
Out[16]:
In [17]:
price.resample("3m").mean()
Out[17]:
In [18]:
# First difference of the closing price (change versus the previous row).
# The Close column is differenced explicitly so the cell stays re-runnable:
# diffing the whole frame would also diff "diff1" once it exists, and that
# multi-column result cannot be stored in a single column.
price["diff1"] = price.Close.diff(1)
price.head(20)
Out[18]:
In [19]:
price = df[["Close"]]
price = price.asfreq("B")
price.head(20)
Out[19]:
In [20]:
price.index
Out[20]:
In [21]:
# First difference of the closing price on the business-day grid.
# The Close column is differenced explicitly so the cell stays re-runnable:
# diffing the whole frame would also diff "diff1" once it exists, and that
# multi-column result cannot be stored in a single column.
price["diff1"] = price.Close.diff(1)
price.head(20)
Out[21]:
In [22]:
# Manual one-period percentage change: (close_t - close_{t-1}) / close_{t-1}.
# The denominator has to be the previous close (shift(1)); dividing by the
# current close gives a different, non-standard quantity.
price["pct1"] = price.Close.diff(1) / price.Close.shift(1)
price.head(20)
Out[22]:
In [23]:
price["pct1"] = price.Close.pct_change(1)
price.head(20)
Out[23]:
In [24]:
# Lagged daily percentage returns (scaled to percent), one column per lag.
daily_pct = price.Close.pct_change(1)
for column, steps in (("lag1", 1), ("lag2", 2), ("lag3", 3), ("lag4", 4)):
    price[column] = daily_pct.shift(steps) * 100
price.head(20)
Out[24]:
In [25]:
# Overwrite lag1/lag2 with the closing price one and two rows earlier.
# NOTE(review): lag3/lag4 are not reassigned here, so they keep whatever an
# earlier cell stored in them (percentage returns in the cell above) —
# confirm that mixing price levels and returns as features is intended.
price["lag1"] = price.Close.shift(1)
price["lag2"] = price.Close.shift(2)
price.head(20)
Out[25]:
In [26]:
price.dropna(inplace=True)
In [27]:
price
Out[27]:
In [28]:
date_column = price.reset_index().Date
price["year"] = price.index.year
price["month"] = price.index.month
price["day"] = price.index.day
price["weekday"] = price.index.weekday
price.head(20)
Out[28]:
In [29]:
price.Close.plot()
Out[29]:
In [30]:
X = price[["lag1", "lag2", "lag3", "lag4", "year", "month", "day", "weekday"]]
X.head()
Out[30]:
In [31]:
import numpy as np
In [32]:
y = np.log(price.Close)
y
Out[32]:
In [33]:
from sklearn import *
In [34]:
# Chronological hold-out. The rows are time-ordered and the features are lags
# of the target, so a shuffled split would let the model train on days that
# surround the test days. shuffle=False keeps the first 70% of rows for
# training and the last 30% for testing (random_state is irrelevant then).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, shuffle=False
)
In [35]:
import xgboost as xgb
In [36]:
est = xgb.XGBRegressor(objective='reg:squarederror')
In [37]:
est.fit(X_train, y_train)
Out[37]:
In [38]:
# Score the fitted estimator on both splits.
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
# RMSE is the square root of the MSE. R^2 is reported as-is: taking its square
# root mislabels the metric and is not a real number when R^2 is negative.
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))
In [39]:
pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()
Out[39]:
In [40]:
pd.DataFrame({"feature": X.columns, "importance": est.feature_importances_})
Out[40]:
In [41]:
X_train.head()
Out[41]:
In [42]:
# Refit with XGBoost's "gblinear" booster and score it on both splits.
est = xgb.XGBRegressor(objective='reg:squarederror', booster="gblinear")
est.fit(X_train, y_train)
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
# RMSE is the square root of the MSE. R^2 is reported as-is: taking its square
# root mislabels the metric and is not a real number when R^2 is negative.
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))
# Actual versus predicted target on the test rows.
pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()
Out[42]:
In [43]:
# Degree-3 polynomial expansion -> standardisation -> Lasso regression.
# The estimator is created once and handed to the pipeline; previously a
# second, identical Lasso was built inline and `lasso` was never used.
lasso = linear_model.Lasso(alpha=0.001)
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=3, include_bias=False)),
    ("std", preprocessing.StandardScaler()),
    ("est", lasso)
])
pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
# RMSE is the square root of the MSE. R^2 is reported as-is: taking its square
# root mislabels the metric and is not a real number when R^2 is negative.
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))
# Actual versus predicted target on the test rows.
pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).plot()
Out[43]:
In [48]:
import matplotlib.pyplot as plt
In [54]:
plt.rcParams["figure.figsize"] = 15, 8
In [55]:
y.plot()
Out[55]:
In [56]:
import scipy.stats
In [59]:
plt.rcParams["figure.figsize"] = 8, 8
scipy.stats.probplot(y, plot = plt);
In [60]:
import numpy as np
In [63]:
# Gaussian white noise with the same mean, standard deviation and length as y,
# used as a reference series. A fixed seed keeps the draw, and every plot and
# test statistic derived from it, reproducible across kernel restarts.
wn = np.random.default_rng(42).normal(loc=np.mean(y), scale=np.std(y), size=len(y))
In [64]:
scipy.stats.probplot(wn, plot = plt);
In [66]:
plt.hist(wn, bins = 50)
Out[66]:
In [68]:
pd.Series(wn).plot.kde()
Out[68]:
In [72]:
price["wn"] = wn
price.head()
Out[72]:
In [74]:
plt.rcParams["figure.figsize"] = 16, 8
In [75]:
price.wn.plot()
Out[75]:
In [76]:
price.Close.plot()
Out[76]:
In [77]:
import statsmodels.tsa.stattools as sts
In [78]:
# Augmented Dickey-Fuller test on the closing price. The recorded p-value was
# about 0.9, so the unit-root null hypothesis is not rejected and the series
# is treated as non-stationary.
sts.adfuller(price.Close)
Out[78]:
In [81]:
# Augmented Dickey-Fuller test on the white-noise column. The recorded p-value
# was 0.0, so the unit-root null hypothesis is rejected and the series is
# treated as stationary.
sts.adfuller(price.wn)
Out[81]:
In [88]:
# Load the monthly airline-passenger counts and index them by month.
airlines = pd.read_csv("/data/airline-passengers.csv")
airlines.index = pd.to_datetime(airlines.Month)
airlines = airlines[["Passengers"]]
# "MS" (month start) rather than a month-end alias: assuming Month holds
# values such as "1949-01" that parse to the first day of the month (TODO
# confirm against the CSV), a month-end grid would move every observation to
# the end of its month and drop the final month, whose month-end lies after
# the last timestamp in the data.
airlines = airlines.asfreq("MS", method="ffill")
airlines.head()
Out[88]:
In [89]:
airlines.plot()
Out[89]:
In [90]:
from statsmodels.tsa.seasonal import seasonal_decompose
s_decom_additive = seasonal_decompose(airlines.Passengers, model = "additive")
s_decom_additive.plot()
Out[90]:
In [95]:
price2 = price.copy()
price2 = price2.asfreq("d", method = "ffill")
s_decom_additive = seasonal_decompose(price2.Close, model = "additive")
s_decom_additive.plot()
Out[95]:
In [96]:
price2 = price.copy()
price2 = price2.asfreq("d", method = "ffill")
s_decom_additive = seasonal_decompose(price2.wn, model = "additive")
s_decom_additive.plot()
Out[96]:
In [97]:
import statsmodels.graphics.tsaplots as sgt
sgt.plot_acf(price2.Close, lags=40, zero=False)
plt.title("ACF - Nifty 50 CLOSE")
Out[97]:
In [98]:
import statsmodels.graphics.tsaplots as sgt
# Autocorrelation of the white-noise column, titled accordingly so it is not
# mistaken for the Close-price ACF plotted in the cell before it.
sgt.plot_acf(price2.wn, lags=40, zero=False)
plt.title("ACF - White Noise")
Out[98]:
In [100]:
import statsmodels.graphics.tsaplots as sgt
sgt.plot_acf(airlines.Passengers, lags=40, zero=False)
Out[100]:
In [101]:
sgt.plot_pacf(price2.Close, lags=40, zero=False, method = ("ols"))
Out[101]:
In [102]:
sgt.plot_pacf(airlines.Passengers, lags=40, zero=False, method = ("ols"))
Out[102]:
In [103]:
sgt.plot_pacf(price2.wn, lags=40, zero=False, method = ("ols"))
Out[103]:
In [105]:
def to_float(v):
    """Convert ``v`` to ``float``, returning ``None`` when it cannot be converted.

    Only the errors ``float()`` raises for unusable input (``TypeError``,
    ``ValueError``, ``OverflowError``) are absorbed, so unrelated failures
    such as ``KeyboardInterrupt`` are no longer silently swallowed.
    """
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Unparseable value: signal "missing" explicitly instead of falling
        # off the end of the function.
        return None
# Read the FTSE file, index it by parsed date, and keep only the adjusted
# close under the column name "Close".
ftse = pd.read_csv("/data/FTSE.csv")
ftse = ftse.set_index(pd.to_datetime(ftse.Date))
ftse = ftse[["Adj Close"]].rename(columns={"Adj Close": "Close"})
# Coerce to float; values that cannot be converted become missing and are dropped.
ftse = ftse.assign(Close=ftse.Close.apply(to_float))
ftse = ftse.dropna().sort_index()
# Regular business-day grid, carrying the last known value forward.
ftse = ftse.asfreq(freq='B', method="ffill")
ftse.head()
Out[105]:
In [107]:
ftse.Close.plot()
Out[107]:
In [108]:
sgt.plot_pacf(ftse.Close, lags=40, zero=False, method = ("ols"))
Out[108]:
In [109]:
sts.adfuller(price2.Close)
Out[109]:
In [110]:
price2.head()
Out[110]:
In [115]:
price["returns"] = price.Close.pct_change(1).mul(100)
price = price.asfreq('b', method = "ffill")
price.head()
Out[115]:
In [116]:
price.index
Out[116]:
In [120]:
returns = price["returns"][1:]
sts.adfuller(returns)
Out[120]:
In [121]:
sgt.plot_acf(returns, lags=40, zero=False)
Out[121]:
In [122]:
sgt.plot_pacf(returns, lags=40, zero=False, method = ("ols"))
Out[122]:
In [123]:
from statsmodels.tsa.arima_model import ARMA
In [129]:
model1 = ARMA(returns, order=(1, 0))
fit1 = model1.fit()
print(fit1.summary())
In [130]:
model1 = ARMA(returns, order=(2, 0))
fit1 = model1.fit()
print(fit1.summary())
In [131]:
len(returns)
Out[131]:
In [132]:
train = returns[:1250]
test = returns[1250:]
In [133]:
model1 = ARMA(train, order=(2, 0))
fit1 = model1.fit()
print(fit1.summary())
In [135]:
results = fit1.forecast(steps = 48)
results
Out[135]:
In [139]:
plt.plot(range(len(results[0])), results[0])
plt.fill_between(range(len(results[0])), results[0] + results[1], results[0] - results[1], alpha = 0.3)
Out[139]:
In [166]:
# Walk-forward validation for ARMA(2, 0): refit on everything observed so far,
# forecast the next value, then append the true observation and repeat.
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(2,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    # Positional access: test is indexed by date, so a bare integer key
    # relies on pandas' deprecated positional fallback.
    obs = test.iloc[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )
pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()
Out[166]:
In [149]:
# Residuals of the fitted model, with their mean and standard deviation in
# the title. Both use two decimals; "%.f" rounded the mean to an integer.
fit1.resid.plot()
plt.title("ARMA residual: mean: %.2f, std: %.2f" % (np.mean(fit1.resid), np.std(fit1.resid)))
Out[149]:
In [150]:
sts.adfuller(fit1.resid)
Out[150]:
In [152]:
sgt.plot_acf(fit1.resid, lags=40, zero=False)
Out[152]:
In [151]:
sgt.plot_pacf(fit1.resid, lags=40, zero=False, method = ("ols"))
Out[151]:
In [153]:
sgt.plot_acf(train, lags=40, zero=False)
Out[153]:
In [158]:
model1 = ARMA(train, order=(0, 2))
fit1 = model1.fit()
print(fit1.summary())
In [165]:
# Walk-forward validation for ARMA(0, 2): refit on everything observed so far,
# forecast the next value, then append the true observation and repeat.
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(0,2))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    # Positional access: test is indexed by date, so a bare integer key
    # relies on pandas' deprecated positional fallback.
    obs = test.iloc[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )
pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()
Out[165]:
In [163]:
model1 = ARMA(train, order=(1, 1))
fit1 = model1.fit()
print(fit1.summary())
In [167]:
# Walk-forward validation for ARMA(1, 1): refit on everything observed so far,
# forecast the next value, then append the true observation and repeat.
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARMA(history, order=(1,1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    # Positional access: test is indexed by date, so a bare integer key
    # relies on pandas' deprecated positional fallback.
    obs = test.iloc[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = metrics.mean_squared_error(test, predictions) ** 0.5
print(f"RMSE: {error}" )
pd.DataFrame({"actual": test
              , "prediction": np.array(predictions).flatten()}).plot()
Out[167]:
In [171]:
from statsmodels.tsa.arima_model import ARIMA
model1 = ARIMA(price.Close, order=(1,1,2))
fit1 = model1.fit()
print(fit1.summary())
In [172]:
len(price.Close)
Out[172]:
In [175]:
# Walk-forward validation of ARIMA(1, 1, 2) on the closing price itself:
# refit on everything observed so far, forecast the next value, then append
# the true price and repeat.
close_train = price.Close[:1250]
close_test = price.Close[1250:]
history = [x for x in close_train]
predictions = list()
for t in range(len(close_test)):
    model = ARIMA(history, order=(1,1,2))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    # The value fed back must come from close_test (prices). `test` holds
    # percentage returns and does not line up with close_test in length, so
    # using it corrupted the history and broke the scoring below.
    obs = close_test.iloc[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
# Score and plot against the held-out prices, not the returns series.
error = metrics.mean_squared_error(close_test, predictions) ** 0.5
print(f"RMSE: {error}" )
pd.DataFrame({"actual": close_test
              , "prediction": np.array(predictions).flatten()}).plot()
In [ ]: