In [1]:
# Import the libraries we need: Pandas, NumPy and Matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
In [2]:
# Set some parameters to get good visuals - style to ggplot and size to 15,10
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 10)
In [3]:
# Read the CSV file of month-wise market arrivals (quantity) and prices
df = pd.read_csv('MonthWiseMarketArrivals_clean.csv')
In [4]:
# Convert the date column to datetime values
df.date = pd.DatetimeIndex(df.date)
In [5]:
# Set the index to a monthly PeriodIndex derived from the date column
df.index = pd.PeriodIndex(df.date, freq='M')
In [6]:
# Sort the data frame by date
df = df.sort_values(by = "date")
In [7]:
df.head()
Out[7]:
Get the priceMod for Bangalore Market
In [8]:
dfBang = df.loc[df.city == "BANGALORE"].copy()
In [9]:
dfBang.head()
Out[9]:
In [10]:
# Drop redundant columns
dfBang = dfBang.drop(["market", "month", "year", "state", "city", "priceMin", "priceMax"], axis = 1)
In [11]:
dfBang.head()
Out[11]:
In [12]:
dfBang.priceMod.plot()
Out[12]:
In [13]:
dfBang.quantity.plot()
Out[13]:
In [14]:
dfBang.priceMod.plot(kind = "hist", bins = 30)
Out[14]:
In [15]:
dfBang['priceModLog'] = np.log(dfBang.priceMod)
dfBang.head()
Out[15]:
In [16]:
dfBang.priceModLog.plot(kind = "hist", bins = 30)
Out[16]:
In [17]:
dfBang.priceModLog.plot()
Out[17]:
In [18]:
model_mean_pred = dfBang.priceModLog.mean()
In [19]:
# Let us store this as our mean prediction value (back-transformed from the log scale)
dfBang["priceMean"] = np.exp(model_mean_pred)
In [20]:
dfBang.plot(kind="line", x="date", y = ["priceMod", "priceMean"])
Out[20]:
Can we measure the error rate?
We will use Root Mean Squared Error (RMSE) to calculate our error values
$RMSE = \sqrt{\frac{\sum (\hat{y} - y)^2}{n}} $ , where $\hat{y}$ is the predicted value of $y$
In [21]:
def RMSE(predicted, actual):
    mse = (predicted - actual)**2
    rmse = np.sqrt(mse.sum()/mse.count())
    return rmse
In [22]:
model_mean_RMSE = RMSE(dfBang.priceMean, dfBang.priceMod)
model_mean_RMSE
Out[22]:
In [23]:
# Save this in a dataframe
dfBangResults = pd.DataFrame(columns = ["Model", "Forecast", "RMSE"])
dfBangResults.head()
Out[23]:
In [24]:
dfBangResults.loc[0,"Model"] = "Mean"
dfBangResults.loc[0,"Forecast"] = np.exp(model_mean_pred)
dfBangResults.loc[0,"RMSE"] = model_mean_RMSE
dfBangResults.head()
Out[24]:
In [25]:
dfBang.head()
Out[25]:
In [26]:
dfBang.dtypes
Out[26]:
In [27]:
# What is the starting month of our data
dfBang.date.min()
Out[27]:
In [28]:
# Convert the date into a timedelta measured from the first date (starting at zero)
dfBang["timeIndex"] = dfBang.date - dfBang.date.min()
In [29]:
dfBang.head()
Out[29]:
In [30]:
dfBang.dtypes
Out[30]:
In [31]:
# Convert the timedelta to months using np.timedelta64
dfBang["timeIndex"] = dfBang["timeIndex"]/np.timedelta64(1, 'M')
In [32]:
dfBang.timeIndex.head()
Out[32]:
In [33]:
# Round to the nearest whole month and cast to integer
dfBang["timeIndex"] = dfBang["timeIndex"].round(0).astype(int)
In [34]:
dfBang.timeIndex.tail()
Out[34]:
In [35]:
dfBang.head()
Out[35]:
In [36]:
## Fit a linear regression of priceModLog on timeIndex
model_linear = smf.ols('priceModLog ~ timeIndex', data = dfBang).fit()
In [37]:
model_linear.summary()
Out[37]:
In [38]:
## Parameters for y = mx + c equation
model_linear.params
Out[38]:
In [39]:
c = model_linear.params["Intercept"]
c
Out[39]:
In [40]:
m = model_linear.params["timeIndex"]
m
Out[40]:
In [41]:
model_linear_pred = model_linear.predict()
In [42]:
model_linear_pred
Out[42]:
In [43]:
# Plot the prediction line
dfBang.plot(kind="line", x="timeIndex", y = "priceModLog")
plt.plot(dfBang.timeIndex,model_linear_pred, '-')
Out[43]:
In [44]:
model_linear.resid.plot(kind = "bar")
Out[44]:
Is this a good model?
What measures can we check to see if the model is good?
It is seen here (and also evident on the regression line plot, if you look closely) that the linear trend model has a tendency to make an error of the same sign for many periods in a row. This tendency is measured in statistical terms by the lag-1 autocorrelation and Durbin-Watson statistic. If there is no time pattern, the lag-1 autocorrelation should be very close to zero, and the Durbin-Watson statistic ought to be very close to 2, which is not the case here. If the model has succeeded in extracting all the "signal" from the data, there should be no pattern at all in the errors: the error in the next period should not be correlated with any previous errors. The linear trend model obviously fails the autocorrelation test in this case.
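As a quick numerical check of that claim, the lag-1 autocorrelation of the residuals and the Durbin-Watson statistic can be computed directly (a minimal sketch using the statsmodels helper):
# Lag-1 autocorrelation of the residuals (should be near 0 for a well-specified model)
model_linear.resid.autocorr(lag=1)
# Durbin-Watson statistic (should be near 2 if there is no autocorrelation)
from statsmodels.stats.stattools import durbin_watson
durbin_watson(model_linear.resid)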
In [45]:
# Manual Calculation
model_linear_forecast_manual = m * 146 + c
model_linear_forecast_manual
Out[45]:
In [46]:
# Using Predict Function
model_linear_forecast_auto = model_linear.predict(exog = dict(timeIndex=146))
model_linear_forecast_auto
Out[46]:
In [47]:
dfBang["priceLinear"] = np.exp(model_linear_pred)
In [48]:
dfBang.head()
Out[48]:
In [49]:
# Root Mean Squared Error (RMSE)
model_linear_RMSE = RMSE(dfBang.priceLinear, dfBang.priceMod)
model_linear_RMSE
Out[49]:
In [50]:
dfBangResults.loc[1,"Model"] = "Linear"
dfBangResults.loc[1,"Forecast"] = np.exp(model_linear_forecast_manual)
dfBangResults.loc[1,"RMSE"] = model_linear_RMSE
dfBangResults.head()
Out[50]:
In [51]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear"])
Out[51]:
In [52]:
## Fit a linear regression of priceModLog on timeIndex and log(quantity)
model_linear_quantity = smf.ols('priceModLog ~ timeIndex + np.log(quantity)', data = dfBang).fit()
In [53]:
model_linear_quantity.summary()
Out[53]:
In [54]:
dfBang["priceLinearQuantity"] = np.exp(model_linear_quantity.predict())
In [55]:
dfBang.plot(kind = "line", x="timeIndex", y = "quantity")
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean",
"priceLinear", "priceLinearQuantity"])
Out[55]:
When faced with a time series that shows irregular growth, the best strategy may not be to try to directly predict the level of the series at each period (i.e., the quantity Yt). Instead, it may be better to try to predict the change that occurs from one period to the next (i.e., the quantity Yt - Yt-1). That is, it may be better to look at the first difference of the series, to see if a predictable pattern can be found there. For purposes of one-period-ahead forecasting, it is just as good to predict the next change as to predict the next level of the series, since the predicted change can be added to the current level to yield a predicted level. The simplest case of such a model is one that always predicts that the next change will be zero, as if the series is equally likely to go up or down in the next period regardless of what it has done in the past.
Random Walk Model $$ \hat{Y_t} = Y_{t-1} + \epsilon \\$$
Random Walk Model with drift $$ \hat{Y_t} = Y_{t-1} + c + \epsilon \\$$
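The cells below implement the plain random walk (without drift). As a sketch, the drift variant can be built by estimating $c$ as the mean one-step change of the log series (the drift estimate and the priceRandomDrift column are introduced here for illustration only):
# Estimate the drift c as the average month-on-month change of the log price
drift = dfBang.priceModLog.diff().mean()
# Random walk with drift: previous (log) value plus the estimated drift
dfBang["priceRandomDrift"] = np.exp(dfBang.priceModLog.shift() + drift)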
In [56]:
dfBang["priceModLogShift1"] = dfBang.priceModLog.shift()
In [57]:
dfBang.head()
Out[57]:
In [58]:
dfBang.plot(kind= "scatter", y = "priceModLog", x = "priceModLogShift1", s = 50)
Out[58]:
In [59]:
# Compute the one-month difference (first difference of the log series)
dfBang["priceModLogDiff"] = dfBang.priceModLog - dfBang.priceModLogShift1
In [60]:
dfBang.priceModLogDiff.plot()
Out[60]:
In [61]:
dfBang["priceRandom"] = np.exp(dfBang.priceModLogShift1)
dfBang.head()
Out[61]:
In [62]:
dfBang.tail()
Out[62]:
In [63]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod","priceRandom"])
Out[63]:
In [64]:
# Root Mean Squared Error (RMSE)
model_random_RMSE = RMSE(dfBang.priceRandom, dfBang.priceMod)
model_random_RMSE
Out[64]:
In [65]:
dfBangResults.loc[2,"Model"] = "Random"
dfBangResults.loc[2,"Forecast"] = np.exp(dfBang.priceModLogShift1[-1])
dfBangResults.loc[2,"RMSE"] = model_random_RMSE
dfBangResults.head()
Out[65]:
In [66]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom"])
Out[66]:
Most time series models work on the assumption that the time series is stationary. Intuitively, if a time series has exhibited a particular behaviour over time, there is a very high probability that it will continue to do so in the future. Also, the theory for stationary series is more mature and easier to implement than that for non-stationary series.
Statistical stationarity: A stationary time series is one whose statistical properties such as mean, variance, autocorrelation, etc. are all constant over time. Most statistical forecasting methods are based on the assumption that the time series can be rendered approximately stationary (i.e., "stationarized") through the use of mathematical transformations. A stationarized series is relatively easy to predict: you simply predict that its statistical properties will be the same in the future as they have been in the past!
There are three basic criteria for a series to be classified as a stationary series: a constant mean over time, a constant variance over time, and an autocovariance that does not depend on time (only on the lag).
The Augmented Dickey-Fuller test can be used to test for a unit root in a univariate process in the presence of serial correlation.
$$ Y_t = \rho Y_{t-1} + \epsilon_t \\$$
$$ Y_t - Y_{t-1} = (\rho - 1) Y_{t-1} + \epsilon_t \\$$
We have to test whether $\rho - 1$ is significantly different from zero. If the null hypothesis (that the series has a unit root, i.e. $\rho - 1 = 0$) is rejected, the time series is stationary.
In [67]:
def adf(ts):
    # Determine rolling statistics
    rolmean = ts.rolling(window=12).mean()
    rolstd = ts.rolling(window=12).std()
    # Plot rolling statistics
    orig = plt.plot(ts, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Calculate the ADF test statistics
    adftest = adfuller(ts, autolag='AIC')
    adfoutput = pd.Series(adftest[0:4], index=['Test Statistic', 'p-value', '# of Lags Used',
                                               'Number of Observations Used'])
    for key, value in adftest[4].items():
        adfoutput['Critical Value (%s)' % key] = value
    return adfoutput
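As a usage example, the helper can first be applied to the untransformed log series to check that it is non-stationary before any detrending (a quick sketch):
adf(dfBang.priceModLog.dropna())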
Let us understand what makes a time series non-stationary. The major reasons behind non-stationarity are trend and seasonality, which a classical decomposition makes explicit. An additive decomposition model is written as
$$ y_t = S_t + T_t + E_t \\$$
where $y_t$ is the data at period $t$, $S_t$ is the seasonal component at period $t$, $T_t$ is the trend-cycle component at period $t$ and $E_t$ is the remainder (or irregular or error) component at period $t$.
Alternatively, a multiplicative model would be written as
$$ y_t = S_t \times T_t \times E_t \\$$
The additive model is most appropriate if the magnitude of the seasonal fluctuations, or the variation around the trend-cycle, does not vary with the level of the time series. When the variation in the seasonal pattern, or the variation around the trend-cycle, appears to be proportional to the level of the time series, a multiplicative model is more appropriate. With economic time series, multiplicative models are common.
An alternative to using a multiplicative model is to first transform the data until the variation in the series appears to be stable over time, and then use an additive model. When a log transformation has been used, this is equivalent to a multiplicative decomposition because
$$ \log y_t = \log S_t + \log T_t + \log E_t \\$$
Sometimes the trend-cycle component is simply called the "trend" component, even though it may contain cyclic behaviour as well.
The underlying principle is to model or estimate the trend and seasonality in the series and remove those from the series to get a stationary series. Then statistical forecasting techniques can be implemented on this series. The final step would be to convert the forecasted values into the original scale by applying trend and seasonality constraints back.
As a first step in moving beyond mean models, random walk models and linear trend models, nonseasonal patterns and trends can be extrapolated using a moving-average or smoothing model. The basic assumption behind averaging and smoothing models is that the time series is locally stationary with a slowly varying mean. Hence, we take a moving (local) average to estimate the current value of the mean and then use that as the forecast for the near future. This can be considered a compromise between the mean model and the random-walk-without-drift model. The same strategy can be used to estimate and extrapolate a local trend. A moving average is often called a "smoothed" version of the original series because short-term averaging has the effect of smoothing out the bumps in the original series. By adjusting the degree of smoothing (the width of the moving average), we can hope to strike some kind of optimal balance between the performance of the mean and random walk models.
Simple Moving Average (SMA)
$$ \hat{y_t} = \frac{y_{t-1} + y_{t-2} + y_{t-3} + ... + y_{t-m}}{m} \\$$
In [68]:
# For smoothing the values we can use 12 month Moving Averages
dfBang['priceModLogMA12'] = dfBang.priceModLog.rolling(window=12).mean()
In [69]:
dfBang.plot(kind ="line", y=["priceModLogMA12", "priceModLog"])
Out[69]:
The long-term forecasts from the SMA model are a horizontal straight line, just as in the random walk model. Thus, the SMA model assumes that there is no trend in the data. However, whereas the forecasts from the random walk model are simply equal to the last observed value, the forecasts from the SMA model are equal to a weighted average of recent values.
In [70]:
dfBang["priceMA12"] = np.exp(dfBang.priceModLogMA12)
dfBang.tail()
Out[70]:
In [71]:
model_MA12_forecast = dfBang.priceModLog.tail(12).mean()
In [72]:
# Root Mean Squared Error (RMSE)
model_MA12_RMSE = RMSE(dfBang.priceMA12, dfBang.priceMod)
model_MA12_RMSE
Out[72]:
In [73]:
dfBangResults.loc[3,"Model"] = "Moving Average 12"
dfBangResults.loc[3,"Forecast"] = np.exp(model_MA12_forecast)
dfBangResults.loc[3,"RMSE"] = model_MA12_RMSE
dfBangResults.head()
Out[73]:
In [74]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear",
"priceRandom", "priceMA12"])
Out[74]:
In [75]:
# Test the remaining part for stationarity
ts = dfBang.priceModLog - dfBang.priceModLogMA12
ts.dropna(inplace = True)
adf(ts)
Out[75]:
Instead of weighting each observation equally, the simple exponential smoothing (SES) model gives more weight to recent observations and less to older ones. This is done using a smoothing parameter, alpha:
$$ \hat{y_t} = \alpha y_{t-1} + (1-\alpha)\hat{y_{t-1}} \\$$
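To connect this formula to the pandas call in the next cell, here is a minimal hand-rolled version of the same recursion (a sketch; pandas' exponentially weighted implementation differs in how it initialises and normalises the early observations):
def ses(series, alpha):
    # Simple exponential smoothing: y_hat[t] = alpha * y[t-1] + (1 - alpha) * y_hat[t-1]
    values = series.values
    smoothed = [values[0]]                      # seed the recursion with the first observation
    for t in range(1, len(values)):
        smoothed.append(alpha * values[t - 1] + (1 - alpha) * smoothed[-1])
    return pd.Series(smoothed, index=series.index)

# e.g. ses(dfBang.priceModLog, alpha=0.056)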
In [76]:
dfBang['priceModLogExp12'] = dfBang.priceModLog.ewm(halflife=12).mean()
In [77]:
halflife = 12
alpha = 1 - np.exp(np.log(0.5)/halflife)
alpha
Out[77]:
In [78]:
dfBang.plot(kind ="line", y=["priceModLogExp12", "priceModLog"])
Out[78]:
In [79]:
dfBang["priceExp12"] = np.exp(dfBang.priceModLogExp12)
dfBang.tail()
Out[79]:
In [80]:
# Root Mean Squared Error (RMSE)
model_Exp12_RMSE = RMSE(dfBang.priceExp12, dfBang.priceMod)
model_Exp12_RMSE
Out[80]:
In [81]:
y_exp = dfBang.priceModLog.iloc[-1]
y_exp
Out[81]:
In [82]:
y_for = dfBang.priceModLogExp12.iloc[-1]
y_for
Out[82]:
In [83]:
model_Exp12_forecast = alpha * y_exp + (1 - alpha) * y_for
In [84]:
dfBangResults.loc[4,"Model"] = "Exp Smoothing 12"
dfBangResults.loc[4,"Forecast"] = np.exp(model_Exp12_forecast)
dfBangResults.loc[4,"RMSE"] = model_Exp12_RMSE
dfBangResults.head()
Out[84]:
In [85]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear",
"priceRandom",
"priceMA12", "priceExp12"])
Out[85]:
In [86]:
# Test the remaining part for stationarity
ts = dfBang.priceModLog - dfBang.priceModLogExp12
ts.dropna(inplace = True)
adf(ts)
Out[86]:
One of the most common methods of dealing with both trend and seasonality is differencing. In this technique, we take the difference between the observation at a particular instant and the observation at the previous instant. This usually works well in improving stationarity. We have already computed the first-order difference earlier.
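The first difference mainly removes trend; a seasonal (lag-12) difference can be layered on top of it to target the yearly pattern as well (a sketch, the variable name is introduced here only for illustration):
# Seasonal difference: subtract the value of the differenced series from twelve months earlier
priceModLogDiff12 = dfBang.priceModLogDiff - dfBang.priceModLogDiff.shift(12)
adf(priceModLogDiff12.dropna())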
In [87]:
dfBang.priceModLogDiff.plot()
Out[87]:
In [88]:
# Test the remaining part for stationarity
ts = dfBang.priceModLogDiff
ts.dropna(inplace = True)
adf(ts)
Out[88]:
In [89]:
from statsmodels.tsa.seasonal import seasonal_decompose
In [90]:
dfBang.index = dfBang.index.to_timestamp()
In [91]:
dfBang.head()
Out[91]:
In [92]:
decomposition = seasonal_decompose(dfBang.priceModLog, model = "additive")
In [93]:
decomposition.plot()
Out[93]:
In [94]:
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
In [95]:
dfBang["priceDecomp"] = np.exp(trend + seasonal)
In [96]:
# Root Mean Squared Error (RMSE)
model_Decomp_RMSE = RMSE(dfBang.priceDecomp, dfBang.priceMod)
model_Decomp_RMSE
Out[96]:
In [97]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom",
"priceMA12", "priceExp12", "priceDecomp"])
Out[97]:
In [98]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod",
"priceDecomp"])
Out[98]:
In [99]:
# Test the remaining part for stationarity
ts = decomposition.resid
ts.dropna(inplace = True)
adf(ts)
Out[99]:
In an autoregression model, we forecast the variable of interest using a linear combination of past values of the variable. The term autoregression indicates that it is a regression of the variable against itself.
Thus an autoregressive model of order (p) can be written as
$$ y_t = c + m_1 y_{t-1} + m_2 y_{t-2} + \dots + m_p y_{t-p} + e_t \\$$
The random walk model is an AR(1) model with $$ m_1 = 1, c = 0 \\$$ and the random walk model with drift is an AR(1) model with $$ m_1 = 1, c \neq 0 \\$$
We normally restrict autoregressive models to stationary data, and then some constraints on the values of the parameters are required.
For an AR(1) model: $$ -1 < m_1 < 1 \\$$
For an AR(2) model:
$$ -1 < m_2 < 1, \quad m_1 + m_2 < 1, \quad m_2 - m_1 < 1 \\$$
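To see why these constraints matter, one can simulate a stationary AR(1) against a random walk ($m_1 = 1$) using plain NumPy (a toy sketch with made-up parameters):
# Simulate 200 steps of a stationary AR(1) (m1 = 0.6) and of a random walk (m1 = 1)
np.random.seed(42)
e = np.random.normal(size=200)
ar1, rw = [0.0], [0.0]
for t in range(1, 200):
    ar1.append(0.6 * ar1[-1] + e[t])   # shocks die out: the series reverts to its mean
    rw.append(rw[-1] + e[t])           # unit root: shocks accumulate and the series wanders
pd.DataFrame({"AR(1), m1=0.6": ar1, "Random walk, m1=1": rw}).plot()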
Rather than use past values of the forecast variable in a regression, a moving average model uses past forecast errors in a regression-like model.
$$ y_t = c + e_t + l_1 e_{t-1} + l_2 e_{t-2} + ... + l_q e_{t-q} \\$$
where $e_t$ is white noise. We refer to this as an MA(q) model. Of course, we do not observe the values of $e_t$, so it is not really a regression in the usual sense.
Notice that each value of $y_t$ can be thought of as a weighted moving average of the past few forecast errors. However, moving average models should not be confused with moving average smoothing. A moving average model is used for forecasting future values, while moving average smoothing is used for estimating the trend-cycle of past values.
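A similar toy simulation shows an MA(1) process, where a shock only influences the current and the next observation (again just a sketch with an arbitrary coefficient):
# Simulate an MA(1) process: y[t] = e[t] + 0.8 * e[t-1]
np.random.seed(42)
e = np.random.normal(size=200)
ma1 = pd.Series(e[1:] + 0.8 * e[:-1])
ma1.plot()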
If we combine differencing with autoregression and a moving average model, we obtain a non-seasonal ARIMA model. ARIMA is an acronym for AutoRegressive Integrated Moving Average ("integration" in this context is the reverse of differencing). The full model can be written as
$$ y'_t = c + m_1 y'_{t-1} + \dots + m_p y'_{t-p} + l_1 e_{t-1} + \dots + l_q e_{t-q} + e_t \\$$
where $y'_t$ is the differenced series (it may have been differenced more than once).
An important concern here is how to determine the values of p and q. We use two plots to determine these numbers. Let's discuss them first.
In an MA model, a noise shock quickly vanishes with time. In an AR model, a shock has a much longer-lasting effect.
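The cells below build the ACF and PACF bar charts by hand; as an optional alternative, statsmodels also ships plotting helpers that add confidence bands, which make the cut-off lags easier to read:
# ACF and PACF plots with confidence intervals (alternative to the manual bar charts below)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(dfBang.priceModLogDiff.dropna(), lags=20)
plot_pacf(dfBang.priceModLogDiff.dropna(), lags=20)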
In [100]:
ts = dfBang.priceModLog
ts_diff = dfBang.priceModLogDiff
ts_diff.dropna(inplace = True)
In [101]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf
In [102]:
lag_acf = acf(ts_diff, nlags=20)
In [103]:
lag_acf
Out[103]:
In [104]:
ACF = pd.Series(lag_acf)
In [105]:
ACF
Out[105]:
In [106]:
ACF.plot(kind = "bar")
Out[106]:
In [107]:
lag_pacf = pacf(ts_diff, nlags=20, method='ols')
In [108]:
PACF = pd.Series(lag_pacf)
In [109]:
PACF.plot(kind = "bar")
Out[109]:
In [110]:
from statsmodels.tsa.arima.model import ARIMA
In [111]:
ts_diff.head()
Out[111]:
In [112]:
# Fit an ARIMA(1,0,1) model on the differenced series
model_AR1MA = ARIMA(ts_diff, order=(1,0,1))
In [113]:
results_ARIMA = model_AR1MA.fit()
In [114]:
results_ARIMA.fittedvalues.head()
Out[114]:
In [115]:
ts_diff.plot()
results_ARIMA.fittedvalues.plot()
Out[115]:
In [116]:
ts_diff.sum()
Out[116]:
In [117]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.tail()
Out[117]:
In [118]:
predictions_ARIMA_diff.sum()
Out[118]:
In [119]:
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.tail()
Out[119]:
In [120]:
ts.iloc[0]
Out[120]:
In [121]:
predictions_ARIMA_log = pd.Series(ts.iloc[0], index=ts.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.tail()
Out[121]:
In [122]:
dfBang['priceARIMA'] = np.exp(predictions_ARIMA_log)
In [123]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceARIMA"])
Out[123]:
In [124]:
dfBang.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom",
"priceMA12", "priceExp12", "priceDecomp", "priceARIMA"])
Out[124]: