In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# download the dataset here:
## https://datamarket.com/data/set/22u3/international-airline-passengers-monthly-totals-in-thousands-jan-49-dec-60#!ds=22u3&display=line
In [3]:
np.random.seed(410)
In [4]:
series = pd.read_csv('international-airline-passengers.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)  # note: newer pandas removed squeeze=; call .squeeze("columns") on the result instead
plt.plot(series)
plt.show()
In [5]:
series.head()
Out[5]:
In [6]:
series.name = 'monthly_passengers'  # series is a pandas Series (squeeze=True), so set its name rather than columns
In [7]:
series.head()
Out[7]:
In [9]:
from statsmodels.tsa.arima_model import ARIMA  # note: newer statsmodels moved this to statsmodels.tsa.arima.model.ARIMA
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(series)
plt.show()
In [10]:
# Check stationarity
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=12,center=False).mean()
    rolstd = timeseries.rolling(window=12,center=False).std()
    # Plot rolling statistics
    plt.figure(figsize=(9,7))
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='green', label='Rolling Mean')
    std = plt.plot(rolstd, color='red', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
In [11]:
test_stationarity(series)
# not stationary
# the mean changes over time
# the test statistic is higher than every critical value
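The same read-out can be scripted instead of eyeballed from the printout (a minimal sketch, reusing series and the adfuller import from the cells above):
adf_stat, p_value, _, _, crit_values, _ = adfuller(series, autolag='AIC')
print('ADF statistic: %.4f  p-value: %.4f' % (adf_stat, p_value))
for level, crit in crit_values.items():
    # reject the unit-root null (i.e. treat as stationary) only if the statistic is below the critical value
    print('%s critical value %.4f -> %s' % (level, crit, 'stationary' if adf_stat < crit else 'not stationary'))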
In [12]:
# try to make the data stationary first
## reduce the trend by taking the log
import numpy as np
ts_log = np.log(series)
plt.figure(figsize=(9,7))
plt.plot(ts_log)
plt.show()
In [13]:
## differencing
## both series and ts_log show seasonality, so I will try differencing and decomposing
ts_log_diff = ts_log - ts_log.shift()
plt.figure(figsize=(9,7))
plt.plot(ts_log_diff)
plt.show()
In [14]:
ts_log_diff.head()
Out[14]:
In [15]:
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
# the test statistic is below the 10% critical value, so the unit-root null can be rejected with 90% confidence; the differenced data can be treated as stationary
In [16]:
# try forecasting to see how it works
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')
plt.figure(figsize=(20,10))
#Plot ACF:
## q – The lag value where the ACF chart crosses the upper confidence interval for the first time (in this case q=2)
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
## p – The lag value where the PACF chart crosses the upper confidence interval for the first time (in this case p=2)
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()
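The crossing points can also be located programmatically rather than read off the plots (a rough heuristic sketch, assuming lag_acf, lag_pacf and ts_log_diff from the cell above):
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
# first lag at which the correlation falls inside the 95% confidence band
q = next(lag for lag, v in enumerate(lag_acf) if abs(v) < conf_bound)
p = next(lag for lag, v in enumerate(lag_pacf) if abs(v) < conf_bound)
print('suggested order: p = %d, q = %d' % (p, q))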
In [17]:
from statsmodels.tsa.arima_model import ARIMA
# RSS (residual sum of squares) is computed on the residuals of the fit
# AR model
model = ARIMA(ts_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('Differencing RSS: %.4f'% sum((results_AR.fittedvalues-ts_log_diff)**2))
plt.show()
In [18]:
# MA Model
model = ARIMA(ts_log, order=(0, 1, 2))
results_MA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('Differencing RSS: %.4f'% sum((results_MA.fittedvalues-ts_log_diff)**2))
plt.show()
In [19]:
# combined model (AR + MA = ARIMA)
model = ARIMA(ts_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('Differencing RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log_diff)**2))
plt.show()
In [20]:
# Based on the RSS above, the combined (ARIMA) model fits the differenced series best; let's look at the forecasted results
## scale back to the original values and see how the combined model performs
# Step 1 - store the predicted results as a separate series
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())
In [21]:
# Step 2 - convert the differenced values back to the log scale
## add the differences consecutively to the base value
## an easy way is to take the cumulative sum at each index and add it to the base value
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print(predictions_ARIMA_diff_cumsum.head())
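A tiny worked example of the reconstruction idea, on made-up numbers rather than the airline data:
base = 4.70                                  # hypothetical first log value
log_vals = np.array([4.70, 4.77, 4.88, 4.86])
diffs = np.diff(log_vals)                    # [ 0.07,  0.11, -0.02]
rebuilt = base + np.cumsum(diffs)            # [ 4.77,  4.88,  4.86]
print(np.allclose(rebuilt, log_vals[1:]))    # True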
In [22]:
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()
Out[22]:
In [23]:
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.figure(figsize=(20,10))
plt.plot(series, color='red')
plt.plot(predictions_ARIMA, color='green')
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-series)**2)/len(series)))
plt.show()
In [24]:
## try decomposition and see whether it improves things
from statsmodels.tsa.seasonal import seasonal_decompose
# trend and seasonality are separated out from the data, and we model the residuals
decomposition = seasonal_decompose(ts_log)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.figure(figsize=(10,10))
plt.subplot(411)
plt.plot(ts_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
In [25]:
ts_log_decompose = residual
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)
# The test statistic is below even the 1% critical value, i.e. 99% confidence that the residuals are stationary
## but their rolling mean and variance look less stable
In [26]:
# try forecasting with the decomposed result
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_decompose, nlags=20)
lag_pacf = pacf(ts_log_decompose, nlags=20, method='ols')
plt.figure(figsize=(20,10))
#Plot ACF:
## q – The lag value where the ACF chart crosses the upper confidence interval for the first time (in this case q=2)
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_decompose)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_decompose)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
## p – The lag value where the PACF chart crosses the upper confidence interval for the first time (in this case p=2)
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_decompose)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_decompose)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()
In [27]:
# AR model
model = ARIMA(ts_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_decompose)
plt.plot(results_AR.fittedvalues, color='red')
result = (results_AR.fittedvalues-ts_log_decompose)**2
result.dropna(inplace=True)
plt.title('Decompose RSS: %.4f'% sum(result))
plt.show()
In [28]:
# MA Model
model = ARIMA(ts_log, order=(0, 1, 2))
results_MA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_decompose)
plt.plot(results_MA.fittedvalues, color='red')
result = (results_MA.fittedvalues-ts_log_decompose)**2
result.dropna(inplace=True)
plt.title('Decompose RSS: %.4f'% sum(result))
plt.show()
In [29]:
# combined model (AR + MA = ARIMA)
model = ARIMA(ts_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
plt.figure(figsize=(20,10))
plt.plot(ts_log_decompose)
plt.plot(results_ARIMA.fittedvalues, color='red')
result = (results_ARIMA.fittedvalues-ts_log_decompose)**2
result.dropna(inplace=True)
plt.title('Decompose RSS: %.4f'% sum(result))
plt.show()
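The three RSS numbers above could also be collected side by side in a small loop (a sketch, reusing ts_log and ts_log_decompose from the cells above):
for label, order in [('AR', (2, 1, 0)), ('MA', (0, 1, 2)), ('ARIMA', (2, 1, 2))]:
    fit = ARIMA(ts_log, order=order).fit(disp=-1)
    err = ((fit.fittedvalues - ts_log_decompose) ** 2).dropna()
    print('%s %s decompose RSS: %.4f' % (label, str(order), err.sum()))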
In [122]:
# It seems that the AR model on the decomposed series works best; let's try forecasting here
# So far it's quite challenging to add the seasonality and trend back to the predicted result,
## so I'm comparing the predicted result with the residuals here
predictions_AR = pd.Series(results_AR.fittedvalues, copy=True)
print(predictions_AR.head())
plt.figure(figsize=(20,10))
plt.plot(residual, color='red')
plt.plot(predictions_AR, color='green')
result = (predictions_AR-residual)**2
result.dropna(inplace=True)
plt.title('RMSE: %.4f'% np.sqrt(sum(result)/len(series)))
plt.show()
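One possible way to add the components back is sketched below (an illustration only; it relies on index alignment between trend, seasonal and the fitted values, and drops the NaN edges produced by the decomposition):
# add trend and seasonality back onto the predicted residuals, then undo the log transform
reconstructed_log = (trend + seasonal + predictions_AR).dropna()
reconstructed = np.exp(reconstructed_log)
plt.figure(figsize=(20,10))
plt.plot(series, color='red', label='original')
plt.plot(reconstructed, color='green', label='reconstructed fit')
plt.legend(loc='best')
plt.show()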
In [31]:
# LSTM, using the decomposed result (the residuals)
print(residual.head())
print()
print(residual.isnull().sum())
In [32]:
# split into training and testing data
training_size = int(len(residual)*0.67)
train_data = residual[0:training_size]
test_data = residual[training_size:len(residual)]
print(train_data.head())
print(test_data.head())
In [33]:
residual.values[4:10]
Out[33]:
In [34]:
# Now we need to create two numpy arrays X and Y
## X represents the value at time t, while Y is the value at time t+1
def look_forward_dataset(data, forward_step=1):
    X = []
    Y = []
    for i in range(len(data)-forward_step):
        X.append([data.iloc[i]])                # note the nested list: each sample is a 1-element row
        Y.append([data.iloc[i+forward_step]])
    return np.array(X), np.array(Y)
time_step = 1
train_X, train_Y = look_forward_dataset(train_data, time_step)
test_X, test_Y = look_forward_dataset(test_data, time_step)
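A quick illustration of what look_forward_dataset returns, on a toy series rather than the residuals:
toy = pd.Series([10.0, 11.0, 12.0, 13.0])
toy_X, toy_Y = look_forward_dataset(toy, forward_step=1)
print(toy_X)                      # values at time t:   [[10.] [11.] [12.]]
print(toy_Y)                      # values at time t+1: [[11.] [12.] [13.]]
print(toy_X.shape, toy_Y.shape)   # (3, 1) (3, 1)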
In [35]:
# LSTM expects its input in the format [samples, time steps, features]
## in this case, time steps is 1
train_X = np.reshape(train_X, train_X.shape + (1,))
test_X = np.reshape(test_X, test_X.shape + (1,))
train_X[4:10]
Out[35]:
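For reference, the reshaped arrays now follow the [samples, time steps, features] layout (shape check only; the exact sample counts depend on the 67/33 split above):
print(train_X.shape, train_Y.shape)   # e.g. (n_train, 1, 1) and (n_train, 1)
print(test_X.shape, test_Y.shape)     # e.g. (n_test, 1, 1) and (n_test, 1)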
In [36]:
# fit LSTM model
model = Sequential()
# 4 LSTM units; input_shape excludes the sample dimension, so it only needs (time steps, features)
model.add(LSTM(4, input_shape=train_X.shape[1:]))
model.add(Dense(1)) # bring the output down to a single value
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(train_X, train_Y, epochs=100, batch_size=1, verbose=2,
          validation_data=(test_X, test_Y)) # using the LSTM layer's default activations (tanh, with sigmoid gates)
Out[36]:
In [51]:
# make predictions
train_predict = model.predict(train_X)
test_predict = model.predict(test_X)
train_RMSE = math.sqrt(mean_squared_error(train_Y[:,0], train_predict[:,0]))
print('Train RMSE: %.7f' % (train_RMSE))
test_RMSE = math.sqrt(mean_squared_error(test_Y[:,0], test_predict[:,0]))
print('Test RMSE: %.7f' % (test_RMSE))
In [105]:
# To overlay the residuals with the predictions, reshape them into a 2-D (n_samples, 1) array
origin_values = residual.values
origin_data = []
for elem in origin_values:
origin_data.append([elem])
origin_data = np.array(origin_data)
origin_data[4:10]
Out[105]:
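An equivalent one-liner using numpy's reshape (same result as the loop above):
origin_data = residual.values.reshape(-1, 1)   # shape (n_samples, 1)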
In [116]:
train_predict_plot = np.empty_like(origin_data)
train_predict_plot[:, :] = np.nan
# training predictions cover the first part of the series
train_predict_plot[0:len(train_predict),:] = train_predict
test_predict_plot = np.empty_like(origin_data)
test_predict_plot[:, :] = np.nan
# offset the test predictions past the training portion (plus the samples lost to the look-forward windowing) so they line up on the time axis
test_predict_plot[len(train_predict)+2:len(origin_data),:] = test_predict
In [118]:
plt.plot(origin_data)
plt.plot(train_predict_plot)
plt.plot(test_predict_plot)
plt.show()