In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import r2_score
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size, in inches (width, height), for every figure created below.
plt.rcParams['figure.figsize']=(20,10)
# Apply matplotlib's bundled 'ggplot' style sheet to all subsequent plots.
plt.style.use('ggplot')
In [2]:
# Load the retail sales CSV, parse its 'date' column to datetimes and use it
# as the index. Built as one non-mutating chain so re-running the cell always
# starts from the file on disk rather than from a half-transformed frame.
sales_data = (
    pd.read_csv('retail_sales.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
In [3]:
# Preview the first rows to confirm the frame is indexed by the parsed 'date' column.
sales_data.head()
Out[3]:
In [4]:
# First look at the raw data: line plot of the frame's columns against the date index.
sales_data.plot()
Out[4]:
In [5]:
# Additive decomposition of the raw sales series into the components
# statsmodels reports for it (observed = trend + seasonal + residual).
decomposed = seasonal_decompose(sales_data['sales'], model='additive')
# The figure returned by plot() is bound to a throwaway name on purpose: left
# as the cell's final bare expression it would also be echoed as the cell's
# output, so the same figure would presumably be rendered twice.
x =decomposed.plot() # assignment only suppresses the duplicate figure output
In [6]:
# First-order differencing: each value minus the previous one. This adds a new
# 'stationary' column to sales_data; its first row is NaN because there is no
# prior value to subtract.
sales_data['stationary']=sales_data['sales'].diff()
In [7]:
# Confirm the new 'stationary' column is present (expect NaN in its first row from diff()).
sales_data.head()
Out[7]:
In [8]:
# Plot the differenced series to eyeball whether the trend seen in the raw data is gone.
sales_data['stationary'].plot()
Out[8]:
In [9]:
# Repeat the additive decomposition on the differenced series. dropna() removes
# the leading NaN left by diff() before the series is handed to statsmodels.
# Note: this rebinds `decomposed`, replacing the result for the raw series.
decomposed = seasonal_decompose(sales_data['stationary'].dropna(), model='additive')
# The figure returned by plot() is bound to a throwaway name on purpose: left
# as the cell's final bare expression it would also be echoed as the cell's
# output, so the same figure would presumably be rendered twice.
x =decomposed.plot() # assignment only suppresses the duplicate figure output
In [10]:
# Scatter of each observation against the previous one (lag 1); a tight
# diagonal cloud indicates strong autocorrelation in the raw sales series.
# `pd.tools.plotting` no longer exists in pandas; the supported public
# location of this helper is `pd.plotting`.
pd.plotting.lag_plot(sales_data['sales'])
Out[10]:
In [11]:
# Autocorrelation of the raw sales series across all lags, to see which lags
# carry signal. `pd.tools.plotting` no longer exists in pandas; the supported
# public location of this helper is `pd.plotting`.
pd.plotting.autocorrelation_plot(sales_data['sales'])
Out[11]:
In [12]:
# Pearson correlation between the sales series and itself shifted by 12 rows,
# i.e. the lag-12 autocorrelation.
sales_data['sales'].autocorr(lag=12)
Out[12]:
In [13]:
# Create train/test datasets from the differenced series.
# dropna() already removes the leading NaN produced by diff(), so the training
# window starts at the first remaining row; skipping one more row would throw
# away a valid observation.
X = sales_data['stationary'].dropna()
# Hold out the final 12 observations for evaluation (the same window the
# scoring cells select with .tail(12)) and train on everything before them.
# Positional .iloc slices are used so the split never depends on index labels;
# the previous test_data expression indexed X by its own values instead of
# slicing the last 12 rows.
train_data = X.iloc[:-12]
test_data = X.iloc[-12:]
# The two windows must partition X exactly, with no overlap and no gap.
assert len(train_data) + len(test_data) == len(X)
In [14]:
# Train the autoregression model on the training window.
# NOTE(review): statsmodels has deprecated `AR` in favour of `AutoReg` in
# recent releases; confirm the installed version still supports `AR` before
# re-running. Later cells read `model_fitted.k_ar`, so a migration has to
# update them together.
model = AR(train_data)
# fit() is called with no arguments, so the lag order is chosen by the
# library's defaults rather than being set here.
model_fitted = model.fit()
In [15]:
# Report the lag order the fitted model ended up using (read from k_ar).
# Message grammar fixed: "chose" -> "chosen".
print('The lag value chosen is: %s' % model_fitted.k_ar)
In [16]:
# Show the fitted parameters exposed by the results object (model_fitted.params).
print('The coefficients of the model are:\n %s' % model_fitted.params)
In [17]:
# Forecast the hold-out window. start/end are positions counted from the
# beginning of the training data, so the window begins at the first step after
# the training data and spans len(test_data) steps.
predictions = model_fitted.predict(
    start=len(train_data),
    end=len(train_data) + len(test_data) - 1,
    dynamic=False,
)

# Side-by-side frame of actual vs. predicted values. The actuals are the last
# 12 rows of the differenced series; the two are joined column-wise on their
# index, then given readable column names (the forecast column arrives as 0).
compare_df = pd.concat(
    [sales_data['stationary'].tail(12), predictions],
    axis=1,
)
compare_df = compare_df.rename(
    columns={'stationary': 'actual', 0: 'predicted'}
)
In [18]:
# Display the actual-vs-predicted table for the hold-out rows.
compare_df
Out[18]:
In [19]:
# Overlay the 'actual' and 'predicted' columns to see how closely the forecasts track.
compare_df.plot()
Out[19]:
In [20]:
# R^2 of the forecasts against the last 12 values of the differenced series.
# Both inputs are on the differenced ('stationary') scale, not the original
# sales scale, so the score describes fit to month-over-month changes.
r2 = r2_score(sales_data['stationary'].tail(12), predictions)
In [21]:
# Display the R^2 score computed in the previous cell.
r2
Out[21]:
In [ ]: