Time Series Forcasting

Timothy Helton

Tutorial from Dr. Jason Brownlee's Machine Learning Mastery site.



Imports


In [1]:
from collections import OrderedDict
import os
import os.path as osp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline


Load Data


In [2]:
data_file = osp.join('..', 'data', 'time_series_forecasting',
                     'car_sales.csv')
data = pd.read_csv(data_file, header=0)
data.drop(108, inplace=True)
data.columns = ['date', 'sales']
data.date = pd.to_datetime(data.date)

In [3]:
data.head()
data.tail()


Out[3]:
date sales
0 1960-01-01 6550.0
1 1960-02-01 8728.0
2 1960-03-01 12026.0
3 1960-04-01 14395.0
4 1960-05-01 14587.0
Out[3]:
date sales
103 1968-08-01 16722.0
104 1968-09-01 14385.0
105 1968-10-01 21342.0
106 1968-11-01 17180.0
107 1968-12-01 14577.0

In [4]:
months = OrderedDict({
    'JAN': 1,
    'FEB': 2,
    'MAR': 3,
    'APR': 4,
    'MAY': 5,
    'JUN': 6,
    'JUL': 7,
    'AUG': 8,
    'SEP': 9,
    'OCT': 10,
    'NOV': 11,
    'DEC': 12,
})
month_masks = OrderedDict(
    {k: data.date.map(lambda x: x.month == v)
     for k, v in months.items()})

In [5]:
sales = pd.DataFrame(OrderedDict({k: data.sales[v]
                                  for k, v in month_masks.items()}))

In [6]:
suptitle = 'Quebec Car Sales'
title = '1960 - 1968'

suptitle_size = 24
title_size = 20
label_size = 16

fig = plt.figure(f'{suptitle}: {title}', figsize=(10, 15),
                 facecolor='white', edgecolor='black')
rows, cols = (3, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))

data.plot(x='date', y='sales', color='navy', legend=None, ax=ax1);

ax1.set_title(title, fontsize=title_size)
ax1.set_xlabel('Date', fontsize=label_size)
ax1.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, y=0.93)
fig.autofmt_xdate();



In [7]:
suptitle = 'Quebec Car Sales Distribution'
title = '1960 - 1968'

suptitle_size = 24
title_size = 20
label_size = 16

fig = plt.figure(f'{suptitle}: {title}', figsize=(10, 7),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))

sales.plot(kind='box', ax=ax1)

ax1.set_title(title, fontsize=title_size)
ax1.set_xlabel('Month', fontsize=label_size)
ax1.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, y=0.97);



Train and Test Datasets


In [8]:
test_duration = 24
train = data[0:-test_duration]
test = data[-test_duration:]


Optimized Persistence Forcast

Derive Persistance Model

  • prediction[n] = train[-n]

Make Forcast


In [9]:
scores = []
for td in range(test_duration):
    nieve = data.shift(td + 1)
    scores.append(mean_squared_error(test.sales,
                                     nieve.sales[-test_duration:])**0.5)
forcast_score = nieve.date[-test_duration:].copy().to_frame()
forcast_score['scores'] = scores

Plot Initial Prediction


In [10]:
suptitle = f'Nieve Set Size: {test_duration}'
title = 'Root Mean Square Deviation'

suptitle_size = 24
title_size = 20
label_size = 16

fig = plt.figure(f'{suptitle}: {title}', figsize=(10, 8),
                 facecolor='white', edgecolor='black')
rows, cols = (2, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))
ax2 = plt.subplot2grid((rows, cols), (1, 0))

forcast_score.plot(x='date', y='scores', legend=None, ax=ax1)

ax1.set_title(title, fontsize=title_size)
ax1.set_xlabel('Date', fontsize=label_size)
ax1.set_ylabel('Root Mean Squared Error', fontsize=label_size)

test.plot(x='date', y='sales', label='Test Dataset', ax=ax2)

nieve_1 = data.copy()
nieve_1.sales = nieve_1.sales.shift(1)
nieve_1[-test_duration:].plot(x='date', y='sales',
                              label='Nieve 1 Month Shift',
                              ax=ax2)

nieve_12 = data.copy()
nieve_12.sales = nieve_12.sales.shift(12)
nieve_12[-test_duration:].plot(x='date', y='sales',
                               label='Nieve 12 Month Shift',
                               ax=ax2)

ax2.set_title('Forcasted Sales', fontsize=title_size)
ax2.set_xlabel('Date', fontsize=label_size)
ax2.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, x=0.54, y=1.03)
plt.tight_layout();


Review and Assess 1

A cyclic pattern occurs every 12 months. Alter the persistance model to be the average of all prior months of like kind.


In [11]:
sales_mean = sales.mean()
sales_mean.index = range(12)
sales_mean = sales_mean.append([sales_mean] * 8, ignore_index=True)

weighted_month = data.copy()
weighted_month['sales_mean'] = sales_mean

In [12]:
suptitle = 'Weighted Monthly Average'

suptitle_size = 24
label_size = 16

fig = plt.figure(f'{suptitle}', figsize=(10, 8),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))

test.plot(x='date', y='sales', label='Test Dataset', ax=ax1)
weighted_month[-test_duration:].plot(x='date', y='sales_mean',
                                     ax=ax1)

ax1.set_xlabel('Date', fontsize=label_size)
ax1.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, y=1.03)
plt.tight_layout();




Expanding Window Forecast

  • Calculates a statistic on all available historic data and use that to make a forcast.
  • This model grows as more real observations are collected.

In [13]:
window_mean = data.sales.expanding().mean()
window_median = data.sales.expanding().median()
window_statistics = pd.concat([data.date,
                               window_mean,
                               window_median], axis=1)
window_statistics.columns = ['date', 'win_mean', 'win_median']
window_statistics[-test_duration:]


Out[13]:
date win_mean win_median
84 1967-01-01 13773.917647 12965.0
85 1967-02-01 13748.732558 12862.5
86 1967-03-01 13831.908046 12965.0
87 1967-04-01 13898.500000 13183.0
88 1967-05-01 14012.910112 13401.0
89 1967-06-01 14102.922222 13577.5
90 1967-07-01 14104.208791 13754.0
91 1967-08-01 14096.923913 13594.0
92 1967-09-01 14091.559140 13598.0
93 1967-10-01 14124.489362 13676.0
94 1967-11-01 14145.484211 13754.0
95 1967-12-01 14140.979167 13733.5
96 1968-01-01 14131.381443 13713.0
97 1968-02-01 14132.602041 13733.5
98 1968-03-01 14193.272727 13754.0
99 1968-04-01 14268.590000 13769.0
100 1968-05-01 14385.722772 13784.0
101 1968-06-01 14451.392157 13787.5
102 1968-07-01 14486.077670 13791.0
103 1968-08-01 14507.576923 13806.0
104 1968-09-01 14506.409524 13821.0
105 1968-10-01 14570.896226 13876.5
106 1968-11-01 14595.280374 13932.0
107 1968-12-01 14595.111111 14076.0

In [14]:
suptitle = 'Expanding Window Forecast'

suptitle_size = 24
label_size = 16

fig = plt.figure(f'{suptitle}', figsize=(10, 8),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))

test.plot(x='date', y='sales', label='Test Dataset', ax=ax1)
window_statistics[-test_duration:].plot(x='date', y='win_mean',
                                        label='Expanding Window Mean',
                                        ax=ax1)
window_statistics[-test_duration:].plot(x='date', y='win_median',
                                        label='Expanding Window Median',
                                        ax=ax1)

ax1.set_xlabel('Date', fontsize=label_size)
ax1.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, y=1.03)
plt.tight_layout();




Rolling Window Forcast

  • Calculates a statistic on n number of historic data and use that to make a forcast.

In [15]:
rolling_mean = pd.DataFrame(data.date)
rolling_mean_error = []
rolling_median = pd.DataFrame(data.date)
rolling_median_error = []

for n in range(1, test_duration + 1):
    rolling_mean[f'rolling_{n}'] = data.sales.rolling(n).mean()
    rolling_median[f'rolling_{n}'] = data.sales.rolling(n).median()
    rolling_mean_error.append(mean_squared_error(
        test.sales, rolling_mean[f'rolling_{n}'][-test_duration - 1:-1])**0.5)
    rolling_median_error.append(mean_squared_error(
        test.sales, rolling_median[f'rolling_{n}'][-test_duration - 1:-1])**0.5)

min_mean = min(rolling_mean_error)
min_mean_idx = rolling_mean_error.index(min_mean) + 1
min_median = min(rolling_median_error)
min_median_idx = rolling_median_error.index(min_median) + 1

In [16]:
suptitle = f'Rolling Window: {test_duration}'
title = 'Root Mean Square Deviation'

suptitle_size = 24
title_size = 20
label_size = 16

fig = plt.figure(f'{suptitle}: {title}', figsize=(10, 8),
                 facecolor='white', edgecolor='black')
rows, cols = (2, 1)
ax1 = plt.subplot2grid((rows, cols), (0, 0))
ax2 = plt.subplot2grid((rows, cols), (1, 0))

ax1.plot(range(1, len(rolling_mean_error) + 1), rolling_mean_error,
         label='Rolling Mean Error')
ax1.plot(range(1, len(rolling_median_error) + 1), rolling_median_error,
         label='Rolling Median Error')
ax1.scatter(min_mean_idx, min_mean, marker='D')
ax1.text(min_mean_idx, min_mean - 150, 'Minimum Mean Error')
ax1.scatter(min_median_idx, min_median, marker='D')
ax1.text(min_median_idx, min_median - 150, 'Minimum Median Error')

ax1.set_title(title, fontsize=title_size)
ax1.legend()
ax1.set_xlabel('Date', fontsize=label_size)
ax1.set_ylabel('Root Mean Squared Error', fontsize=label_size)
ax1_xmin, ax1_xmax, ax1_ymin, ax1_ymax = ax1.axis()
ax1.axis((ax1_xmin, ax1_xmax, 3500, ax1_ymax))

test.plot(x='date', y='sales', label='Test Dataset', ax=ax2)

rolling_mean[-test_duration:].plot(x='date', y='rolling_13',
                                   label='Rolling 13 Month Mean',
                                   ax=ax2)
                                    
rolling_median[-test_duration:].plot(x='date', y='rolling_13',
                                     label='Rolling 13 Month Median',
                                     ax=ax2)

ax2.set_title('Forcasted Sales', fontsize=title_size)
ax2.set_xlabel('Date', fontsize=label_size)
ax2.set_ylabel('Sales', fontsize=label_size)

fig.suptitle(suptitle, fontsize=suptitle_size, x=0.54, y=1.03)
plt.tight_layout();