In [1]:
%matplotlib inline
from datetime import date
from l1 import l1
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import signal as sp_signal
from sklearn import ensemble
import statsmodels.api as sm
plt.style.use('fivethirtyeight')
_DATA_DIR = 'data'
_FIG_DIR = 'figures'
if not os.path.exists(_FIG_DIR):
os.makedirs(_FIG_DIR)
_SNP_500_PATH = os.path.join(_DATA_DIR, 'snp500.csv')
_BEGINNING_DATE = date(2014, 1, 1)
## Matplotlib Variables
_FIG_SIZE = (16, 12)
_FIG_FORMAT = 'png'
_FIG_DPI = 200
_FIG_LEGEND_LOCATION = 4
_ORIGINAL_SIGNAL_LABEL = 'Original Signal'
_MEAN_AVERAGE_SIGNAL_LABEL = 'Mean Averaged Signal'
_MEDIAN_FILTERED_SIGNAL_LABEL = 'Median Filtered Signal'
_EXPONENTIAL_SMOOTHED_AVERAGE_SIGNAL_LABEL = 'Exponential Moving Averaged Signal'
_SNP_CYCLE_SIGNAL_LABEL = 'Cycle Signal'
_SNP_TREND_SIGNAL_LABEL = 'Trend'
_L1_TREND_SIGNAL_LABEL = '$l_1$ Trend'
_BAND_PASS_FILTER_LABEL = 'Band-pas filtered'
_BUTTERWORTH_FILTER_TITLE = 'Butterworth filter frequency response'
_BUTTERWORTH_FILTER_XLABEL = 'Frequency [radians / second]'
_BUTTERWORTH_FILTER_YLABEL = 'Amplitude [dB]'
_MEAN_AVERAGE_SIGNAL_TITLE_TEMPLATE = 'Mean Average signal with window = {}'.format
_MEDIAN_FILTER_SIGNAL_TITLE_TEMPLATE = 'Median Filtered signal with window = {}'.format
_EXPONENTIAL_SMOOTHING_TITLE_TEMPLATE = 'Exponential Smoothed signal with span = {}'.format
_HODRICK_PRESSCOTT_TITLE_TEMPLATE = 'Trend and Cycle Signal with smoothing parameter: {}'.format
_L1_TITLE_TEMPLATE = '$l_1$ Trend Signal with regularizer: {}'.format
_BAND_PASS_TITLE_TEMPLATE = 'Band-pass order: {}, low - high cutoff frequency: {} - {}'.format
Reference: http://www.albany.edu/cer/bc/bc_essays.html
In [2]:
def _file_format(string_):
string_ = string_.replace('-', '_').replace(' ', '_').replace('$', '')
string_ += '.' + _FIG_FORMAT
return string_
In [3]:
df = pd.read_csv(_SNP_500_PATH, parse_dates=['Date'])
df = df.sort(['Date'])
df = df[df.Date > _BEGINNING_DATE]
In [4]:
df.head()
Out[4]:
In [5]:
title = 'SNP 500 Close Price between {} - 2015-07-11'.format(_BEGINNING_DATE)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-');
plt.title(title);
plt.savefig(fig_path, dpi=_FIG_DPI)
For a window size: $N$, the computation is pretty straightforward. Applying window on top of the original signal and average the signal for a given time window. Following formula assumes that $N$ being odd.
$$y(t) = \frac{\displaystyle\sum_{i=-\frac{N+1}{2}}^{\frac{N+1}{2}} x(t + i)}{w} $$
In [6]:
window = 11
title = _MEAN_AVERAGE_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.rolling_mean(df.Close, window), '-', label=_MEAN_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [7]:
window = 21
title = _MEAN_AVERAGE_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.rolling_mean(df.Close, window), '-', label=_MEAN_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [8]:
window = 61
title = _MEAN_AVERAGE_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.rolling_mean(df.Close, window), '-', label=_MEAN_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [9]:
window = 11
median_filtered_signal = sp_signal.medfilt(df.Close, window)
title = _MEDIAN_FILTER_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, median_filtered_signal, '-', label=_MEAN_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [10]:
window = 21
median_filtered_signal = sp_signal.medfilt(df.Close, window)
title = _MEDIAN_FILTER_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, median_filtered_signal, '-', label=_MEDIAN_FILTERED_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [11]:
window = 61
median_filtered_signal = sp_signal.medfilt(df.Close, window)
title = _MEDIAN_FILTER_SIGNAL_TITLE_TEMPLATE(window)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, median_filtered_signal, '-', label=_MEDIAN_FILTERED_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
One can think the exponential weights in terms of span and half life. Half life signifies when the weight becomes half of the max and span is how many days it spans when you get the average of signal with weights.
In [12]:
## Exponential Weighted Moving Average
span = 11
title = _EXPONENTIAL_SMOOTHING_TITLE_TEMPLATE(span)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.stats.moments.ewma(df.Close, span=span), '-', label=_EXPONENTIAL_SMOOTHED_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [13]:
## Exponential Weighted Moving Average
span = 21
title = _EXPONENTIAL_SMOOTHING_TITLE_TEMPLATE(span)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.stats.moments.ewma(df.Close, span=span), '-', label=_EXPONENTIAL_SMOOTHED_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [14]:
## Exponential Weighted Moving Average
span = 61
title = _EXPONENTIAL_SMOOTHING_TITLE_TEMPLATE(span)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, pd.stats.moments.ewma(df.Close, span=span), '-', label=_EXPONENTIAL_SMOOTHED_AVERAGE_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
There are various ways to do trend estimation methods, you could decompose the signal into two compoenent; one cycle part(which is short-term) and one trend part(which is medium-to-long term), which is what Hodrick-Prescott Filter tries to do.
Hodrick Prescott filter is a bandpass filter where it tries to decompose the time-series signal into a trend $x_t$ (mid-term growth) and a cylical component(recurring and seasonal signal) $c_t$.
$$y_t = x_t + c_t$$The loss function that it tries to minimize is the following:
$$\min_{\\{ x_{t}\\} }\sum_{t}^{T} c_{t}^{2}+\lambda\displaystyle\sum_{t=1}^{T}\left[\left(x_{t}- x_{t-1}\right)-\left(x_{t-1}-x_{t-2}\right)\right]^{2}$$The first term is the square of difference of original signal and growth signal(cylical component) and $\lambda$ is the smoothing parameter.
Based on the smoothing parameter, you could actually change what type of effects you may want to include or capture(if you want to capture some variation and volatility in short-term signal, then you may want to use a smaller smoothing parameter so that you have less smooth signal. If you want to also capture only a long term range signal, the smoothing parameter could be chosen arbitrarily large. However, in order to get some changes, we need to not to choose very large smoothing optimization parameter.
I wrote this method in detail here.
In [15]:
lamb = 10
snp_cycle, snp_trend = sm.tsa.filters.hpfilter(df.Close, lamb=lamb)
title = _HODRICK_PRESSCOTT_TITLE_TEMPLATE(lamb)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_cycle, '-', label=_SNP_CYCLE_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_trend, '-', label=_SNP_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [16]:
lamb = int(1e3)
snp_cycle, snp_trend = sm.tsa.filters.hpfilter(df.Close, lamb=lamb)
title = _HODRICK_PRESSCOTT_TITLE_TEMPLATE(lamb)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_cycle, '-', label=_SNP_CYCLE_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_trend, '-', label=_SNP_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [17]:
lamb = int(1e6)
snp_cycle, snp_trend = sm.tsa.filters.hpfilter(df.Close, lamb=lamb)
title = _HODRICK_PRESSCOTT_TITLE_TEMPLATE(lamb)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_cycle, '-', label=_SNP_CYCLE_SIGNAL_LABEL)
plt.plot_date(df.Date, snp_trend, '-', label=_SNP_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
Optimization function: $$ \frac{1}{2} \lVert x - y \rVert_2^2 + \lambda \lVert Dx \rVert_1$$ where $x,y \in \mathbf{R}^n$, $\lambda \in \mathbf{R}_+^n$ and $D \in \mathbf{R}^{(n-2) x n}$ is the second order difference matrix
In [18]:
regularizer = 1
title = _L1_TITLE_TEMPLATE(regularizer)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, l1(df.Close.values, regularizer), '-', label=_L1_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [19]:
regularizer = 10
title = _L1_TITLE_TEMPLATE(regularizer)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, l1(df.Close.values, regularizer), '-', label=_L1_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [20]:
regularizer = 100
title = _L1_TITLE_TEMPLATE(regularizer)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, l1(df.Close.values, regularizer), '-', label=_L1_TREND_SIGNAL_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
Bandpass filters allow certain frequencies of the signal(between low cutoff frequency and high cutoff frequency) and attenuates the other frequencies. By doing so, one can prepare different bandpass filters to stop a particular band as well(called band-stop filter). If the signal is very volatile(high frequency) and you want to also remove the bias(low frequency), then bandpass filter provides a nice sweet spot to remove/attenuate low frequency and high frequency in the signal. If you consider Hodrick-Prescott filter as a band-pass filter, it extracts mid-term trend by removing very small changes(bias) and extracting short-term changes(cycle).
In [21]:
## Filter Construction
filter_order = 2
low_cutoff_frequency = 0.001
high_cutoff_frequency = 0.15
b, a = sp_signal.butter(filter_order, [low_cutoff_frequency, high_cutoff_frequency], btype='bandpass', output='ba')
bandpass_filtered = sp_signal.filtfilt(b, a, df.Close.values)
# Plot
title = _BAND_PASS_TITLE_TEMPLATE(filter_order, low_cutoff_frequency, high_cutoff_frequency)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, df.Close-bandpass_filtered, '-', label=_BAND_PASS_FILTER_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [22]:
w, h = sp_signal.freqs(b, a)
plt.figure(figsize=_FIG_SIZE)
plt.semilogx(w, 20 * np.log10(abs(h)))
plt.title(_BUTTERWORTH_FILTER_TITLE)
plt.xlabel(_BUTTERWORTH_FILTER_XLABEL)
plt.ylabel(_BUTTERWORTH_FILTER_YLABEL)
#plt.margins(0, 0.1)
plt.grid(which='both', axis='both')
plt.axvline(0, color='green')
plt.axvline(high_cutoff_frequency, color='green');
In [23]:
## Filter Construction
filter_order = 2
low_cutoff_frequency = 0.001
high_cutoff_frequency = 0.35
b, a = sp_signal.butter(filter_order, [low_cutoff_frequency, high_cutoff_frequency], btype='bandpass', output='ba')
bandpass_filtered = sp_signal.filtfilt(b, a, df.Close.values)
# Plot
title = _BAND_PASS_TITLE_TEMPLATE(filter_order, low_cutoff_frequency, high_cutoff_frequency)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, df.Close - bandpass_filtered, '-', label=_BAND_PASS_FILTER_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [24]:
w, h = sp_signal.freqs(b, a)
plt.figure(figsize=_FIG_SIZE)
plt.semilogx(w, 20 * np.log10(abs(h)))
plt.title(_BUTTERWORTH_FILTER_TITLE)
plt.xlabel(_BUTTERWORTH_FILTER_XLABEL)
plt.ylabel(_BUTTERWORTH_FILTER_YLABEL)
plt.grid(which='both', axis='both')
plt.axvline(0, color='green')
plt.axvline(high_cutoff_frequency, color='green');
In [25]:
## Filter Construction
filter_order = 2
low_cutoff_frequency = 0.001
high_cutoff_frequency = 0.95
b, a = sp_signal.butter(filter_order, [low_cutoff_frequency, high_cutoff_frequency], btype='bandpass', output='ba')
bandpass_filtered = sp_signal.filtfilt(b, a, df.Close.values)
# Plot
title = _BAND_PASS_TITLE_TEMPLATE(filter_order, low_cutoff_frequency, high_cutoff_frequency)
fig_path = os.path.join(_FIG_DIR, _file_format(title))
plt.figure(figsize=_FIG_SIZE)
plt.plot_date(df.Date, df.Close, '-', label=_ORIGINAL_SIGNAL_LABEL)
plt.plot_date(df.Date, df.Close - bandpass_filtered, '-', label=_BAND_PASS_FILTER_LABEL)
plt.title(title)
plt.legend(loc=_FIG_LEGEND_LOCATION);
plt.savefig(fig_path, format=_FIG_FORMAT, dpi=_FIG_DPI)
In [26]:
w, h = sp_signal.freqs(b, a)
plt.figure(figsize=_FIG_SIZE)
plt.semilogx(w, 20 * np.log10(abs(h)))
plt.title(_BUTTERWORTH_FILTER_TITLE)
plt.xlabel(_BUTTERWORTH_FILTER_XLABEL)
plt.ylabel(_BUTTERWORTH_FILTER_YLABEL)
#plt.margins(0, 0.1)
plt.grid(which='both', axis='both')
plt.axvline(0, color='green')
plt.axvline(high_cutoff_frequency, color='green');
low cutoff frequency and high cutoff frequency) and attenuates the other frequencies.