Exponential weighted moving average

TL;DR This notebook demonstrates the performance improvement of using a numba JIT compiled algorithm for calculating exponential weighted moving average over the Pandas equivalent for some sample data.



In [1]:

    
import numpy as np
import pandas as pd
from numba import jit
import time

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from matplotlib import rcParams
from matplotlib import pyplot as plt
rcParams['figure.figsize'] = 16, 8

import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)



In [2]:

    
from utilities.ewma import ewma  # this is the function we're going to test versus pandas



In [3]:

    
# create some sample data and introduce a few NaNs
data = np.arange(1e5) * 1.0
data[3] = np.nan
data[4] = np.nan



In [4]:

    
expected = pd.Series(data).ewm(alpha=0.1, adjust=True, ignore_na=False).mean().values



In [5]:

    
output = ewma(data, alpha=0.1, adjust=True, ignore_na=False)



In [6]:

    
np.allclose(expected, output)  # assert output is as per Pandas equivalent









    Out[6]:





True



In [7]:

    
def benchmarks():
    
    res = []
    
    for exponent in range(3, 7):
        n = 10**exponent
        data = np.arange(n).astype(float)
        data[3] = np.nan
        data[4] = np.nan
        data[-1] = np.nan
        s = pd.Series(data)

        t1 = time.time()
        pandas_output = s.ewm(alpha=0.1, adjust=True, ignore_na=False).mean().values
        t2 = time.time()
        res.append(('pandas', n, (t2 - t1)))
    
        t1 = time.time()
        ewma_output = ewma(data, alpha=0.1, adjust=True, ignore_na=False)
        t2 = time.time()
        res.append(('ewma', n, (t2 - t1)))
        
        assert np.allclose(pandas_output, ewma_output)
        
    return res



In [8]:

    
data = benchmarks()
df = pd.DataFrame(data, columns = ['fn', 'population', 'time (ms)'])

df['time (ms)'] = df['time (ms)'].apply(lambda x: x * 1000.) 
df = pd.pivot_table(df, values='time (ms)', index=['population'], columns=['fn'], aggfunc=np.sum)
df



In [9]:

    
df.plot(logx=True)
plt.ylabel('time (ms)')









    Out[9]:





<matplotlib.text.Text at 0x229773b9a20>



In [ ]:

fn	ewma	pandas
population
1000	0.000000	0.000000
10000	0.000000	0.980854
100000	0.980854	10.765076
1000000	9.810448	108.829498