TL;DR Comparison of approaches
In [2]:
# Notebook setup: imports, inline plotting and display configuration.
from numba import jit
import numpy as np
import pandas as pd
# NOTE: this `sum` is numpy.sum and shadows the builtin `sum` from here on.
from numpy import sum, power, mean
from matplotlib import pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib
# Default figure size as (width, height) in inches.
matplotlib.rcParams['figure.figsize'] = (16, 8)
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
In [3]:
@jit(nopython=True)
def sum_sq_dev(x):
    """Return the sum of squared deviations of ``x`` from its mean.

    Vectorized variant: the deviations are computed as a whole-array
    expression and then reduced with numpy's ``sum``.  The function is
    compiled by numba in nopython mode; in this notebook it is called with
    the arrays produced by ``np.random.randn``.
    """
    deviations = x - mean(x)
    squared = power(deviations, 2)
    return sum(squared)
In [4]:
@jit(nopython=True)
def sum_sq_dev_experimental(x):
    """Return the sum of squared deviations of ``x`` from its mean.

    Explicit-loop variant: the mean is computed once, then each element's
    squared deviation is added to a running float accumulator.  The function
    is compiled by numba in nopython mode.
    """
    centre = mean(x)  # computed once, outside the loop
    accumulated = 0.0
    for element in x:
        deviation = element - centre
        accumulated = accumulated + power(deviation, 2)
    return accumulated
In [5]:
# Small test input: 1000 samples drawn from the standard normal distribution.
x = np.random.randn(1000)
In [6]:
# Basic sanity test: both implementations must agree on the same input.
# assert_allclose compares with a relative tolerance, so the check stays
# meaningful whatever the magnitude of the sums; assert_almost_equal uses a
# fixed absolute tolerance (7 decimals) and NumPy recommends against it for
# floating-point comparisons.
np.testing.assert_allclose(sum_sq_dev(x), sum_sq_dev_experimental(x))
In [7]:
results = []
for exponent in range(7):
population_size = 10**exponent
x = np.random.randn(population_size)
timings_v1 = %timeit -o sum_sq_dev(x)
timings_v2 = %timeit -o sum_sq_dev_experimental(x)
np.testing.assert_almost_equal(sum_sq_dev(x), sum_sq_dev_experimental(x))
results.append((population_size, timings_v1.best, timings_v2.best))
In [8]:
# Table of best timings (in seconds, as reported by %timeit), one row per
# population size and one column per implementation.
df = pd.DataFrame(
    np.array(results),
    columns=['population_size', 'sum_sq_dev', 'sum_sq_dev_experimental'],
)
# np.array() promoted the sizes to float; restore them to integers for the index.
df['population_size'] = df['population_size'].astype(int)
df = df.set_index('population_size')

# Convert seconds to milliseconds.  The scaled frame must be kept: the original
# called df.apply(...) without using the result, so the plot showed seconds
# under a "ms" axis label.  `df` itself is left in seconds.
df_ms = df * 1000
df_ms  # bare expression: shown as cell output (ast_node_interactivity is "all")
df_ms.plot(logx=True);
plt.ylabel('best time (ms)')
Out[8]:
Out[8]:
Out[8]:
In [ ]: