Sebastian Raschka
last updated: 05/25/2014
I would be happy to hear your comments and suggestions.
Please feel free to drop me a note via
Twitter, email, or Google+.
In [1]:
# The statistics module has been added to
# the standard library in Python 3.4
import statistics as stats
import numpy as np
def calc_mean(samples):
    """Return the arithmetic mean of `samples` using only built-ins.

    The sum of the items is divided by the item count, with the count
    converted to float first. An empty input raises ZeroDivisionError.
    """
    total = sum(samples)
    count = float(len(samples))
    return total / count
def np_mean(samples):
    """Return the mean of `samples` by passing the input straight to numpy.mean()."""
    mean_value = np.mean(samples)
    return mean_value
def np_mean_ary(np_array):
    """Return the mean of `np_array` as computed by numpy.mean()."""
    mean_value = np.mean(np_array)
    return mean_value
def st_mean(samples):
    """Return the mean of `samples` as computed by statistics.mean()."""
    average = stats.mean(samples)
    return average
def np_convert_and_mean_ary(samples):
    """Build a NumPy array from `samples` with np.array(), then return its numpy.mean()."""
    as_array = np.array(samples)
    return np.mean(as_array)
In [2]:
# Sanity check: every implementation must report the same mean for 0..n-1.
n = 1000
samples = list(range(n))
samples_array = np.arange(n)

# The implementations sum in different ways, so compare with a floating-point
# tolerance instead of chained `==`; exact equality is not guaranteed for
# arbitrary inputs. A mismatch still raises AssertionError.
np.testing.assert_allclose(
    [st_mean(samples),
     np_mean(samples),
     np_mean_ary(samples_array),
     np_convert_and_mean_ary(samples)],
    calc_mean(samples),
    err_msg='mean implementations disagree')
print('ok')
In [3]:
import timeit
# Time every mean implementation on inputs of 10, 100, ..., 100000 elements.
funcs = ['st_mean', 'np_mean', 'calc_mean', 'np_mean_ary', 'np_convert_and_mean_ary']
orders_n = [10**n for n in range(1, 6)]
# Maps function name -> one timing per entry of orders_n.
times_n = {f: [] for f in funcs}

for n in orders_n:
    list_input = list(range(n))
    array_input = np.arange(n)
    for f in funcs:
        # Choose the input per function. Only 'np_mean_ary' takes a ready-made
        # array; every other function must receive the list. Rebinding
        # `samples` to the array once and leaving it (as before) caused
        # 'np_convert_and_mean_ary' to be timed on an array instead of a list.
        # `samples` has to stay a module-level name because the timeit setup
        # string imports it from __main__.
        samples = array_input if f == 'np_mean_ary' else list_input
        timer = timeit.Timer('%s(samples)' % f,
                             'from __main__ import %s, samples' % f)
        # Best of 3 repeats; each repeat is the total seconds for 1000 calls,
        # which is numerically the time per call in milliseconds.
        times_n[f].append(min(timer.repeat(repeat=3, number=1000)))
In [4]:
import platform
import multiprocessing
def print_sysinfo():
    """Print the interpreter, NumPy and machine details of the current environment.

    Each row is printed as "<label> <value>"; three newlines are emitted at
    the end. Nothing is returned.
    """
    # (label, zero-argument getter) pairs; getters are called lazily so each
    # value is looked up immediately before its line is printed.
    report = (
        ('\nPython version:', platform.python_version),
        ('NumPy version', lambda: np.__version__),
        ('compiler:', platform.python_compiler),
        ('\nsystem :', platform.system),
        ('release :', platform.release),
        ('machine :', platform.machine),
        ('processor :', platform.processor),
        ('interpreter:', lambda: platform.architecture()[0]),
        ('CPU count :', multiprocessing.cpu_count),
    )
    for label, getter in report:
        print(label, getter())
    print('\n\n')
In [5]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [6]:
import matplotlib.pyplot as plt
def plot_timing():
    """Plot the recorded timings of every mean implementation on log-log axes.

    Reads the module-level `orders_n` (sample sizes, x-axis) and `times_n`
    (function name -> list of timings, y-axis). Draws one line per
    implementation, then adds a caption with the smallest and largest ratio
    of the 'st_mean' timings to the 'np_convert_and_mean_ary' timings.
    Shows the figure; returns nothing.
    """
    # (key into times_n, legend text) for each plotted line, in plotting order.
    legend_by_func = (
        ('st_mean', 'statistics.mean()'),
        ('np_mean', 'numpy.mean() on list'),
        ('np_mean_ary', 'numpy.mean() on array'),
        ('calc_mean', 'sum(samples)/len(samples)'),
        ('np_convert_and_mean_ary', 'convert to array then numpy.mean()'),
    )

    plt.rcParams.update({'font.size': 12})
    plt.figure(figsize=(10,8))

    for func_name, legend_text in legend_by_func:
        plt.plot(orders_n, times_n[func_name],
                 alpha=0.5, label=legend_text, marker='o', lw=3)

    plt.xlabel('sample size n')
    plt.ylabel('time per computation in milliseconds [ms]')
    plt.legend(loc=2)
    plt.grid()
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Performance of different approaches for calculating sample means')

    # Per-sample-size ratio of statistics.mean() time to convert-then-mean time.
    speedups = [slow / fast
                for slow, fast in zip(times_n['st_mean'],
                                      times_n['np_convert_and_mean_ary'])]
    max_perf = max(speedups)
    min_perf = min(speedups)

    caption = ('Converting a list to a numpy array and then using numpy.mean() \n is {:.2f}x to '
               '{:.2f}x faster than statistics.mean() on lists')
    plt.figtext(.14,.15, caption.format(min_perf, max_perf), fontsize=11, ha='left')
    plt.show()
In [7]:
# Print the environment details, then draw the timing comparison figure.
print_sysinfo()
plot_timing()
In [7]: