In [5]:
import timeit
import numpy as np
import statsmodels.stats.weightstats as stats
import statsmodels.tsa.stattools as stattools
In [53]:
def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False):
""" Very close to np.percentile, but supports weights.
NOTE: quantiles should be in [0, 1]!
:param values: np.array with data
:param quantiles: array-like with many quantiles needed
:param sample_weight: array-like of the same length as `array`
:param values_sorted: bool, if True, then will avoid sorting of initial array
:param old_style: if True, will correct output to be consistent with np.percentile.
:return: np.array with computed quantiles.
"""
values = np.array(values)
quantiles = np.array(quantiles)
if sample_weight is None:
sample_weight = np.ones(len(values))
sample_weight = np.array(sample_weight)
assert np.all(quantiles >= 0) and np.all(quantiles <= 1), 'quantiles should be in [0, 1]'
if not values_sorted:
sorter = np.argsort(values)
values = values[sorter]
sample_weight = sample_weight[sorter]
weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
weighted_quantiles /= np.sum(sample_weight)
return np.interp(quantiles, weighted_quantiles, values)
In [59]:
def desc_stats(x, w):
s = stats.DescrStatsW(x, weights = w)
d = {}
d['mean'] = s.mean
d['std' ] = s.std
q = s.quantile( np.array([0.10, 0.25, 0.5, 0.75, 0.90]), return_pandas = False)
d['decile_1'] = q[0] # save 10% and 90% intervals as well
d['Q1'] = q[1]
d['median'] = q[2]
d['Q3'] = q[3]
d['decile_9'] = q[4]
d['inner_quartile_range'] = d['Q3'] - d['Q1']
d['q90_q10_range'] = d['decile_9'] - d['decile_1']
d['variance'] = d['std']*d['std']
d['min'] = np.min(x)
d['max'] = np.max(x)
return d
def numpy_stats(x, w):
d = {}
d['mean'] = np.average(x, weights=w)
d['variance'] = np.average( (x-d['mean'])**2, weights=w)
d['std'] = np.sqrt(d['variance'])
q = weighted_quantile(x, [0.1, 0.25, 0.5, 0.75, 0.9], sample_weight=w)
d['decile_1'] = q[0] # save 10% and 90% intervals as well
d['Q1'] = q[1]
d['median'] = q[2]
d['Q3'] = q[3]
d['decile_9'] = q[4]
d['inner_quartile_range'] = d['Q3'] - d['Q1']
d['q90_q10_range'] = d['decile_9'] - d['decile_1']
d['min'] = np.min(x)
d['max'] = np.max(x)
return d
def numpy_stats2(x, w):
d = {}
d['mean'] = np.average(x, weights=w)
d['variance'] = np.average( (x-d['mean'])**2, weights=w)
d['std'] = np.sqrt(d['variance'])
q = weighted_quantile(x, [0.1, 0.25, 0.5, 0.75, 0.9], sample_weight=w, old_style = True)
d['decile_1'] = q[0] # save 10% and 90% intervals as well
d['Q1'] = q[1]
d['median'] = q[2]
d['Q3'] = q[3]
d['decile_9'] = q[4]
d['inner_quartile_range'] = d['Q3'] - d['Q1']
d['q90_q10_range'] = d['decile_9'] - d['decile_1']
d['min'] = np.min(x)
d['max'] = np.max(x)
return d
data = np.random.rand(10000)
dataw = np.random.normal(size = 10000)
dataw = dataw + np.abs(np.min(dataw))*1.000001 # make all positive and non-zero
dataw *= 1000
def wrapped_numpy_stats():
numpy_stats(data,dataw)
return
def wrapped_desc_stats():
desc_stats(data,dataw)
return
In [ ]:
In [60]:
number = 100
np_time = (timeit.timeit(wrapped_numpy_stats, number = number))
desc_time = (timeit.timeit(wrapped_desc_stats, number = number))
print "For 100 tries of both"
print "numpy: ", np_time
print "desc: ", desc_time
print "desc / numpy: ", desc_time / np_time
print np_time / desc_time
In [61]:
np_result = numpy_stats(data,dataw)
#np_result_2 = numpy_stats2(data,dataw)
desc_result = desc_stats(data,dataw)
In [62]:
for k in np_result.keys():
print "%20s %5.5E %5.5E %5.5E"%(k, np_result[k], np_result_2[k], desc_result[k])
In [63]:
(np_result['median'] - desc_result['median']) / (desc_result['median'])
Out[63]:
In [ ]: