In [7]:
%matplotlib inline
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [57]:
# Module-level handle to the frame currently being processed: normalize()
# rebinds it to its argument and get_slowdown() reads it to look up baselines.
# The empty string is only a placeholder until normalize() has been called.
df_being_normalized = ""

# Unit conventions applied to the `result` column:
#   - stressng-stream rows are treated as bytes/s
#   - every other row (redisbench) is treated as ops/s, 8 bytes per op
def get_mbps(row):
    """Convert one row's raw ``result`` into MB/s (1 MB = 1024 * 1024 bytes).

    Args:
        row: mapping-like record exposing ``'benchmark'`` and ``'result'``.

    Returns:
        ``result`` scaled to bytes (x1 for ``stressng-stream``, x8 for any
        other benchmark) and divided by 1024 * 1024.
    """
    bytes_per_unit = 1 if row['benchmark'] == 'stressng-stream' else 8
    return (row['result'] * bytes_per_unit) / (1024 * 1024)
    
def get_slowdown(row):
    """Return how many times slower ``row`` is than its machine's raw baseline.

    The baseline is the ``mbps`` value of the single ``stressng-stream`` row
    for ``row['machine']`` in the module-level ``df_being_normalized`` frame
    (set by ``normalize()``), so ``mbps`` must already be populated there.

    Args:
        row: mapping-like record exposing ``'machine'`` and ``'mbps'``.

    Returns:
        ``baseline_mbps / row_mbps``; 1.0 for the baseline row itself.

    Raises:
        ValueError: if the machine does not have exactly one
            ``stressng-stream`` row to use as a baseline.
    """
    frame = df_being_normalized
    # Boolean masks instead of a string-built query(): machine names that
    # contain quotes can no longer break (or alter) the filter expression.
    is_baseline = (frame['benchmark'] == 'stressng-stream') & (frame['machine'] == row['machine'])
    baseline = frame.loc[is_baseline, 'mbps']
    if len(baseline) != 1:
        raise ValueError(
            'expected exactly one stressng-stream baseline for machine %r, found %d'
            % (row['machine'], len(baseline))
        )
    # float(Series) is deprecated in pandas; extract the scalar explicitly.
    base = float(baseline.iloc[0])
    # Same arithmetic form as before so previously published numbers do not shift.
    return 1 / (row['mbps'] / base)

def normalize(data):
    """Add the derived ``mbps`` and ``slowdown`` columns to ``data`` in place.

    ``data`` is also published through the module-level
    ``df_being_normalized`` name, because ``get_slowdown()`` reads the
    baseline rows from there. Nothing is returned.

    Args:
        data: DataFrame with the columns ``get_mbps()`` and
            ``get_slowdown()`` read (``benchmark``, ``machine``, ``result``).
    """
    global df_being_normalized
    df_being_normalized = data
    # Order matters: 'slowdown' is computed from the freshly added 'mbps'.
    for column, compute in (('mbps', get_mbps), ('slowdown', get_slowdown)):
        data[column] = data.apply(compute, axis=1)

In [58]:
# Load the results stored under redis_without/, add the derived mbps/slowdown
# columns in place, and tag every row with limits == 'no'.
df_without = pd.read_csv('redis_without/all.csv')
normalize(df_without)
df_without['limits'] = 'no'

In [59]:
# Load the results stored under redis_limited/, add the derived mbps/slowdown
# columns in place, and tag every row with limits == 'yes'.
df_with = pd.read_csv('redis_limited/all.csv')
normalize(df_with)
df_with['limits'] = 'yes'

In [60]:
# Stack the limits == 'yes' rows on top of the limits == 'no' rows.
# DataFrame.append no longer exists in pandas 2.x; pd.concat is the supported
# equivalent. Row labels of each input are kept (no ignore_index), exactly as
# append did, so the index of the combined frame contains duplicates.
df = pd.concat([df_with, df_without])

In [61]:
# Show the column labels of the combined frame.
df.columns


Out[61]:
Index(['benchmark', 'machine', 'op', 'result', 'mbps', 'slowdown', 'limits'], dtype='object')

In [62]:
# Display the combined frame in full (both the limits == 'yes' and 'no' rows).
df


Out[62]:
benchmark machine op result mbps slowdown limits
0 stressng-stream dwill.soe.ucsc.edu raw 5.631954e+09 5371.050000 1.000000 yes
1 stressng-stream rackform2 raw 2.661852e+09 2538.540000 1.000000 yes
2 stressng-stream scruffy.soe.ucsc.edu raw 3.739086e+09 3565.870000 1.000000 yes
3 stressng-stream pl2 raw 3.363979e+09 3208.140000 1.000000 yes
4 redisbench dwill.soe.ucsc.edu SET 1.885725e+05 1.438694 3733.281712 yes
5 redisbench dwill.soe.ucsc.edu GET 1.830999e+05 1.396941 3844.864702 yes
6 redisbench dwill.soe.ucsc.edu LPUSH 1.998202e+05 1.524507 3523.139159 yes
7 redisbench dwill.soe.ucsc.edu LPOP 1.982947e+05 1.512868 3550.243187 yes
8 redisbench rackform2 SET 9.673051e+04 0.737995 3439.778358 yes
9 redisbench rackform2 GET 9.085540e+04 0.693172 3662.209565 yes
10 redisbench rackform2 LPUSH 9.623249e+04 0.734196 3457.579814 yes
11 redisbench rackform2 LPOP 9.524262e+04 0.726644 3493.514929 yes
12 redisbench scruffy.soe.ucsc.edu SET 1.096792e+05 0.836786 4261.389172 yes
13 redisbench scruffy.soe.ucsc.edu GET 1.067806e+05 0.814671 4377.067032 yes
14 redisbench scruffy.soe.ucsc.edu LPUSH 1.141683e+05 0.871035 4093.831244 yes
15 redisbench scruffy.soe.ucsc.edu LPOP 1.144296e+05 0.873028 4084.484050 yes
16 redisbench pl2 SET 7.719623e+04 0.588960 5447.122561 yes
17 redisbench pl2 GET 7.276696e+04 0.555168 5778.684805 yes
18 redisbench pl2 LPUSH 7.780890e+04 0.593635 5404.231728 yes
19 redisbench pl2 LPOP 7.582076e+04 0.578466 5545.939213 yes
0 stressng-stream dwill.soe.ucsc.edu raw 1.254515e+10 11963.990000 1.000000 no
1 stressng-stream rackform2 raw 5.957055e+09 5681.090000 1.000000 no
2 stressng-stream scruffy.soe.ucsc.edu raw 9.048267e+09 8629.100000 1.000000 no
3 stressng-stream pl2 raw 1.095471e+10 10447.230000 1.000000 no
4 stressng-stream issdm-0 raw 2.003577e+09 1910.760000 1.000000 no
5 redisbench dwill.soe.ucsc.edu SET 4.441484e+05 3.388583 3530.676161 no
6 redisbench dwill.soe.ucsc.edu GET 4.457321e+05 3.400666 3518.131243 no
7 redisbench dwill.soe.ucsc.edu LPUSH 4.644682e+05 3.543611 3376.214415 no
8 redisbench dwill.soe.ucsc.edu LPOP 4.640371e+05 3.540322 3379.350551 no
9 redisbench rackform2 SET 2.278164e+05 1.738101 3268.561411 no
10 redisbench rackform2 GET 2.221235e+05 1.694668 3352.332200 no
11 redisbench rackform2 LPUSH 2.350176e+05 1.793042 3168.408515 no
12 redisbench rackform2 LPOP 2.326664e+05 1.775103 3200.427550 no
13 redisbench scruffy.soe.ucsc.edu SET 2.876456e+05 2.194562 3932.037607 no
14 redisbench scruffy.soe.ucsc.edu GET 2.864919e+05 2.185760 3947.872019 no
15 redisbench scruffy.soe.ucsc.edu LPUSH 3.107520e+05 2.370850 3639.665347 no
16 redisbench scruffy.soe.ucsc.edu LPOP 3.059976e+05 2.334576 3696.217039 no
17 redisbench pl2 SET 2.812939e+05 2.146102 4868.001531 no
18 redisbench pl2 GET 2.715915e+05 2.072079 5041.907347 no
19 redisbench pl2 LPUSH 2.897291e+05 2.210458 4726.274916 no
20 redisbench pl2 LPOP 3.300330e+05 2.517952 4149.098213 no
21 redisbench issdm-0 SET 7.825032e+04 0.597003 3200.589272 no
22 redisbench issdm-0 GET 8.403362e+04 0.641126 2980.320671 no
23 redisbench issdm-0 LPUSH 7.575758e+04 0.577984 3305.901993 no
24 redisbench issdm-0 LPOP 7.698230e+04 0.587328 3253.308030 no

We run the redis benchmark on multiple machines and plot the throughput (in MB/s) of the SET operation for the runs without limits.


In [78]:
# Absolute SET throughput per machine, restricted to the limits == 'no' rows.
sns.barplot(
    data=df[(df['limits'] == 'no') & (df['op'] == 'SET')],
    x='machine',
    y='mbps',
)
plt.xticks(rotation=30)


Out[78]:
(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)

The problem with the above is that these are absolute numbers, and therefore they lack context. One way of providing it is to measure the raw memory bandwidth of each machine and use it as a baseline (i.e. normalize the above w.r.t. raw bandwidth).


In [79]:
# One bar chart per redis operation: slowdown w.r.t. raw memory bandwidth on
# each machine, using only the limits == 'no' rows.
for b in df['op'].unique():
    if b == 'raw':
        # 'raw' marks the stressng-stream baseline rows, whose slowdown is 1.
        continue
    # Boolean mask instead of a query string assembled by concatenation.
    sns.barplot(x='machine', y='slowdown', data=df[(df['limits'] == 'no') & (df['op'] == b)])
    plt.xticks(rotation=30)
    # Current seaborn no longer exposes pyplot as `sns.plt`; call pyplot directly.
    plt.title(b)
    plt.show()


The above shows the overhead (slowdown) of redis w.r.t. the raw memory bandwidth. This makes much more sense: in the first graph we are comparing the same workload on distinct machines, i.e. we are comparing machines. But this hypothetical experiment was meant to evaluate the performance of the KV store!

So, from the first graph, what we can conclude is that "redis is significantly slower on issdm-0". After we normalize, this is no longer the case; in fact, the overhead of redis on issdm-0 is the lowest! Also, our focus moves from comparing hardware to talking about the overhead of redis across machines (which is the goal of the experiment). In this case, the claim we can make is that redis' throughput is roughly 3,000-5,000 times lower than the system's raw memory bandwidth.

Now, would throttling help in this case? Let's see:


In [80]:
# One bar chart per redis operation: slowdown per machine, with the
# limits == 'yes' and limits == 'no' rows side by side (hue).
for b in df['op'].unique():
    if b == 'raw':
        # 'raw' marks the stressng-stream baseline rows, whose slowdown is 1.
        continue
    # Boolean mask instead of a query string assembled by concatenation.
    sns.barplot(x='machine', y='slowdown', hue='limits', data=df[df['op'] == b])
    plt.xticks(rotation=30)
    # Current seaborn no longer exposes pyplot as `sns.plt`; call pyplot directly.
    plt.title(b)
    plt.show()


Since we are throttling both the baseline and the KV store, we don't see any change in the relationship between the overheads on distinct machines. Open question: are there any experiments where proper baselining does not help to contextualize results (i.e. where the 5k upper bound for the overhead would be, say, 100k)? Can throttling be used to "fix" these?


In [ ]: