[PUBLIC] Analysis of CLBlast tuning

Overview

This Jupyter Notebook analyses the performance that CLBlast achieves across a range of routines, sizes and configurations.

Before running this notebook, first run clblast-tuning-benchmarking.py.

Get the experimental data from Dropbox

NB: Please ignore this section if you are not interested in re-running or modifying this notebook.

The experimental data was collected on the experimental platform and archived as follows:

$ cd `ck find ck-math:script:<...>`
$ python <...>.py
$ ck zip local:experiment:* --archive_name=<...>.zip

It can be downloaded and extracted as follows:

$ wget <...>.zip
$ ck add repo:<....> --zip=<....>.zip --quiet

Data wrangling code

NB: Please ignore this section if you are not interested in re-running or modifying this notebook.

Includes

Standard


In [1]:
import os
import sys
import json
import re

Scientific

If some of the scientific packages are missing, please install them using:

# pip install jupyter pandas numpy matplotlib seaborn

In [2]:
import IPython as ip
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mp

In [3]:
# Report the version of every scientific package imported above, one line per package.
for (message, package) in [
    ('IPython version: %s', ip),
    ('Pandas version: %s', pd),
    ('NumPy version: %s', np),
    ('Seaborn version: %s', sns), # apt install python-tk
    ('Matplotlib version: %s', mp),
]:
    print (message % package.__version__)


IPython version: 5.3.0
Pandas version: 0.19.2
NumPy version: 1.12.0
Seaborn version: 0.7.1
Matplotlib version: 2.0.0

In [4]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

In [5]:
from IPython.display import Image
from IPython.core.display import HTML

Collective Knowledge

If CK is not installed, please install it using:

# pip install ck

In [6]:
# Import the Collective Knowledge kernel (used below through ck.access) and report its version.
import ck.kernel as ck
print ('CK version: %s' % ck.__version__)


CK version: 1.8.7

Define helper functions


In [7]:
# Count the floating-point operations performed by C = alpha * A * B + beta * C,
# where A is an MxK matrix and B is a KxN matrix.
def xgemm_flops(alpha, beta, M, K, N):
    """Return the number of floating-point operations of one GEMM call.

    The A * B product is counted as 2*M*N*K operations unless alpha is zero;
    the beta * C update is counted as 2*M*N operations unless beta is zero.
    """
    total = 0
    if alpha != 0:
        total += 2*M*N*K
    if beta != 0:
        total += 2*M*N
    return total

In [8]:
# Convert a kernel execution time into GFLOPS (Giga floating-point operations per second).
# Only kernels whose name contains 'xgemm' (case-insensitively) are known; any other yields -1.
def GFLOPS(kernel, run_characteristics, time_ms):
    """Return the GFLOPS achieved by `kernel`, or -1.0 for an unknown kernel.

    kernel: kernel name.
    run_characteristics: mapping providing 'arg_alpha', 'arg_beta', 'arg_m',
        'arg_k' and 'arg_n' (read for xgemm kernels only).
    time_ms: execution time in milliseconds (anything np.float64 accepts).
    """
    if 'xgemm' not in kernel.lower():
        return (-1.0)

    elapsed_ms = np.float64(time_ms)
    alpha = np.float64(run_characteristics['arg_alpha'])
    beta = np.float64(run_characteristics['arg_beta'])
    M = np.int64(run_characteristics['arg_m'])
    K = np.int64(run_characteristics['arg_k'])
    N = np.int64(run_characteristics['arg_n'])

    giga_flops = 1e-9 * xgemm_flops(alpha, beta, M, K, N)
    seconds = 1e-3 * elapsed_ms
    return giga_flops / seconds

In [9]:
def args_str(kernel, run):
    """Return a printable summary of the arguments of a kernel run.

    For kernels whose name contains 'xgemm' (case-insensitively) the summary
    lists alpha, beta, M, K and N as found in `run`; for any other kernel it
    is the empty string.
    """
    if 'xgemm' not in kernel.lower():
        return ''
    values = tuple(run[key] for key in ('arg_alpha', 'arg_beta', 'arg_m', 'arg_k', 'arg_n'))
    return 'alpha=%s, beta=%s, M=%s, K=%s, N=%s' % values

Access the experimental data


In [10]:
def get_experimental_results(repo_uoa='local', tags='explore-clblast-matrix-size'):
    """Load CLBlast tuner results from CK experiment entries into one DataFrame.

    Searches `repo_uoa` for 'experiment' entries carrying `tags`, reads the
    'ckp-<point>.0001.json' file of every point of every entry found, and
    produces one row per (repetition, tuner output, tuned configuration).

    Args:
        repo_uoa: CK repository to search.
        tags: comma-separated CK tags that the experiment entries must carry.

    Returns:
        A DataFrame with the columns 'config', 'ms' and 'GFLOPS', indexed by
        ('kernel', 'strategy', 'args_id', 'config_id', 'repetition_id') and
        sorted on those index levels.

    Raises:
        RuntimeError: if a CK call reports a non-zero return code.
        ValueError: if no results were found at all.
    """
    module_uoa = 'experiment'

    def ck_access(request):
        # Raise instead of calling exit(): exit() is an interactive-shell helper
        # and is not a reliable way to abort from inside a function.
        r = ck.access(request)
        if r['return']>0:
            raise RuntimeError("Error: %s" % r['error'])
        return r

    r = ck_access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
    experiments = r['lst']

    dfs = []
    for experiment in experiments:
        print (experiment)
        data_uoa = experiment['data_uoa']
        r = ck_access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})

        for point in r['points']:
            with open(os.path.join(r['path'], 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            characteristics_list = point_data_raw['characteristics_list']
            # Obtain column data: one record per repetition, tuner output and configuration.
            data = [
                {
                    'repetition_id': repetition_id,
                    'strategy'  : tuner_output['strategy'],
                    'config_id': config_id,
                    'config' : config['parameters'],
                    'kernel' : config['kernel'],
                    'args_id' : args_str(config['kernel'], characteristics['run']),
                    'ms' : np.float64(config['time']),
                    'GFLOPS' : GFLOPS(config['kernel'], characteristics['run'], config['time'])
                }
                for (repetition_id, characteristics) in enumerate(characteristics_list)
                for tuner_output in characteristics['run']['data']
                for (config_id, config) in enumerate(tuner_output['result'])
            ]
            # A point without any tuner result has no columns to index by: skip it.
            if not data:
                continue
            # Construct a DataFrame.
            df = pd.DataFrame(data)
            # Set columns name and index.
            df.columns.name = 'characteristics'
            df = df.set_index([ 'kernel', 'strategy', 'args_id', 'config_id', 'repetition_id' ])
            # Append to the list of similarly constructed DataFrames.
            dfs.append(df)

    if not dfs:
        raise ValueError("No experimental results found in repo '%s' for tags '%s'" % (repo_uoa, tags))
    # Concatenate all constructed DataFrames (i.e. stack on top of each other).
    result = pd.concat(dfs)
    # sort_index(level=...) is the supported equivalent of the deprecated sortlevel().
    return result.sort_index(level=result.index.names)

In [11]:
# Load the results of the experiments tagged 'explore-clblast-matrix-size' and 'xgemm-fp32'.
df = get_experimental_results(tags='explore-clblast-matrix-size,xgemm-fp32')
# Let pandas display every column and every row of the loaded table.
pd.set_option('display.max_columns', len(df.columns))
pd.set_option('display.max_rows', len(df.index))


{'ignore_case': '', 'module_uoa': 'experiment', 'data_uid': 'bf7cb47ccb9aabfa', 'search_dict': {'tags': ['explore-clblast-matrix-size', 'xgemm-fp32']}, 'ignore_update': '', 'search_string': '', 'data_uoa': 'explore-matrix-size-xgemm-fp32', 'module_uid': 'bc0409fb61f0aa82', 'repo_uoa': 'local', 'path': '/home/flavio/CK_REPOS/local/experiment/explore-matrix-size-xgemm-fp32', 'repo_uid': '9a3280b14a4285c9', 'search_by_name': '', 'out': ''}

In [12]:
# 'kernel' is the first index level: read it from the index entry of the first row.
kernel0 = df.index[0][0]
kernel0


Out[12]:
u'Xgemm'

In [42]:
# Collapse the last index level ('repetition_id') of every configuration, then keep the rows of kernel0.
# NB: Unlike mean(), min() retains the 'config' column.
# NOTE(review): min() is taken per column, so 'ms' keeps the fastest repetition while
# 'GFLOPS' keeps the slowest one - confirm this is intended.
df_kernel0 = df.groupby(level=df.index.names[:-1]).min().loc[kernel0]

In [14]:
# Lowest GFLOPS over all configurations, per group formed by every index level but the last.
group_levels = df_kernel0.index.names[:-1]
df_kernel0.groupby(level=group_levels)['GFLOPS'].min()


Out[14]:
strategy    args_id                                    
exhaustive  alpha=2.00, beta=2.00, M=128, K=1024, N=128     6.868548
            alpha=2.00, beta=2.00, M=256, K=256, N=512     11.204226
            alpha=2.00, beta=2.00, M=512, K=128, N=256     10.528199
random      alpha=2.00, beta=2.00, M=128, K=1024, N=128     0.526000
            alpha=2.00, beta=2.00, M=256, K=256, N=512      1.798959
            alpha=2.00, beta=2.00, M=512, K=128, N=256      1.746183
Name: GFLOPS, dtype: float64

In [47]:
# Highest GFLOPS over all configurations, per group formed by every index level but the last.
group_levels = df_kernel0.index.names[:-1]
df_kernel0.groupby(level=group_levels)['GFLOPS'].max()


Out[47]:
strategy    args_id                                    
exhaustive  alpha=2.00, beta=2.00, M=128, K=1024, N=128    150.615247
            alpha=2.00, beta=2.00, M=256, K=256, N=512     156.676763
            alpha=2.00, beta=2.00, M=512, K=128, N=256     144.515282
random      alpha=2.00, beta=2.00, M=128, K=1024, N=128     47.372638
            alpha=2.00, beta=2.00, M=256, K=256, N=512     113.802378
            alpha=2.00, beta=2.00, M=512, K=128, N=256      96.618789
Name: GFLOPS, dtype: float64

In [16]:
# Highest GFLOPS achieved by any configuration of kernel0.
# Taken directly with max(): looking the row up via .loc[...argmax()] only works
# while Series.argmax() returns an index label, which newer pandas no longer does.
max_GFLOPS = df_kernel0['GFLOPS'].max()
max_GFLOPS


Out[16]:
156.67676279069769

In [29]:
# Configuration of the row that achieves the highest GFLOPS.
# idxmax() returns the index label expected by .loc; Series.argmax() only did so
# in older pandas and returns an integer position in newer releases.
max_GLOPS_config = df_kernel0.loc[df_kernel0['GFLOPS'].idxmax()]['config']
max_GLOPS_config


Out[29]:
{u'KWG': 32,
 u'KWI': 2,
 u'MDIMA': 8,
 u'MDIMC': 8,
 u'MWG': 32,
 u'NDIMB': 32,
 u'NDIMC': 32,
 u'NWG': 64,
 u'PRECISION': 32,
 u'SA': 1,
 u'SB': 1,
 u'STRM': 0,
 u'STRN': 0,
 u'VWM': 4,
 u'VWN': 2}

In [84]:
# For every group formed by all index levels but the last, find the index label of
# the row with the highest GFLOPS (computed once) and look up its configuration.
idx = df_kernel0.groupby(level=df_kernel0.index.names[:-1])['GFLOPS'].idxmax()
best_configs = df_kernel0.loc[idx]['config']
# Alias kept so that any cell still referring to the old name keeps working.
my = best_configs
for config in best_configs:
    print (config)


{u'MDIMC': 8, u'MDIMA': 8, u'KWG': 32, u'KWI': 2, u'NDIMC': 16, u'NDIMB': 16, u'PRECISION': 32, u'VWM': 4, u'VWN': 4, u'NWG': 64, u'MWG': 32, u'STRN': 0, u'STRM': 0, u'SB': 1, u'SA': 1}
{u'MDIMC': 8, u'MDIMA': 8, u'KWG': 32, u'KWI': 2, u'NDIMC': 32, u'NDIMB': 32, u'PRECISION': 32, u'VWM': 4, u'VWN': 2, u'NWG': 64, u'MWG': 32, u'STRN': 0, u'STRM': 0, u'SB': 1, u'SA': 1}
{u'MDIMC': 8, u'MDIMA': 8, u'KWG': 32, u'KWI': 2, u'NDIMC': 32, u'NDIMB': 32, u'PRECISION': 32, u'VWM': 4, u'VWN': 2, u'NWG': 64, u'MWG': 32, u'STRN': 0, u'STRM': 0, u'SB': 1, u'SA': 1}
{u'MDIMC': 8, u'MDIMA': 16, u'KWG': 16, u'KWI': 2, u'NDIMC': 8, u'NDIMB': 8, u'PRECISION': 32, u'VWM': 2, u'VWN': 4, u'NWG': 32, u'MWG': 32, u'STRN': 1, u'STRM': 1, u'SB': 0, u'SA': 0}
{u'MDIMC': 16, u'MDIMA': 16, u'KWG': 16, u'KWI': 2, u'NDIMC': 16, u'NDIMB': 32, u'PRECISION': 32, u'VWM': 2, u'VWN': 1, u'NWG': 128, u'MWG': 32, u'STRN': 1, u'STRM': 1, u'SB': 1, u'SA': 1}
{u'MDIMC': 8, u'MDIMA': 16, u'KWG': 16, u'KWI': 2, u'NDIMC': 8, u'NDIMB': 8, u'PRECISION': 32, u'VWM': 4, u'VWN': 1, u'NWG': 32, u'MWG': 64, u'STRN': 0, u'STRM': 1, u'SB': 0, u'SA': 0}

Plot a violin graph


In [18]:
# Violin plot of the GFLOPS distribution per argument set, split by tuning strategy.
plt.figure(figsize=(12, 10))
sns.set_style('whitegrid')
sns.set_palette('Set1')
violin_options = dict(x='GFLOPS', y='args_id', hue='strategy',
                      hue_order=['random', 'exhaustive'], split=True)
ax = sns.violinplot(data=df_kernel0.reset_index(), **violin_options)
# NOTE(review): this places one tick per GFLOPS unit up to max_GFLOPS - consider a coarser step.
ax.set_xticks(range(int(max_GFLOPS)))
ax.set_xlim([0, max_GFLOPS])
# Draw a dotted purple line from top to bottom at the default value (TODO).
(y_low, y_high) = ax.get_ylim()
ax.vlines(x=124, ymin=y_low, ymax=y_high, linestyles='dotted', colors='purple')


Out[18]:
<matplotlib.collections.LineCollection at 0x7fe374d43350>

In [ ]: