[PUBLIC] CLBlast vs ARM Compute Library on representative matrix sizes


  1. Data [for developers]
  2. Code [for developers]
  3. Table
  4. Plot

Data wrangling code

NB: Please ignore this section if you are not interested in re-running or modifying this notebook.



In [205]:
import os
import sys
import json
import re


In [206]:
import IPython as ip
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mp

In [207]:
In [208]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

In [209]:
from IPython.display import Image, display
def display_in_full(df):
    """Show ``df`` without pandas truncating any rows or columns.

    The global pandas limits ``display.max_columns`` and ``display.max_rows``
    are raised to the frame's own dimensions and the frame is then rendered.
    The limits are NOT restored afterwards, so later output in the notebook
    is affected as well.

    :param df: pandas DataFrame to render.
    """
    pd.options.display.max_columns = len(df.columns)
    pd.options.display.max_rows = len(df.index)
    # Actually render the frame; adjusting the limits alone shows nothing.
    display(df)

Collective Knowledge

In [210]:
import ck.kernel as ck
# Report which Collective Knowledge release the kernel has loaded.
print ('CK version: %s' % (ck.__version__,))

Define helper functions

In [211]:
# client: 'acl-sgemm-opencl-example' or 'clblast-tune'
def get_mnk(characteristics, client):
    """Format the matrix dimensions of one repetition as the string '(m, n, k)'.

    :param characteristics: dict for a single repetition; the dimensions are
        read from ``characteristics['run']`` under the keys 'm', 'n' and 'k'.
    :param client: name of the benchmark client that produced the data; it
        selects how each dimension is unpacked.
    :returns: the three dimensions rendered as '(m, n, k)'.
    """
    # dim: 'm', 'n', 'k'
    def get_dim_int(characteristics, client, dim):
        if client == 'clblast-tune':
            # For this client the value is indexed with [0], i.e. it is stored in a sequence.
            dim_str = characteristics['run'][dim][0]
        else:
            # Every other client stores the value directly.
            dim_str = characteristics['run'][dim]
        return np.int64(dim_str)

    m = get_dim_int(characteristics, client, 'm')
    n = get_dim_int(characteristics, client, 'n')
    k = get_dim_int(characteristics, client, 'k')

    return ('(%d, %d, %d)' % (m, n, k))

In [212]:
def get_GFLOPS(characteristics, client):
    """Return the 'GFLOPS_1' figure of one repetition as a Python float.

    :param characteristics: dict for a single repetition; the value is read
        from ``characteristics['run']['GFLOPS_1']``.
    :param client: name of the benchmark client that produced the data.
    :returns: the throughput as a float.
    """
    reported = characteristics['run']['GFLOPS_1']
    if client == 'acl-sgemm-opencl-example':
        # This client stores the value directly.
        GFLOPS_str = reported
    else:
        # Other clients store it in a sequence; the first element is used.
        GFLOPS_str = reported[0]
    # The builtin float is used because the np.float alias no longer exists in current NumPy.
    return float(GFLOPS_str)

In [213]:
def get_TimeMS(characteristics,client):
    """Return the 'ms_1' entry of one repetition.

    :param characteristics: dict for a single repetition; the value is read
        from ``characteristics['run']``.
    :param client: accepted for symmetry with the other getters; not used.
    :returns: the stored 'ms_1' value (presumably milliseconds, going by the
        key name), or None when the key is absent.
    """
    return characteristics['run'].get('ms_1')

Plot experimental data

In [214]:
# Look-and-feel settings shared by the figures in this notebook.
default_colormap = cm.autumn
default_figsize = [20, 12]
default_dpi = 200
default_fontsize = 20
default_legend_fontsize = 'medium'

# Only a matplotlib whose version string starts with '2' gets the 'classic' style sheet.
if mp.__version__[0] == '2':
    mp.style.use('classic')

# Make the defaults above the global matplotlib settings.
mp.rcParams.update({
    'figure.figsize': default_figsize,
    'figure.dpi': default_dpi,
    'font.size': default_fontsize,
    'legend.fontsize': default_legend_fontsize,
})

In [215]:
def plot(df_mean, df_std, rot=90, patch_fontsize=default_fontsize):
    """Draw a bar chart of ``df_mean`` with ``df_std`` as error bars.

    The y axis is fixed to the range 0..20 and every bar is labelled with its
    height, formatted to one decimal place.

    :param df_mean: DataFrame whose values become the bar heights.
    :param df_std: DataFrame passed as ``yerr`` for the error bars.
    :param rot: rotation of the x tick labels, in degrees.
    :param patch_fontsize: font size of the per-bar value labels.
    """
    bar_options = dict(
        kind='bar', ylim=[0, 20], rot=rot, width=0.9, grid=True, legend=True,
        figsize=default_figsize, colormap=default_colormap, fontsize=default_fontsize)
    axes = df_mean.plot(yerr=df_std, **bar_options)
    axes.set_title('ARM Compute Library vs CLBlast (dv/dt)', fontsize=default_fontsize)
    axes.set_ylabel('SGEMM GFLOPS', fontsize=default_fontsize)
    axes.legend(loc='upper right')
    # Write each bar's height at its left edge, slightly above its top.
    for bar in axes.patches:
        height = bar.get_height()
        label = '{0:2.1f}'.format(height)
        axes.annotate(label, (bar.get_x()*1.00, height*1.01), fontsize=patch_fontsize)

Access experimental data

In [233]:
def get_experimental_results(repo_uoa='explore-matrix-size-gemm-libs-dvdt-prof-firefly-rk3399', tags='explore-matrix-size-libs-sgemm, acl-sgemm-opencl-example'):
    """Load every experiment point matching ``tags`` into one DataFrame.

    For each matching CK experiment, every point file
    ``ckp-<point>.0001.json`` is read and one row is built per entry of its
    'characteristics_list'.

    :param repo_uoa: CK repository to search.
    :param tags: comma-separated CK tags selecting the experiments.
    :returns: DataFrame indexed by ('(m, n, k)', 'repetition_id', 'GFLOPS',
        'time (ms)') with two column levels, client first and then
        characteristic; rows are sorted by all index levels.
    :raises RuntimeError: when a CK call reports a non-zero return code.
    :raises ValueError: when no point with at least one repetition is found.
    """
    module_uoa = 'experiment'
    r = ck.access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
    if r['return']>0:
        # Stop here: a failed call carries no 'lst' entry to continue with.
        raise RuntimeError("Error: %s" % r['error'])
    experiments = r['lst']
    dfs = []
    for experiment in experiments:
        data_uoa = experiment['data_uoa']
        r = ck.access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})
        if r['return']>0:
            raise RuntimeError("Error: %s" % r['error'])
        # The client name is whatever follows the fixed-length experiment prefix.
        client = data_uoa[len('explore-matrix-size-gemm-libs-'):]
        for point in r['points']:
            with open(os.path.join(r['path'], 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            characteristics_list = point_data_raw['characteristics_list']
            # Obtain column data: one record per repetition.
            data = [
                {
                    'client': client,
                    '(m, n, k)': get_mnk(characteristics, client),
                    'GFLOPS': get_GFLOPS(characteristics, client),
                    'dvdt_prof_info': characteristics['run'].get('dvdt_prof',[]),
                    'time (ms)' : get_TimeMS(characteristics,client),
                    'repetition_id': repetition_id
                }
                for (repetition_id, characteristics) in enumerate(characteristics_list)
            ]
            if not data:
                # A point without repetitions has no columns to index by.
                continue
            # Construct a DataFrame.
            df = pd.DataFrame(data)
            # Set columns and index names.
            df.columns.name = 'characteristics'
            df.index.name = 'index'
            df = df.set_index(['client', '(m, n, k)', 'repetition_id','GFLOPS','time (ms)'])
            # Append to the list of similarly constructed DataFrames.
            dfs.append(df)
    if not dfs:
        raise ValueError("No experiment points found in repo '%s' for tags '%s'" % (repo_uoa, tags))
    # Concatenate all constructed DataFrames (i.e. stack on top of each other).
    result = pd.concat(dfs).unstack('client').swaplevel(axis=1)
    return result.sort_index(level=result.index.names)


In [234]:
# Load all experimental results into a single DataFrame.
# NOTE(review): repo_uoa is not assigned in any cell visible here, so this raises
# NameError on a fresh kernel unless it is defined elsewhere — confirm, or call
# get_experimental_results() without arguments to use its default repository.
df = get_experimental_results(repo_uoa=repo_uoa)


client acl-sgemm-opencl-example
characteristics dvdt_prof_info
(m, n, k) repetition_id GFLOPS time (ms)
(128, 800, 864) 0 13.367015 13.237600 [{u'timestamp': {u'start': u'2017-06-15T16:44:...
1 12.887821 13.729800 [{u'timestamp': {u'start': u'2017-06-15T16:44:...
2 14.002311 12.637000 [{u'timestamp': {u'start': u'2017-06-15T16:44:...
(192, 3136, 576) 0 21.126351 32.832600 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
1 20.285938 34.192800 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
2 20.581972 33.701000 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
(192, 800, 1152) 0 16.676613 21.221000 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
1 16.643359 21.263400 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
2 16.977587 20.844800 [{u'timestamp': {u'start': u'2017-06-15T16:42:...
(320, 224, 1440) 0 16.672999 12.381600 [{u'timestamp': {u'start': u'2017-06-15T16:43:...
1 16.162851 12.772400 [{u'timestamp': {u'start': u'2017-06-15T16:43:...
2 14.629815 14.110800 [{u'timestamp': {u'start': u'2017-06-15T16:43:...
(64, 12544, 160) 0 9.601052 26.757600 [{u'timestamp': {u'start': u'2017-06-15T16:43:...
1 9.495865 27.054000 [{u'timestamp': {u'start': u'2017-06-15T16:43:...
2 9.535340 26.942000 [{u'timestamp': {u'start': u'2017-06-15T16:43:...

In [232]:
# Keep, for every matrix size, the single repetition with the shortest run time.
# 'time (ms)' is an index level of df rather than a column, so it is first lifted
# into a Series that shares df's index; rows are then selected by label with .loc.
time_ms_by_row = pd.Series(df.index.get_level_values('time (ms)').values, index=df.index, dtype=float)
fastest_rows = time_ms_by_row.groupby(level='(m, n, k)').idxmin()
df_min = df \
    .loc[fastest_rows.tolist()] \
    .reset_index('repetition_id', drop=True)

In [ ]:
batch_size = 1
# Select the profiler traces for the chosen batch size, leaving the 'model' and
# 'lib' levels as the index.
# NOTE(review): the index levels 'platform', 'batch_size', 'model' and 'lib' are not
# produced by get_experimental_results in this notebook — confirm this cell applies here.
# NOTE(review): the chain originally ended in a trailing line continuation, so a further
# step may have been lost — confirm.
df_model_lib = df_min[['dvdt_prof_info']] \
    .reset_index('platform', drop=True) \
    .reorder_levels([ 'batch_size', 'model', 'lib']) \
    .loc[batch_size]

In [ ]:
# The first two index levels of df_model_lib hold the model names and the library names.
models, libs = (df_model_lib.index.levels[position] for position in (0, 1))

In [ ]:
def concat(model, lib):
    """Join two labels into the single key 'model:lib'."""
    pair = (model, lib)
    return '%s:%s' % pair

In [ ]:
def analyse_model_lib(df_model_lib, model, lib, min_pc=1.0, unit='ms'):
    """Summarise the kernels that dominate execution time for one model/library pair.

    :param df_model_lib: DataFrame indexed by model and library that holds the
        profiler trace in its 'dvdt_prof_info' column.
    :param model: model label to select.
    :param lib: library label to select.
    :param min_pc: only kernels whose '** Execution time (%) **' exceeds this
        value are kept.
    :param unit: time unit handed to both profiler-wrangler calls, so that they
        always agree.
    :returns: the filtered cumulative time/number table; its column index is
        named '<model>:<lib>'.
    """
    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])
    # All kernel enqueues.
    df_kernel_enqueues = pw.df_kernel_enqueues(pw.filter_calls(trace, ['clEnqueueNDRangeKernel']), unit=unit)
    # Kernel enqueues that take at least 'min_pc' % of the execution time.
    df_kernel_enqueues_cum_time_num = pw.df_kernel_enqueues_cumulative_time_num(df_kernel_enqueues, unit)
    df_kernel_enqueues_cum_time_num.columns.name = concat(model, lib)
    return df_kernel_enqueues_cum_time_num[df_kernel_enqueues_cum_time_num['** Execution time (%) **'] > min_pc]

In [ ]:
def analyse_xgemm_kernel(df_model_lib, model, lib, kernel):
    """Build a per-enqueue table for one kernel of a model/library pair.

    Each row is one enqueue of ``kernel`` and carries its execution time, the
    (M, N, K) values taken from the kernel arguments with index 0, 1 and 2, the
    nearest bucket returned by ``get_nearest_bucket``, and the derived Gflops
    and GFLOPS figures.

    :param df_model_lib: DataFrame indexed by model and library that holds the
        profiler trace in its 'dvdt_prof_info' column.
    :param model: model label to select.
    :param lib: library label to select.
    :param kernel: name of the kernel to analyse.
    :returns: DataFrame with one row per enqueue of ``kernel``; its column
        index is named after the kernel.
    """
    # Get trace for lib and model.
    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])
    # All calls to set kernel args.
    set_args = pw.filter_calls(trace, ['clSetKernelArg'])
    # All kernel enqueues.
    nqs = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
    # Construct a DataFrame with info about kernel enqueues.
    df = pw.df_kernel_enqueues(nqs, unit='ms').swaplevel().loc[kernel]
    # Work on an independent copy so the column assignments below do not target a slice.
    df = df[['p3 - p2 (ms)', 'gws2']].copy()
    # As gws2 is always 1, we can use it to count the number of enqueues.
    df.columns = [ '** Execution time (ms) **', '** Number of enqueues **' ]
    df.columns.name = kernel
    # Augment the DataFrame with columns for the (M, N, K) triples.
    df['kSizeM'] = 'M'; df['bSizeM'] = 'MM'
    df['kSizeN'] = 'N'; df['bSizeN'] = 'NN'
    df['kSizeK'] = 'K'; df['bSizeK'] = 'KK'
    # Initialise buckets.
    buckets = init_buckets()
    # Augment the DataFrame with the actual (M, N, K) triples.
    mnk_triples = []; mmnnkk_triples = []
    for nq in nqs:
        if nq['name'] == kernel:
            (M, N, K) = ('M', 'N', 'K'); (MM, NN, KK) = ('MM', 'NN', 'KK')
            # Replay the argument settings for this kernel up to the enqueue; a later
            # setting overwrites an earlier one (assumes set_args is ordered by
            # call_index — TODO confirm).
            for set_arg in set_args:
                if (set_arg['call_index'] > nq['call_index']): break
                if (set_arg['kernel'] != nq['kernel']): continue
                arg_value = pc.hex_str_as_int(set_arg['arg_value'])
                if (set_arg['arg_index'] == 0): M = arg_value; MM = arg_value
                if (set_arg['arg_index'] == 1): N = arg_value; NN = arg_value
                if (set_arg['arg_index'] == 2): K = arg_value; KK = arg_value
            mnk_triples.append((M, N, K))
            mmnnkk_triples.append(get_nearest_bucket(buckets, (M, N, K)))
    df[['kSizeM', 'kSizeN', 'kSizeK']] = mnk_triples
    df[['bSizeM', 'bSizeN', 'bSizeK']] = mmnnkk_triples
    # Calculate Gflops and GFLOPS (Gflops/s).
    df['** Gflops **'] = 2*df['kSizeM']*df['kSizeN']*df['kSizeK']*1e-9
    df['** GFLOPS **'] = df['** Gflops **'] / (df['** Execution time (ms) **']*1e-3)
    return df

In [ ]:
# Analyse every xgemm kernel of every model/library pair and keep the resulting
# tables in a dict keyed by '<model>:<lib>:<kernel>'.
# NOTE(review): model_lib_analysis is not assigned in any cell visible here —
# presumably a dict of analyse_model_lib results keyed by concat(model, lib); confirm.
model_lib_kernel_analysis = {}
for model in models:
    for lib in libs:
        title = concat(model, lib)
        print('== %s ==' % title)
        try:
            analysis = model_lib_analysis[title]
        except KeyError:
            # No analysis was recorded for this pair.
            print(' ... missing ...'); print(''); continue
        for kernel in analysis.index:
            # Only kernels whose name contains 'xgemm' (case-insensitive) are analysed.
            if kernel.lower().find('xgemm') == -1: continue
            analysis_xgemm = analyse_xgemm_kernel(df_model_lib, model, lib, kernel)
            # Show the per-enqueue table in full.
            pd.options.display.max_columns = analysis_xgemm.columns.size
            pd.options.display.max_rows = analysis_xgemm.index.size
            display(analysis_xgemm)
            # Show its summary statistics in full.
            analysis_xgemm_stats = analysis_xgemm.describe()
            pd.options.display.max_columns = analysis_xgemm_stats.columns.size
            pd.options.display.max_rows = analysis_xgemm_stats.index.size
            display(analysis_xgemm_stats)
            model_lib_kernel_analysis[concat(title, kernel)] = analysis_xgemm


In [ ]:
# Reload the experimental results for the summary plot; this rebinds df.
# NOTE(review): repo_uoa is not assigned in any cell visible here, so this raises
# NameError on a fresh kernel unless it is defined elsewhere — confirm, or call
# get_experimental_results() without arguments to use its default repository.
df = get_experimental_results(repo_uoa=repo_uoa)


In [ ]:
# Mean and standard deviation of GFLOPS per matrix size, one column per client.
# GFLOPS is an index level of df (its only data column is the profiler trace), so
# the throughput is first lifted out of the index into a Series sharing df's index.
gflops_by_row = pd.Series(df.index.get_level_values('GFLOPS').values, index=df.index, dtype=float)
# A row counts towards a client only where that client's trace cell is populated.
row_has_client = df.xs('dvdt_prof_info', axis=1, level='characteristics').notnull()
gflops_by_client = row_has_client.apply(lambda present: gflops_by_row.where(present))
gflops_by_size = gflops_by_client.groupby(level='(m, n, k)')
df_mean = gflops_by_size.mean()
df_std = gflops_by_size.std()
plot(df_mean, df_std)