Execution time

We compare the performance (execution time) of different CLBlast GEMM implementations (e.g. the original CLBlast kernels vs the Lift overlay) under the row-major and column-major data layouts.
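
The layout is selected via the CLBLAST_LAYOUT environment variable recorded with each experiment. As a reference for decoding it below, here is a minimal sketch of the layout codes, which follow the CBLAS convention that CLBlast adopts (the dictionary name is ours):


In [ ]:
# Layout codes following the CBLAS convention (as read from CLBLAST_LAYOUT below).
CLBLAST_LAYOUTS = {
    101: 'row-major',     # cf. CLBlastLayoutRowMajor in the CLBlast C API
    102: 'column-major',  # cf. CLBlastLayoutColMajor
}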

Includes

Standard


In [ ]:
import os
import sys
import json

Scientific


In [ ]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp

In [ ]:
print('IPython version: %s' % ip.__version__)
print('NumPy version: %s' % np.__version__)
print('SciPy version: %s' % sp.__version__)
print('Pandas version: %s' % pd.__version__)
print('Matplotlib version: %s' % mp.__version__)

In [ ]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
# import scipy.stats as st

Collective Knowledge


In [ ]:
import ck.kernel as ck
print('CK version: %s' % ck.__version__)

Access experimental results


In [ ]:
def get_experimental_results(tags):
    repo_uoa = 'local'
    module_uoa = 'experiment'
    r = ck.access({'action':'search', 'module_uoa':module_uoa, 'tags':tags})
    if r['return']>0:
        print('Error: %s' % r['error'])
        exit(1)
    experiments = r['lst']
    
    dfs = []
    for experiment in experiments:
        data_uoa = experiment['data_uoa']
        r = ck.access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})
        if r['return']>0:
            print('Error: %s' % r['error'])
            exit(1)
        path = r['path']
        points = r['points']
        for point in points:
            with open(os.path.join(path, 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            # DataFrame columns.
            characteristics = [
                {
                    'time (ms)'   : np.float32(characteristics['run'].get('ms_1',[0])[0]),
                    'success?'    : characteristics['run'].get('run_success','n/a'),
                    'dvdt_prof'   : characteristics['run'].get('dvdt_prof',[])
#                    'layout'      : characteristics['run'].get('layout','n/a')
                }
                for characteristics in point_data_raw['characteristics_list'] 
                if characteristics['run'].get('run_success','')!=''
            ]
            df = pd.DataFrame(characteristics)
            df.columns.name = 'run characteristic'
            df.index.name = 'repetition'
            # DataFrame indices.
            df['program'] = point_data_raw['choices']['data_uoa']
            # CLBLAST_LAYOUT follows the CBLAS convention: 101 = row-major, 102 = column-major.
            layout_info = point_data_raw['choices']['env']['CLBLAST_LAYOUT']
            if layout_info == 101: df['layout'] = 'row-major'
            elif layout_info == 102: df['layout'] = 'column-major'
            else: df['layout'] = 'unknown'
            df = df.set_index(['program','layout'], append=True)
            df = df.reorder_levels(('program', 'layout', 'repetition'))
            dfs.append(df)
    results = pd.concat(dfs)
    return results

In [ ]:
results = get_experimental_results('gemmbench,xgemm,clblast')
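
Before analysing the timings, a quick sanity check: count how many repetitions carry each value of the run success flag (taken from CK's run_success field).


In [ ]:
# Distribution of the success flag across all repetitions.
results['success?'].value_counts()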

Show execution time


In [ ]:
results[['time (ms)']]

Plot execution time


In [ ]:
def plot(mean, std):
    mean \
        .plot(yerr=std, title='Execution time (ms)', kind='bar', colormap=cm.autumn,
            figsize=[16, 8], rot=0, grid=True, legend=True) \
        .legend(loc='upper left')

Compare row-major layout vs column-major layout for each program


In [ ]:
mean = results['time (ms)'].groupby(level=['program', 'layout']).mean().unstack('layout')
std = results['time (ms)'].groupby(level=['program', 'layout']).std().unstack('layout')
plot(mean, std)
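
The same comparison can be summarised numerically. A small follow-up sketch, assuming both layouts were measured for every program: the ratio of mean execution times, where a value above 1 means the column-major layout is slower.


In [ ]:
# Mean-time ratio per program (> 1: column-major slower than row-major).
mean['column-major'] / mean['row-major']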

Compare CLBlast vs Lift overlay for each layout


In [ ]:
mean = results['time (ms)'].groupby(level=['program', 'layout']).mean().unstack('program')
std = results['time (ms)'].groupby(level=['program', 'layout']).std().unstack('program')
plot(mean, std)

Show profiling info


In [ ]:
# Pick the first repetition of the first experiment for now.
trace = results['dvdt_prof'].iloc[0]
if not trace:
    raise Exception("No OpenCL profiling information!")
# What's that experiment, by the way?
results['dvdt_prof'].index[0]

In [ ]:
r = ck.access({'action':'show', 'module_uoa':'env', 'tags':'tool,opencl,dvdt,prof'})
if r['return']>0:
    print('Error: %s' % r['error'])
    exit(1)
# Get the dvdt-prof path from the first returned environment entry.
dvdt_prof_dir = r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']
dvdt_prof_src_python=os.path.join(dvdt_prof_dir,'src','python')
sys.path.append(dvdt_prof_src_python)
import prof_wrangler as pw
pw.test()

In [ ]:
trace = pw.index_calls(trace)
unit = 'ms'

Kernel enqueues


In [ ]:
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
df_kernel_enqueues

In [ ]:
# df_kernel_enqueues.info(memory_usage=True)

In [ ]:
num_enqueues_total = len(kernel_enqueues)
# Each GEMM call enqueues 4 kernels: transpose A, transpose B, Xgemm, transpose C
# (cf. the renaming below), so kernel_index identifies a kernel within one repetition.
num_enqueues_per_repetition = 4
df_kernel_enqueues['kernel_index'] = np.arange(num_enqueues_total) % num_enqueues_per_repetition
df_kernel_enqueues = df_kernel_enqueues \
    .set_index('kernel_index', append=True) \
    .reorder_levels(['call_index','kernel_index','name'])
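
A quick cross-check of the assumption above, namely that the four kernels are enqueued in the same order in every repetition: each kernel_index should map to exactly one kernel name.


In [ ]:
# Each kernel_index should correspond to a single kernel name (nunique == 1).
df_kernel_enqueues.reset_index('name').groupby(level='kernel_index')['name'].nunique()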

In [ ]:
df_kernel_enqueues_stats = df_kernel_enqueues.groupby(level='kernel_index').describe()
df_kernel_enqueues_stats

In [ ]:
kernel_time = \
    df_kernel_enqueues_stats[['p3 - p2 (%s)' % unit,'p2 - p1 (%s)' % unit, 'p1 - p0 (%s)' % unit]] \
    .unstack('kernel_index') \
    .rename(columns={0:'transpose A', 1:'transpose B', 2:'Xgemm', 3:'transpose C'}) \
    .stack()
kernel_time

In [ ]:
mean = kernel_time.loc['mean']
std = kernel_time.loc['std']
mean.plot(yerr=std, title='Execution time (%s)' % unit,
          kind='bar', stacked=True,
          figsize=[16, 8], rot=45,
          grid=True, legend=True,
          colormap=cm.autumn) \
    .set_xlabel('Kernel')
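
As a rough follow-up, assuming the three profiling intervals partition the whole span from p0 to p3, their sum telescopes to the total queued-to-complete time per kernel, so we can rank the kernels by total mean time.


In [ ]:
# (p1 - p0) + (p2 - p1) + (p3 - p2) telescopes to p3 - p0 per kernel.
total_mean_time = kernel_time.loc['mean'].sum(axis=1)
total_mean_time.sort_values(ascending=False)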