We compare the performance (execution time) of different CLBlast GEMM implementations (e.g. the original CLBlast kernels vs. the Lift overlay) with row-major and column-major data layouts.
In [ ]:
import os
import sys
import json
In [ ]:
import IPython as ip
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
In [ ]:
print('IPython version: %s' % ip.__version__)
print('NumPy version: %s' % np.__version__)
print('SciPy version: %s' % sp.__version__)
print('Pandas version: %s' % pd.__version__)
print('Matplotlib version: %s' % mp.__version__)
In [ ]:
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
# import scipy.stats as st
In [ ]:
import ck.kernel as ck
print('CK version: %s' % ck.__version__)
In [ ]:
def get_experimental_results(tags):
    repo_uoa = 'local'
    module_uoa = 'experiment'
    r = ck.access({'action':'search', 'module_uoa':module_uoa, 'tags':tags})
    if r['return']>0:
        print('Error: %s' % r['error'])
        exit(1)
    experiments = r['lst']
    dfs = []
    for experiment in experiments:
        data_uoa = experiment['data_uoa']
        r = ck.access({'action':'list_points', 'repo_uoa':repo_uoa,
                       'module_uoa':module_uoa, 'data_uoa':data_uoa})
        if r['return']>0:
            print('Error: %s' % r['error'])
            exit(1)
        path = r['path']
        points = r['points']
        for point in points:
            with open(os.path.join(path, 'ckp-%s.0001.json' % point)) as point_file:
                point_data_raw = json.load(point_file)
            # DataFrame columns: one row per successful repetition.
            characteristics = [
                {
                    'time (ms)' : np.float32(c['run'].get('ms_1', [0])[0]),
                    'success?'  : c['run'].get('run_success', 'n/a'),
                    'dvdt_prof' : c['run'].get('dvdt_prof', [])
                }
                for c in point_data_raw['characteristics_list']
                if c['run'].get('run_success', '') != ''
            ]
            df = pd.DataFrame(characteristics)
            df.columns.name = 'run characteristic'
            df.index.name = 'repetition'
            # DataFrame indices: program name and matrix layout
            # (CBLAS convention: 101 is row-major, 102 is column-major).
            df['program'] = point_data_raw['choices']['data_uoa']
            layout_info = point_data_raw['choices']['env']['CLBLAST_LAYOUT']
            if layout_info == 101:
                df['layout'] = 'row-major'
            elif layout_info == 102:
                df['layout'] = 'column-major'
            else:
                df['layout'] = 'unknown'
            df = df.set_index(['program', 'layout'], append=True)
            df = df.reorder_levels(('program', 'layout', 'repetition'))
            dfs.append(df)
    results = pd.concat(dfs)
    return results
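For reference, each point file ('ckp-<point>.0001.json') has roughly the structure sketched below. This is an illustrative reconstruction from the fields accessed above; all values are placeholders, not a real experiment dump.
In [ ]:
# Illustrative sketch only: placeholder values showing the fields that
# get_experimental_results() reads from each point file.
example_point = {
    'choices': {
        'data_uoa': 'some-xgemm-client',    # program name (placeholder)
        'env': { 'CLBLAST_LAYOUT': 101 }    # 101: row-major, 102: column-major
    },
    'characteristics_list': [               # one entry per repetition
        { 'run': { 'ms_1': [0.0], 'run_success': 'yes', 'dvdt_prof': [] } }
    ]
}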
In [ ]:
results = get_experimental_results('gemmbench,xgemm,clblast')
In [ ]:
results[['time (ms)']]
In [ ]:
def plot(mean, std):
    mean \
        .plot(yerr=std, title='Execution time (ms)', kind='bar', colormap=cm.autumn,
              figsize=[16, 8], rot=0, grid=True, legend=True) \
        .legend(loc='upper left')
In [ ]:
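# Mean and standard deviation of execution time over repetitions;
# unstack the layout level to compare layouts side by side for each program.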
mean = results['time (ms)'].groupby(level=['program', 'layout']).mean().unstack('layout')
std = results['time (ms)'].groupby(level=['program', 'layout']).std().unstack('layout')
plot(mean, std)
In [ ]:
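# Same statistics, but unstack the program level to compare programs side by side for each layout.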
mean = results['time (ms)'].groupby(level=['program', 'layout']).mean().unstack('program')
std = results['time (ms)'].groupby(level=['program', 'layout']).std().unstack('program')
plot(mean, std)
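To quantify the gap between the two layouts, one can also look at the ratio of mean execution times per program (a small sketch on top of the results DataFrame built above):
In [ ]:
# Ratio of mean execution times (row-major over column-major) per program;
# values above 1 indicate that the row-major layout is slower for that program.
layout_means = results['time (ms)'].groupby(level=['program', 'layout']).mean().unstack('layout')
layout_means['row-major'] / layout_means['column-major']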
In [ ]:
# Pick the first repetition of the first experiment for now.
trace = results['dvdt_prof'].iloc[0]
if not trace:
    raise Exception("No OpenCL profiling information!")
# What's that experiment, by the way?
results['dvdt_prof'].index[0]
In [ ]:
r = ck.access({'action':'show', 'module_uoa':'env', 'tags':'tool,opencl,dvdt,prof'})
if r['return']>0:
    print('Error: %s' % r['error'])
    exit(1)
# Get the path to the first returned environment entry.
dvdt_prof_dir = r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']
dvdt_prof_src_python = os.path.join(dvdt_prof_dir, 'src', 'python')
sys.path.append(dvdt_prof_src_python)
import prof_wrangler as pw
pw.test()
In [ ]:
trace = pw.index_calls(trace)
unit = 'ms'
In [ ]:
# Partial trace only containing kernel enqueues.
kernel_enqueues = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])
# Kernel enqueues as a DataFrame.
df_kernel_enqueues = pw.df_kernel_enqueues(kernel_enqueues, unit)
df_kernel_enqueues
In [ ]:
# df_kernel_enqueues.info(memory_usage=True)
In [ ]:
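# Each repetition of the GEMM call enqueues 4 kernels (transpose A, transpose B,
# Xgemm, transpose C, per the renaming below), so tag each enqueue with its
# position within its repetition.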
num_enqueues_total = len(kernel_enqueues)
num_enqueues_per_repetition = 4
df_kernel_enqueues['kernel_index'] = (pd.Series(range(num_enqueues_total)) % num_enqueues_per_repetition).values
df_kernel_enqueues = df_kernel_enqueues \
    .set_index('kernel_index', append=True) \
    .reorder_levels(['call_index', 'kernel_index', 'name'])
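As a quick illustration of the modulo indexing (using a hypothetical count of 2 repetitions):
In [ ]:
# Illustration only: 8 sequential enqueues from 2 repetitions map back to kernel indices 0..3.
(pd.Series(range(2 * num_enqueues_per_repetition)) % num_enqueues_per_repetition).values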
In [ ]:
df_kernel_enqueues_stats = df_kernel_enqueues.groupby(level='kernel_index').describe()
df_kernel_enqueues_stats
In [ ]:
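# p0..p3 are presumably the four OpenCL event profiling timestamps
# (queued, submitted, started, ended), so 'p3 - p2' is the kernel execution time
# and the other two columns capture queueing and submission overheads.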
kernel_time = \
    df_kernel_enqueues_stats[['p3 - p2 (%s)' % unit, 'p2 - p1 (%s)' % unit, 'p1 - p0 (%s)' % unit]] \
    .unstack('kernel_index') \
    .rename(columns={0:'transpose A', 1:'transpose B', 2:'Xgemm', 3:'transpose C'}) \
    .stack()
kernel_time
In [ ]:
mean = kernel_time.loc['mean']
std = kernel_time.loc['std']
mean.plot(yerr=std, title='Execution time (%s)' % unit,
          kind='bar', stacked=True,
          figsize=[16, 8], rot=45,
          grid=True, legend=True,
          colormap=cm.autumn) \
    .set_xlabel('Kernel')