In [1]:
import lancet, re, pandas
import numpy as np
from itertools import product
import holoviews
from holoviews import *
import seaborn as sns
%load_ext holoviews.ipython

%output holomap='widgets' fig='svg' size=200

In [ ]:
benchmark_name   = 'gpu_benchmark'
cortex_density = lancet.List('cortex_density', [48, 142, 162])
iterations = lancet.List('iterations', [1000])
resource=lancet.List('gpu',[True])  
executable = '/afs/inf.ed.ac.uk/user/s11/s1137931/honours/topographica/timing_script.sh'
timing_cmd     = lancet.ShellCommand(executable=executable, posargs=['cortex_density', 'iterations', 'gpu'])
p_space = cortex_density * iterations* resource
# Runs locally. A Launcher could be used to launch jobs with Grid Engine.
for i in range(10):
    lancet.Launcher(benchmark_name, p_space, timing_cmd, output_directory='benchmarks', max_concurrency=1)()
    print "Run %d finished..." % i

In [2]:
output_files   = lancet.FilePattern('timing_files', './benchmarks/*-gpu_benchmark*/streams/*_tid_{tid:d}.o*')

In [3]:
def parse_timings(filename):
    with open(filename, 'r') as infile:
        content = infile.read()
    
    host_m = re.search('^HOST:\t(?P<host>.*?)\n', content, re.MULTILINE)
    param_m = re.search('^PARAMS:\t(?P<cortex_density>.*?)\t(?P<iterations>.*?)\t(?P<gpu>.*?)\n', content, re.MULTILINE)
    version_m = re.search('^VERSION:\t(?P<version>.*?)\n', content, re.MULTILINE)
    timing_m = re.findall('^[0-9]*.[0-9]*u [0-9]*.[0-9]*s ([0-9]*:[0-9]*.[0-9]*) ([0-9]*.[0-9])%', content, re.MULTILINE)
    if host_m is None or param_m is None or len(timing_m) < 3:
        raise Exception("The benchmark file %s has incorrect format" % filename)
    timing_m = [(float(t.split(':')[0]) * 60 + float(t.split(':')[1]) + float('0.' + t.split('.')[1]), float(u)) for t, u in timing_m]
    
    p = param_m.groupdict()
    p['gpu'] = True if p['gpu'] == 'True' else False
    p['iterations'] = int(p['iterations'])
    p['cortex_density'] = float(p['cortex_density'])
    result = {'topo_startup_time': timing_m[0][0], 'topo_startup_cpu': timing_m[0][1],
              'sim_startup_time': timing_m[1][0] - timing_m[0][0], 'sim_startup_cpu': timing_m[1][1],
              'sim_run_time': timing_m[-1][0] - timing_m[1][0],
              'total_time': timing_m[-1][0], 'total_cpu': timing_m[-1][1],
              'time_per_iteration': (timing_m[-1][0] - timing_m[1][0])/p['iterations']}
    result.update(host_m.groupdict())
    result.update(p)
    result.update(version_m.groupdict())
    return result

In [4]:
collected_timings = lancet.FileInfo(output_files, 'timing_files',
                                lancet.CustomFile(metadata_fn=parse_timings))

In [5]:
len(collected_timings)


Out[5]:
590

In [6]:
avg_timings = collected_timings.dframe.groupby(['host', 'version', 'cortex_density']).apply(np.mean)
melmac = avg_timings.ix['melmac.inf.ed.ac.uk']
stonesoup = avg_timings.ix['stonesoup.inf.ed.ac.uk']

Showing the constituents of total runtime in Traditional Topographica and Sparse Topographica

Used in section "Performance analysis of the chosen model"


In [7]:
constituents_traditional_vs_sparse = collected_timings.dframe.query('version in ["Traditional_topographica", "Sparse_topographica"] and host=="melmac.inf.ed.ac.uk"').groupby(['version', 'cortex_density']).apply(np.mean)
constituents_traditional_vs_sparse.rename(columns={'sim_run_time': 'Simulation runtime', 'topo_startup_time' : 'Topographica startup', 'sim_startup_time': 'Simulation startup'}, inplace=True)

In [8]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True]
versions, densities, parts = ["Traditional_topographica", "Sparse_topographica"], [48, 142, 162], ['Topographica startup', 'Simulation startup','Simulation runtime']
keys = product(versions, densities, parts)
Bars([(k, constituents_traditional_vs_sparse.query('version == "%s" and cortex_density == %d' % (k[0], k[1]))[k[2]].mean()) for k in keys], key_dimensions=['Version', 'Cortex density', Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[8]:

Sparse topographica bottlenecks


In [9]:
versions = ['Sparse_topographica', 'Sparse_topographica_dot_only', 'Sparse_topographica_empty']
sparse_timings = collected_timings.dframe.query('version in %s and host=="melmac.inf.ed.ac.uk"' % str(versions)).loc[:, ['cortex_density', 'version', 'sim_run_time']].groupby(['version', 'cortex_density']).mean()

In [10]:
learn_norm_time = sparse_timings.ix['Sparse_topographica'] - sparse_timings.ix['Sparse_topographica_dot_only']
dot_time = sparse_timings.ix['Sparse_topographica_dot_only'] - sparse_timings.ix['Sparse_topographica_empty']
other_time = sparse_timings.ix['Sparse_topographica'] - learn_norm_time - dot_time

learn_norm_time /= sparse_timings.ix['Sparse_topographica'] * 0.01
dot_time /= sparse_timings.ix['Sparse_topographica'] * 0.01
other_time /= sparse_timings.ix['Sparse_topographica'] * 0.01

In [11]:
learn_norm_time.rename(columns={'sim_run_time': 'hebbian_norm'}, inplace=True)
dot_time.rename(columns={'sim_run_time': 'dot'}, inplace=True)
other_time.rename(columns={'sim_run_time': 'other'}, inplace=True)

In [12]:
sparse_bottlenecks_percentage = learn_norm_time.join(dot_time).join(other_time)

In [13]:
sparse_bottlenecks_percentage


Out[13]:
hebbian_norm dot other
cortex_density
48 12.554427 36.885756 50.559818
142 22.071776 75.278824 2.649400
162 21.696927 76.538144 1.764929

In [14]:
sparse_bottlenecks_time_per_iteration = sparse_bottlenecks_percentage.apply(lambda row: row * sparse_timings.ix['Sparse_topographica'].loc[row.name]['sim_run_time'] * 0.01, axis=1) / 1000

sparse_topo_dot = sparse_bottlenecks_time_per_iteration['dot']
sparse_topo_hebbian_norm = sparse_bottlenecks_time_per_iteration['hebbian_norm']
sparse_topo_other = sparse_bottlenecks_time_per_iteration['other']

print "Sparse topographica times per iteration:"
sparse_bottlenecks_time_per_iteration


Sparse topographica times per iteration:
Out[14]:
hebbian_norm dot other
cortex_density
48 0.002422 0.007116 0.009754
142 0.116682 0.397960 0.014006
162 0.192588 0.679374 0.015666

Graphing time taken over varying number of iterations to show that the amount of work done per iteration is constant


In [15]:
mini_benchmark_files   = lancet.FilePattern('timing_files', './mini_benchmark/*-gpu_benchmark*/streams/*_tid_{tid:d}.o*')
mini_timings = lancet.FileInfo(mini_benchmark_files, 'timing_files',
                                lancet.CustomFile(metadata_fn=parse_timings))

In [16]:
work_per_iteration = DFrame(mini_timings.dframe[['iterations', 'sim_run_time', 'cortex_density']])

In [17]:
%%opts Regression [show_legend=True apply_databounds=True]
work_per_iteration.regression('iterations', ['sim_run_time'], extents=(0, 0, 550, 550), mdims=['cortex_density'], reduce_fn=np.mean).overlay('cortex_density')


Out[17]:

Graphing the first successful GPU dot product against Traditional Topographica dot product


In [18]:
versions, densities, parts = ['GPU_topographica_dot_only', 'Traditional_topographica_dot_only', 'Sparse_topographica_dot_only'], [48, 142, 162], ['other', 'dot']
gpu_dot_vs_traditional_dot = melmac.ix[versions]
keys = list(product(densities, versions, parts))


row_count = len(gpu_dot_vs_traditional_dot['time_per_iteration'])
gpu_dot_vs_traditional_dot['dot'] = np.random.randn(row_count)
gpu_dot_vs_traditional_dot['other'] = np.random.randn(row_count)

for (d, v, p) in keys:
    if 'Traditional' in v:
        if p == 'other':
            gpu_dot_vs_traditional_dot.ix[v][p][d] = melmac.ix['Traditional_topographica_empty']['time_per_iteration'][d]
        else:
            gpu_dot_vs_traditional_dot.ix[v][p][d] = gpu_dot_vs_traditional_dot.ix[v]['time_per_iteration'][d] - melmac.ix['Traditional_topographica_empty']['time_per_iteration'][d]
    else:
        if p == 'other':
            gpu_dot_vs_traditional_dot.ix[v][p][d] = melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
        else:
            gpu_dot_vs_traditional_dot.ix[v][p][d] = gpu_dot_vs_traditional_dot.ix[v]['time_per_iteration'][d] - melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]

In [19]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=25]
Bars([(k, gpu_dot_vs_traditional_dot.query('version == "%s" and cortex_density == %d' % (k[1], k[0]))[k[2]]) for k in keys], key_dimensions=['Cortex density',  Dimension('Version', values=versions), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[19]:

In [20]:
gpu_dot_vs_traditional_dot[['other', 'dot', 'time_per_iteration']]


Out[20]:
other dot time_per_iteration
version cortex_density
GPU_topographica_dot_only 48 0.00977 0.00952 0.01929
142 0.01402 0.06750 0.08152
162 0.01568 0.10491 0.12059
Traditional_topographica_dot_only 48 0.00647 0.01168 0.01815
142 0.00936 0.28175 0.29111
162 0.00996 0.45487 0.46483
Sparse_topographica_dot_only 48 0.00977 0.00709 0.01686
142 0.01402 0.39794 0.41196
162 0.01568 0.67936 0.69504

Graphing the slow normalisation vs sparse vs dense


In [21]:
versions, densities, parts = ['GPU_topographica_slow_normalisation', 'Traditional_topographica', 'Sparse_topographica'], [48, 142, 162], ['other', 'dot', 'hebbian_norm']
gpu_vs_traditional_vs_sparse = melmac.ix[versions]
keys = list(product(densities, versions, parts))


row_count = len(gpu_vs_traditional_vs_sparse['time_per_iteration'])
gpu_vs_traditional_vs_sparse['dot'] = np.random.randn(row_count)
gpu_vs_traditional_vs_sparse['hebbian_norm'] = np.random.randn(row_count)
gpu_vs_traditional_vs_sparse['other'] = np.random.randn(row_count)

for (d, v, p) in keys:
    if 'Traditional' in v:
        if p == 'other':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = melmac.ix['Traditional_topographica_empty']['time_per_iteration'][d]
        if p == 'dot':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = melmac.ix['Traditional_topographica_dot_only']['time_per_iteration'][d] - melmac.ix['Traditional_topographica_empty']['time_per_iteration'][d]
        if p == 'hebbian_norm':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = melmac.ix['Traditional_topographica']['time_per_iteration'][d] - melmac.ix['Traditional_topographica_dot_only']['time_per_iteration'][d]
    else:
        if p == 'other':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
        if p == 'dot':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = gpu_dot_vs_traditional_dot.ix['GPU_topographica_dot_only']['dot'][d] if 'GPU' in v else gpu_dot_vs_traditional_dot.ix['Sparse_topographica_dot_only']['dot'][d]
        if p == 'hebbian_norm':
            gpu_vs_traditional_vs_sparse.ix[v][p][d] = gpu_vs_traditional_vs_sparse.ix[v]['time_per_iteration'][d] - (gpu_dot_vs_traditional_dot.ix['GPU_topographica_dot_only']['time_per_iteration'][d] if 'GPU' in v else gpu_dot_vs_traditional_dot.ix['Sparse_topographica_dot_only']['time_per_iteration'][d])

In [22]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=25]
Bars([(k, gpu_vs_traditional_vs_sparse.query('version == "%s" and cortex_density == %d' % (k[1], k[0]))[k[2]]) for k in keys], key_dimensions=['Cortex density',  Dimension('Version', values=versions), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[22]:

In [23]:
gpu_vs_traditional_vs_sparse[['other', 'dot', 'hebbian_norm', 'time_per_iteration']]


Out[23]:
other dot hebbian_norm time_per_iteration
version cortex_density
GPU_topographica_slow_normalisation 48 0.00977 0.00952 0.002170 0.021460
142 0.01402 0.06750 0.018960 0.100480
162 0.01568 0.10491 0.033770 0.154360
Traditional_topographica 48 0.00647 0.01168 0.000675 0.018825
142 0.00936 0.28175 0.015355 0.306465
162 0.00996 0.45487 0.023880 0.488710
Sparse_topographica 48 0.00977 0.00709 0.002430 0.019290
142 0.01402 0.39794 0.116695 0.528655
162 0.01568 0.67936 0.192575 0.887615

Graphing the performance of Traditional topographica vs GPU Topographica with slow normalisation and fixed normalisation


In [24]:
versions, densities, parts = ['GPU_topographica_slow_normalisation', 'Synchronous_GPU_Kernels'], [48, 142, 162], ['other', 'dot', 'hebbian_norm']
gpu_slow_fast_norm_vs_tratitional = melmac.ix[versions]
keys = list(product(densities, versions, parts))

row_count = len(gpu_slow_fast_norm_vs_tratitional['time_per_iteration'])
for p in parts:
    gpu_slow_fast_norm_vs_tratitional[p] = np.random.randn(row_count)

for (d, v, p) in keys:
    if p == 'other':
        gpu_slow_fast_norm_vs_tratitional.ix[v][p][d] = melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
    if p == 'dot':
        gpu_slow_fast_norm_vs_tratitional.ix[v][p][d] = gpu_dot_vs_traditional_dot.ix['GPU_topographica_dot_only']['dot'][d]
    if p == 'hebbian_norm':
        gpu_slow_fast_norm_vs_tratitional.ix[v][p][d] = gpu_slow_fast_norm_vs_tratitional.ix[v]['time_per_iteration'][d] - gpu_dot_vs_traditional_dot.ix['GPU_topographica_dot_only']['time_per_iteration'][d]

In [25]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=25 show_legend=False]
Bars([(k, gpu_slow_fast_norm_vs_tratitional.query('version == "%s" and cortex_density == %d' % (k[1], k[0]))[k[2]]) for k in keys], key_dimensions=['Cortex density',  Dimension('Version', values=versions), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[25]:

In [26]:
gpu_slow_fast_norm_vs_tratitional[['other', 'dot', 'hebbian_norm', 'time_per_iteration']]


Out[26]:
other dot hebbian_norm time_per_iteration
version cortex_density
GPU_topographica_slow_normalisation 48 0.00977 0.00952 0.002170 0.021460
142 0.01402 0.06750 0.018960 0.100480
162 0.01568 0.10491 0.033770 0.154360
Synchronous_GPU_Kernels 48 0.00977 0.00952 0.004905 0.024195
142 0.01402 0.06750 0.015755 0.097275
162 0.01568 0.10491 0.025485 0.146075

Graphing the performance differences between Topographica GPU implementation using CSR and HYB sparse matrix formats:


In [27]:
versions, densities, parts = ["Streamed_GPU_Kernels_HYB_dot_only", "Streamed_GPU_Kernels_CSR_dot_only"], [48, 142, 162], ['other', 'dot']
hyb_vs_csr = melmac.ix[versions]
keys = list(product(densities, versions, parts))

row_count = len(hyb_vs_csr['time_per_iteration'])
for p in parts:
  hyb_vs_csr[p] = np.random.randn(row_count)
    
for (d, v, p) in keys:
  if p == 'other':
    hyb_vs_csr.ix[v][p][d] = melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
  else:
    hyb_vs_csr.ix[v][p][d] = hyb_vs_csr.ix[v]['time_per_iteration'][d] - melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]

In [28]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=15 xrotation=90]
Bars([(k, hyb_vs_csr.query('version == "%s" and cortex_density == %d' % (k[1], k[0]))[k[2]]) for k in keys], key_dimensions=['Cortex density',  Dimension('Version', values=versions), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[28]:

In [29]:
hyb_vs_csr[['other', 'dot', 'time_per_iteration']]


Out[29]:
other dot time_per_iteration
version cortex_density
Streamed_GPU_Kernels_HYB_dot_only 48 0.00977 0.009130 0.018900
142 0.01402 0.051415 0.065435
162 0.01568 0.096845 0.112525
Streamed_GPU_Kernels_CSR_dot_only 48 0.00977 0.008420 0.018190
142 0.01402 0.063595 0.077615
162 0.01568 0.101505 0.117185

Synchronous vs streamed


In [30]:
versions, densities, parts = ["Streamed_GPU_Kernels", "Synchronous_GPU_Kernels"], [48, 142, 162], ['other', 'dot', 'hebbian_norm']
sync_vs_streamed = melmac.ix[versions]
keys = list(product(densities, versions, parts))

row_count = len(sync_vs_streamed['time_per_iteration'])
for p in parts:
    sync_vs_streamed[p] = np.random.randn(row_count)

for (d, v, p) in keys:
    if p == 'other':
        sync_vs_streamed.ix[v][p][d] = melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
    if p == 'dot':
        sync_vs_streamed.ix[v][p][d] = sync_vs_streamed.ix[v]['time_per_iteration'][d] - gpu_slow_fast_norm_vs_tratitional.ix['Synchronous_GPU_Kernels']['hebbian_norm'][d] - melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d]
    if p == 'hebbian_norm':
        sync_vs_streamed.ix[v][p][d] = sync_vs_streamed.ix[v]['time_per_iteration'][d] - melmac.ix['Sparse_topographica_empty']['time_per_iteration'][d] - sync_vs_streamed.ix[v]['dot'][d]

In [31]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=25 xrotation=90 show_legend=False]
Bars([(k, sync_vs_streamed.query('version == "%s" and cortex_density == %d' % (k[1], k[0]))[k[2]]) for k in keys], key_dimensions=['Cortex density',  Dimension('Version', values=versions), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[31]:

In [32]:
sync_vs_streamed[['other', 'dot', 'hebbian_norm', 'time_per_iteration']]


Out[32]:
other dot hebbian_norm time_per_iteration
version cortex_density
Streamed_GPU_Kernels 48 0.00977 0.006345 0.004905 0.021020
142 0.01402 0.063635 0.015755 0.093410
162 0.01568 0.101600 0.025485 0.142765
Synchronous_GPU_Kernels 48 0.00977 0.009520 0.004905 0.024195
142 0.01402 0.067500 0.015755 0.097275
162 0.01568 0.104910 0.025485 0.146075

Showing the activities after 150 iterations to illustrate the testing procedure


In [33]:
gpu = np.array([[ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.21191,  0.00000,  0.01938,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.11707,  0.00000,  0.00000,  0.19443,  0.00000,
         0.00000,  0.00000]])
dense = np.array([[ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.00000,  0.00000,  0.00000,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.00000,  0.21191,  0.00000,  0.01938,  0.00000,
         0.00000,  0.00000],
       [ 0.00000,  0.11707,  0.00000,  0.00000,  0.19443,  0.00000,
         0.00000,  0.00000]])

In [34]:
dense = {(x, y): dense[x][y] for x in range(len(dense[0])) for y in range(len(dense))}
gpu = {(x, y): gpu[x][y] for x in range(len(gpu[0])) for y in range(len(gpu))}

In [35]:
HeatMap(gpu, label='V1 activity') + HeatMap(dense, label='V1 activity')


Out[35]:

Streamed kernels on melmac vs stonesoup


In [36]:
hosts, densities, parts = ['stonesoup.inf.ed.ac.uk', 'melmac.inf.ed.ac.uk'], [48, 142, 162], ['Time']
keys = list(product(densities, hosts, parts))

In [37]:
%%opts Bars [color_by=['stack'] apply_databounds=False show_grid=True yticks=25 xrotation=90]
Bars([(k, avg_timings.query('host == "%s" and version == "Streamed_GPU_Kernels" and cortex_density == %d' % (k[1], k[0]))['time_per_iteration'].mean()) for k in keys], key_dimensions=['Cortex density',  Dimension('Host', values=hosts), Dimension('Part', values=parts)], value_dimensions=['Time (seconds)'])


Out[37]:

In [37]: