RADICAL-Pilot on Blue Waters

Mark Santcroos mark.santcroos@rutgers.edu

Start of code


In [73]:
import pandas as pd
import pprint
import os
from radical.pilot import utils as rpu
import radical.utils as ru
import numpy as np
import matplotlib as mp

In [3]:
# 'Magic' commands
%matplotlib inline
#%pylab inline
format='png' # pdf, png, svg
%config InlineBackend.figure_formats=[format]
#mp.style.use('fivethirtyeight')
mp.style.use('ggplot')

mp.pylab.rcParams['figure.figsize'] = (13, 9)

#Colormap options: any matplotlib colormap name can be used here
#(e.g. Spectral, Paired, coolwarm, cubehelix, ...);
#see mp.pyplot.colormaps() for the complete list.

Input definitions


In [4]:
#
# exp1 with 3 iterations
#
exp1 = {
    'rp.session.netbook.mark.016591.0006': {
        'pilot_cores': 256,
        'cu_cores': 1,
        'profiling': True,
        'cu_count': 512,
        'cu_runtime': 0,
        'number_of_workers': 1,
        'pilot_runtime': 30,
        'iteration': 2,
        'backend': 'ORTE'
    },
    'rp.session.netbook.mark.016591.0007': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 1, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0004': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 300, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0005': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 600, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0002': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0003': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 120, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0000': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 10, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0001': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 30, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0008': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 10, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0009': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 30, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1002': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 1, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1003': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 10, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1001': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1006': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 120, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1007': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 300, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1004': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 30, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1005': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1008': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 600, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 
0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1009': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.1010': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 1, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0011': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 120, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0010': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0013': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 600, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0012': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 300, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}}
','.join(exp1.keys())


Out[4]:
'rp.session.netbook.mark.016591.0006,rp.session.netbook.mark.016591.0007,rp.session.netbook.mark.016591.0004,rp.session.netbook.mark.016591.0005,rp.session.netbook.mark.016591.0002,rp.session.netbook.mark.016591.0003,rp.session.netbook.mark.016591.0000,rp.session.netbook.mark.016591.0001,rp.session.netbook.mark.016591.0008,rp.session.netbook.mark.016591.0009,rp.session.netbook.mark.016590.1002,rp.session.netbook.mark.016590.1003,rp.session.netbook.mark.016590.1001,rp.session.netbook.mark.016590.1006,rp.session.netbook.mark.016590.1007,rp.session.netbook.mark.016590.1004,rp.session.netbook.mark.016590.1005,rp.session.netbook.mark.016590.1008,rp.session.netbook.mark.016590.1009,rp.session.netbook.mark.016590.1010,rp.session.netbook.mark.016591.0011,rp.session.netbook.mark.016591.0010,rp.session.netbook.mark.016591.0013,rp.session.netbook.mark.016591.0012'

In [5]:
#
# exp2 with 3 iterations
#
exp2 = {
    'rp.session.netbook.mark.016590.0025': {
        'pilot_cores': 256,
        'cu_cores': 128,
        'profiling': True,
        'cu_count': 8,
        'cu_runtime': 60,
        'number_of_workers': 1,
        'pilot_runtime': 30,
        'iteration': 2,
        'backend': 'ORTE'
    },
    'rp.session.netbook.mark.016590.0024': {'pilot_cores': 256, 'cu_cores': 64, 'profiling': True, 'cu_count': 16, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0018': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 1024, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0019': {'pilot_cores': 256, 'cu_cores': 2, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0012': {'pilot_cores': 256, 'cu_cores': 8, 'profiling': True, 'cu_count': 128, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0013': {'pilot_cores': 256, 'cu_cores': 16, 'profiling': True, 'cu_count': 64, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0010': {'pilot_cores': 256, 'cu_cores': 2, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0011': {'pilot_cores': 256, 'cu_cores': 4, 'profiling': True, 'cu_count': 256, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0016': {'pilot_cores': 256, 'cu_cores': 128, 'profiling': True, 'cu_count': 8, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0017': {'pilot_cores': 256, 'cu_cores': 256, 'profiling': True, 'cu_count': 4, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0014': {'pilot_cores': 256, 'cu_cores': 32, 'profiling': True, 'cu_count': 32, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0015': {'pilot_cores': 256, 'cu_cores': 64, 'profiling': True, 'cu_count': 16, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0005': {'pilot_cores': 256, 'cu_cores': 32, 'profiling': True, 'cu_count': 32, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0004': {'pilot_cores': 256, 'cu_cores': 16, 'profiling': True, 'cu_count': 64, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0021': {'pilot_cores': 256, 'cu_cores': 8, 'profiling': True, 'cu_count': 128, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0020': {'pilot_cores': 256, 'cu_cores': 4, 'profiling': True, 'cu_count': 256, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0001': {'pilot_cores': 256, 'cu_cores': 2, 'profiling': True, 'cu_count': 512, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0000': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 1024, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 
0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0003': {'pilot_cores': 256, 'cu_cores': 8, 'profiling': True, 'cu_count': 128, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0002': {'pilot_cores': 256, 'cu_cores': 4, 'profiling': True, 'cu_count': 256, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0009': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 1024, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0008': {'pilot_cores': 256, 'cu_cores': 256, 'profiling': True, 'cu_count': 4, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0023': {'pilot_cores': 256, 'cu_cores': 32, 'profiling': True, 'cu_count': 32, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0022': {'pilot_cores': 256, 'cu_cores': 16, 'profiling': True, 'cu_count': 64, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0007': {'pilot_cores': 256, 'cu_cores': 128, 'profiling': True, 'cu_count': 8, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0006': {'pilot_cores': 256, 'cu_cores': 64, 'profiling': True, 'cu_count': 16, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016590.0026': {'pilot_cores': 256, 'cu_cores': 256, 'profiling': True, 'cu_count': 4, 'cu_runtime': 60, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}}
#pprint.pprint(exp2)
','.join(exp2.keys())


Out[5]:
'rp.session.netbook.mark.016590.0025,rp.session.netbook.mark.016590.0024,rp.session.netbook.mark.016590.0018,rp.session.netbook.mark.016590.0019,rp.session.netbook.mark.016590.0012,rp.session.netbook.mark.016590.0013,rp.session.netbook.mark.016590.0010,rp.session.netbook.mark.016590.0011,rp.session.netbook.mark.016590.0016,rp.session.netbook.mark.016590.0017,rp.session.netbook.mark.016590.0014,rp.session.netbook.mark.016590.0015,rp.session.netbook.mark.016590.0005,rp.session.netbook.mark.016590.0004,rp.session.netbook.mark.016590.0021,rp.session.netbook.mark.016590.0020,rp.session.netbook.mark.016590.0001,rp.session.netbook.mark.016590.0000,rp.session.netbook.mark.016590.0003,rp.session.netbook.mark.016590.0002,rp.session.netbook.mark.016590.0009,rp.session.netbook.mark.016590.0008,rp.session.netbook.mark.016590.0023,rp.session.netbook.mark.016590.0022,rp.session.netbook.mark.016590.0007,rp.session.netbook.mark.016590.0006,rp.session.netbook.mark.016590.0026'

In [6]:
#
# exp3 with 3 iterations
#
exp3 = {
    'rp.session.netbook.mark.016591.0028': {
        'pilot_cores': 256,
        'cu_cores': 1,
        'profiling': True,
        'cu_count': 512,
        'cu_runtime': 0,
        'number_of_workers': 7,
        'pilot_runtime': 30,
        'iteration': 1,
        'backend': 'ORTE'},
    'rp.session.netbook.mark.016591.0029': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 8, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0024': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 3, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0025': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 4, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0026': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 5, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0027': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 6, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0020': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 7, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0021': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 8, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0022': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0023': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 2, 'pilot_runtime': 30, 'iteration': 1, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0015': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 2, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0014': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0017': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 4, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0016': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 3, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0019': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 6, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0018': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 5, 'pilot_runtime': 30, 'iteration': 0, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0037': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 8, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0036': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 7, 'pilot_runtime': 30, 'iteration': 2, 'backend': 
'ORTE'}, 'rp.session.netbook.mark.016591.0035': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 6, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0034': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 5, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0033': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 4, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0032': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 3, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0031': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 2, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}, 'rp.session.netbook.mark.016591.0030': {'pilot_cores': 256, 'cu_cores': 1, 'profiling': True, 'cu_count': 512, 'cu_runtime': 0, 'number_of_workers': 1, 'pilot_runtime': 30, 'iteration': 2, 'backend': 'ORTE'}}
#pprint.pprint(exp3)
','.join(exp3.keys())


Out[6]:
'rp.session.netbook.mark.016591.0028,rp.session.netbook.mark.016591.0029,rp.session.netbook.mark.016591.0024,rp.session.netbook.mark.016591.0025,rp.session.netbook.mark.016591.0026,rp.session.netbook.mark.016591.0027,rp.session.netbook.mark.016591.0020,rp.session.netbook.mark.016591.0021,rp.session.netbook.mark.016591.0022,rp.session.netbook.mark.016591.0023,rp.session.netbook.mark.016591.0015,rp.session.netbook.mark.016591.0014,rp.session.netbook.mark.016591.0017,rp.session.netbook.mark.016591.0016,rp.session.netbook.mark.016591.0019,rp.session.netbook.mark.016591.0018,rp.session.netbook.mark.016591.0037,rp.session.netbook.mark.016591.0036,rp.session.netbook.mark.016591.0035,rp.session.netbook.mark.016591.0034,rp.session.netbook.mark.016591.0033,rp.session.netbook.mark.016591.0032,rp.session.netbook.mark.016591.0031,rp.session.netbook.mark.016591.0030'

In [7]:
#
# List of experiments used by the data wrangling functions below
#
experiments = [exp1, exp2, exp3]

Instructions on fetching profiling data

radicalpilot-stats -m prof -p /Users/mark/Documents/experiments/bluewaters/data/profiling/ -s <session[,session[,session]]>
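
A small helper along these lines (a sketch, not part of the original workflow; the function name is illustrative) can build that command line from one of the experiment dictionaries defined above, using only the flags shown:

In [ ]:
# Sketch: construct the radicalpilot-stats invocation for an experiment dict,
# reusing the profiling directory from the command above.
def stats_command(exp, profdir='/Users/mark/Documents/experiments/bluewaters/data/profiling/'):
    return 'radicalpilot-stats -m prof -p %s -s %s' % (profdir, ','.join(exp.keys()))

#print stats_command(exp1)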

In [9]:
#
# Function for extracting a selection of experiments from the experiment repo
#
def select_exp_from_repo(repo, filter={}):
    exp_ids = []
    for exp_name, exp_config in repo.iteritems():
        match = True
        for filter_key, filter_val in filter.iteritems():
            if filter_val != exp_config[filter_key]:
                match = False
                break
        if match:
            exp_ids.append(exp_name)
    return exp_ids

def _test_get_experiments():
    print select_exp_from_repo(exp3, {
#         'cu_cores': 1,
#         'cu_count': 500,
#         'nodes': 10,
#         'exec_workers': 1,
#         'spawner': 'shell',
#         'launcher': 'orte'
    })
    
#_test_get_experiments()

In [10]:
#
# Turn ID into a name that can be used as a python identifier.
#
def normalize_id(sid):
    return sid.replace('.', '_')

In [11]:
#
# Write session into HDF5 store
#
def stat_session(store, db, cachedir, session_id):

    session_frame, pilot_frame, unit_frame = rpu.get_session_frames(db, session_id, cachedir)

    norm_session_id = normalize_id(session_id)

    store.put('%s/session' % norm_session_id, session_frame)
    store.put('%s/pilots' % norm_session_id, pilot_frame)
    store.put('%s/units' % norm_session_id, unit_frame)

In [12]:
#
# MongoDB URL
#
import radical.utils as ru
dburl = ru.Url(os.environ['RADICAL_PILOT_DBURL'])
#dburl = ru.Url('mongodb://ec2-54-221-194-147.compute-1.amazonaws.com:24242/')

if not dburl.path or '/' == dburl.path:
    dburl.path = 'radicalpilot'

print 'Using MongoDB at: %s' % dburl


Using MongoDB at: mongodb://ec2-54-221-194-147.compute-1.amazonaws.com:24242/mark/

In [13]:
#
# Location to store raw json session data
#
cachedir = '/Users/mark/Documents/experiments/bluewaters/data/json'

if not os.path.isdir(cachedir):
    raise("No valid cache dir: %s" % cachedir)

In [14]:
#
# Location where raw profiling data is stored
#
profdir = '/Users/mark/Documents/experiments/bluewaters/data/profiling'

Data Structure

Data is stored as:

  • SESSIONID/config
  • SESSIONID/session
  • SESSIONID/pilots
  • SESSIONID/units
  • SESSIONID/prof
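
Once the store has been populated (below), a single session's frames can be read back along these lines. This is a sketch that assumes the layout listed above; the helper name is illustrative, and the per-pilot profiling frames live under SESSIONID/prof/PILOTID.

In [ ]:
# Sketch (assumes the HDF5 layout listed above and a populated 'store').
def load_session_frames(store, norm_sid):
    return {
        'session': store.get('%s/session' % norm_sid),
        'pilots':  store.get('%s/pilots' % norm_sid),
        'units':   store.get('%s/units' % norm_sid),
    }

# Profiling frames are stored per pilot:
#   store.get('%s/prof/%s' % (norm_sid, normalize_id(pid)))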

In [16]:
#
# Store JSON session data in HDF5 database
#
# TODO: prevent duplication
# TODO: Look into performance degradation warning
#
hdf5dir = '/Users/mark/Documents/experiments/bluewaters/data/hdf5'

store = pd.HDFStore(os.path.join(hdf5dir, 'store.h5'))

In [17]:
mongo, db, dbname, cname, pname = ru.mongodb_connect(dburl)

for exp in experiments:
    
    for session_id in select_exp_from_repo(exp):
    
        stat_session(store, db, cachedir, session_id)

#store.close()
mongo.disconnect()


/Users/mark/.virtualenv/rp/lib/python2.7/site-packages/pandas/io/pytables.py:2559: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Done', 'Failed', 'pid', 'resource', 'sid']]

  warnings.warn(ws, PerformanceWarning)
/Users/mark/.virtualenv/rp/lib/python2.7/site-packages/pandas/io/pytables.py:2559: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Canceled', 'Failed', 'PendingInputStaging', 'StagingInput', 'Unscheduled', 'pid', 'sid', 'slots', 'uid']]

  warnings.warn(ws, PerformanceWarning)

In [18]:
session_names = [r._v_name for r in store.root]
#session_names = list(set([x.split('/', 2)[1] for x in store.keys()]))
session_names


Out[18]:
['rp_session_netbook_mark_016590_0000',
 'rp_session_netbook_mark_016590_0001',
 'rp_session_netbook_mark_016590_0002',
 'rp_session_netbook_mark_016590_0003',
 'rp_session_netbook_mark_016590_0004',
 'rp_session_netbook_mark_016590_0005',
 'rp_session_netbook_mark_016590_0006',
 'rp_session_netbook_mark_016590_0007',
 'rp_session_netbook_mark_016590_0008',
 'rp_session_netbook_mark_016590_0009',
 'rp_session_netbook_mark_016590_0010',
 'rp_session_netbook_mark_016590_0011',
 'rp_session_netbook_mark_016590_0012',
 'rp_session_netbook_mark_016590_0013',
 'rp_session_netbook_mark_016590_0014',
 'rp_session_netbook_mark_016590_0015',
 'rp_session_netbook_mark_016590_0016',
 'rp_session_netbook_mark_016590_0017',
 'rp_session_netbook_mark_016590_0018',
 'rp_session_netbook_mark_016590_0019',
 'rp_session_netbook_mark_016590_0020',
 'rp_session_netbook_mark_016590_0021',
 'rp_session_netbook_mark_016590_0022',
 'rp_session_netbook_mark_016590_0023',
 'rp_session_netbook_mark_016590_0024',
 'rp_session_netbook_mark_016590_0025',
 'rp_session_netbook_mark_016590_0026',
 'rp_session_netbook_mark_016590_1001',
 'rp_session_netbook_mark_016590_1002',
 'rp_session_netbook_mark_016590_1003',
 'rp_session_netbook_mark_016590_1004',
 'rp_session_netbook_mark_016590_1005',
 'rp_session_netbook_mark_016590_1006',
 'rp_session_netbook_mark_016590_1007',
 'rp_session_netbook_mark_016590_1008',
 'rp_session_netbook_mark_016590_1009',
 'rp_session_netbook_mark_016590_1010',
 'rp_session_netbook_mark_016591_0000',
 'rp_session_netbook_mark_016591_0001',
 'rp_session_netbook_mark_016591_0002',
 'rp_session_netbook_mark_016591_0003',
 'rp_session_netbook_mark_016591_0004',
 'rp_session_netbook_mark_016591_0005',
 'rp_session_netbook_mark_016591_0006',
 'rp_session_netbook_mark_016591_0007',
 'rp_session_netbook_mark_016591_0008',
 'rp_session_netbook_mark_016591_0009',
 'rp_session_netbook_mark_016591_0010',
 'rp_session_netbook_mark_016591_0011',
 'rp_session_netbook_mark_016591_0012',
 'rp_session_netbook_mark_016591_0013',
 'rp_session_netbook_mark_016591_0014',
 'rp_session_netbook_mark_016591_0015',
 'rp_session_netbook_mark_016591_0016',
 'rp_session_netbook_mark_016591_0017',
 'rp_session_netbook_mark_016591_0018',
 'rp_session_netbook_mark_016591_0019',
 'rp_session_netbook_mark_016591_0020',
 'rp_session_netbook_mark_016591_0021',
 'rp_session_netbook_mark_016591_0022',
 'rp_session_netbook_mark_016591_0023',
 'rp_session_netbook_mark_016591_0024',
 'rp_session_netbook_mark_016591_0025',
 'rp_session_netbook_mark_016591_0026',
 'rp_session_netbook_mark_016591_0027',
 'rp_session_netbook_mark_016591_0028',
 'rp_session_netbook_mark_016591_0029',
 'rp_session_netbook_mark_016591_0030',
 'rp_session_netbook_mark_016591_0031',
 'rp_session_netbook_mark_016591_0032',
 'rp_session_netbook_mark_016591_0033',
 'rp_session_netbook_mark_016591_0034',
 'rp_session_netbook_mark_016591_0035',
 'rp_session_netbook_mark_016591_0036',
 'rp_session_netbook_mark_016591_0037']

In [19]:
sessions = [store['%s/session' % s_name] for s_name in session_names]

Read Agent Profiling Data


In [74]:
#
# Transpose a raw profiling DataFrame into a CU oriented data structure
# (used by the data wrangling loop below)
# TODO: This needs to be converted to a DF straight away
#
def prof2uids(rawdf):
    units = {}

    # Using "native" Python
    #units['all'] = [x for x in rawdf.uid[rawdf.uid > 0].unique() if x.startswith('unit')]
    units['all'] = [x for x in rawdf.uid.dropna().unique() if x.startswith('unit')]
    units['cloned'] = [x for x in units['all'] if 'clone' in x]
    units['real'] = list(set(units['all']) - set(units['cloned']))

    # Or alternatively, with Pandas
    #uids_s = rawdf['uid']
    #all_units_s = uids_s.loc[uids_s.str.startswith('unit.', na=False)].drop_duplicates()
    #units['all'] = set(all_units_s)
    #cloned_units_s = all_units_s.loc[all_units_s.str.contains('clone')]
    #units['cloned'] = set(cloned_units_s)
    #units['real'] = units['all'] - units['cloned']

    return units

# Example summary (assumes a 'units' mapping keyed by experiment name):
# for exp, u in units.iteritems():
#     print "Experiment:'%s', all:'%d', real:'%d', cloned:'%d'" % \
#         (exp, len(u['all']), len(u['real']), len(u['cloned']))

Event Translation Table

This table is semi-manually crafted; it is kept here for reference, while the actual lookup in prof2df below uses rpu.prof_entries.


In [72]:
# "label", "component", "event", "message"
# elements = [
#     ('a_get_u',         'MainThread',       'get', 'MongoDB to Agent (PendingExecution)'),
#     ('a_build_u',       'MainThread',       'Agent get unit meta', ''),
#     ('a_mkdir_u',       'MainThread',       'Agent get unit mkdir', ''),
#     ('a_notify_alloc',  'MainThread',       'put', 'Agent to update_queue (Allocating)'),
#     ('a_to_s',          'MainThread',       'put', 'Agent to schedule_queue (Allocating)'),

#     ('s_get_alloc',     'CONTINUOUS',       'get', 'schedule_queue to Scheduler (Allocating)'),
#     ('s_alloc_failed',  'CONTINUOUS',       'schedule', 'allocation failed'),
#     ('s_allocated',     'CONTINUOUS',       'schedule', 'allocated'),
#     ('s_to_ewo',        'CONTINUOUS',       'put', 'Scheduler to execution_queue (Allocating)'),
#     ('s_unqueue',       'CONTINUOUS',       'unqueue', 're-allocation done'),
    
#     ('ewo_get',         'ExecWorker-',      'get', 'executing_queue to ExecutionWorker (Executing)'),
#     ('ewo_launch',      'ExecWorker-',      'ExecWorker unit launch', ''),
#     ('ewo_spawn',       'ExecWorker-',      'ExecWorker spawn', ''),
#     ('ewo_script',      'ExecWorker-',      'launch script constructed', ''),
#     ('ewo_pty',         'ExecWorker-',      'spawning passed to pty', ''),  
#     ('ewo_notify_exec', 'ExecWorker-',      'put', 'ExecWorker to update_queue (Executing)'),
#     ('ewo_to_ewa',      'ExecWorker-',      'put', 'ExecWorker to watcher (Executing)'),
    
#     ('ewa_get',         'ExecWatcher-',     'get', 'ExecWatcher picked up unit'),
#     ('ewa_complete',    'ExecWatcher-',     'execution complete', ''),
#     ('ewa_notify_so',   'ExecWatcher-',     'put', 'ExecWatcher to update_queue (StagingOutput)'),
#     ('ewa_to_sow',      'ExecWatcher-',     'put', 'ExecWatcher to stageout_queue (StagingOutput)'),
    
#     ('sow_get_u',       'StageoutWorker-',  'get', 'stageout_queue to StageoutWorker (StagingOutput)'),
#     ('sow_u_done',      'StageoutWorker-',  'final', 'stageout done'),
#     ('sow_notify_done', 'StageoutWorker-',  'put', 'StageoutWorker to update_queue (Done)'),

#     ('uw_get_alloc',    'UpdateWorker-',    'get', 'update_queue to UpdateWorker (Allocating)'),   
#     ('uw_push_alloc',   'UpdateWorker-',    'unit update pushed (Allocating)', ''),
#     ('uw_get_exec',     'UpdateWorker-',    'get', 'update_queue to UpdateWorker (Executing)'),
#     ('uw_push_exec',    'UpdateWorker-',    'unit update pushed (Executing)', ''),
#     ('uw_get_so',       'UpdateWorker-',    'get', 'update_queue to UpdateWorker (StagingOutput)'),
#     ('uw_push_so',      'UpdateWorker-',    'unit update pushed (StagingOutput)', ''),
#     ('uw_get_done',     'UpdateWorker-',    'get', 'update_queue to UpdateWorker (Done)'),
#     ('uw_push_done',    'UpdateWorker-',    'unit update pushed (Done)', '')
# ]
# print "Number of entries: %d" % len(elements)
# [e[0] for e in elements]
#edf = pd.DataFrame(elements, columns=["label", "component", "event", "message"])

In [22]:
#
# Lookup tuples in dataframe based on uid and the tuple from the elements list
#
def tup2ts(df, uid, tup):
    #print uid
    #print tup
    all_for_uid = df[df.uid == uid].fillna('')
    val = all_for_uid[(all_for_uid.component.str.startswith(tup[1])) &
                      (all_for_uid.event == tup[2]) &
                      (all_for_uid.message == tup[3])].time
    try:
        return val.iloc[0]
    except Exception as e:
        return np.NaN
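
As an illustrative sketch (not part of the original analysis), a single lookup with tup2ts() follows the ("label", "component", "event", "message") tuple layout of the translation table above; the unit id below is hypothetical.

In [ ]:
# Illustrative only: 'unit.000000' is a hypothetical unit id and raw_prof_data
# is a profiling DataFrame as read with pd.read_csv() in the wrangling loop below.
# ts = tup2ts(raw_prof_data,
#             'unit.000000',
#             ('a_to_s', 'MainThread', 'put', 'Agent to schedule_queue (Allocating)'))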

In [71]:
#
# Construct a unit based dataframe from a raw dataframe
#
def prof2df(rawdf, units): 
    # TODO: create skip logic
    #if exp in indices and exp in info:
    #    continue
    
    indices = [unit for unit in units['real']] 
    info = [{t[0]:tup2ts(rawdf, unit, t) for t in rpu.prof_entries} for unit in units['real']]
    
    # TODO: Also do this for cloned units

    return pd.DataFrame(info) # , index=indices[exp]).sort_index()

In [24]:
#
# Method to create a column based on two other columns using an operator
#
import operator

def create_column(df, lhs, rhs, op):
    return op(df[lhs], df[rhs])

In [25]:
#
# Add additional (derived) columns to dataframes
# 
def add_derived(df):
    df['executor_queue'] = create_column(df, 'ewo_get', 's_to_ewo', operator.sub)
    df['raw_runtime'] = create_column(df, 'ewa_complete', 'ewo_launch', operator.sub)
    df['full_runtime'] = create_column(df, 'uw_push_done', 's_to_ewo', operator.sub)
    df['watch_delay'] = create_column(df, 'ewa_get', 'ewo_to_ewa', operator.sub)
    df['allocation'] = create_column(df, 's_allocated', 'a_to_s', operator.sub)
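
The derived columns are not referenced directly by the cells below; as a usage sketch (assuming a per-unit frame df as produced by prof2df above):

In [ ]:
# Usage sketch for add_derived(); df is assumed to be a per-unit frame as
# returned by prof2df() above.
# add_derived(df)
# df[['executor_queue', 'raw_runtime', 'full_runtime', 'watch_delay', 'allocation']].describe()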

In [26]:
for exp in experiments:
    #
    # Get raw CSV datasets as DataFrames based on selection filter
    #
    session_ids = select_exp_from_repo(
        exp,
        {
        #   'cu_cores': 128,
        #   'cu_count': 500,
        #    'nodes': 10,
        #    'exec_workers': 1,
        #   'spawner': 'shell',
        #   'launcher': 'orte'
        }
    )

    for sid in session_ids:

        norm_sid = normalize_id(sid)

        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            prof_file = os.path.join(profdir, sid + '-' + pid + '.prof')
            #print prof_file

            raw_prof_data = pd.read_csv(prof_file)
            #print raw_prof_data

            units = prof2uids(raw_prof_data)

            df = prof2df(raw_prof_data, units)

            store.put('%s/prof/%s' % (norm_sid, normalize_id(pid)), df)

Comparison of launch methods (inactive)


In [1187]:
orte0 = store.get('rp.session.netbook.mark.016525.0027/units')
aprun0 = store.get('rp.session.netbook.mark.016525.0028/units')
ccm0 = store.get('rp.session.netbook.mark.016525.0030/units')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-1187-53fb1778e25c> in <module>()
----> 1 orte0 = store.get('rp.session.netbook.mark.016525.0027/units')
      2 aprun0 = store.get('rp.session.netbook.mark.016525.0028/units')
      3 ccm0 = store.get('rp.session.netbook.mark.016525.0030/units')

/Users/mark/.virtualenv/rp/lib/python2.7/site-packages/pandas/io/pytables.pyc in get(self, key)
    617         group = self.get_node(key)
    618         if group is None:
--> 619             raise KeyError('No object named %s in the file' % key)
    620         return self._read_group(group)
    621 

KeyError: 'No object named rp.session.netbook.mark.016525.0027/units in the file'

In [ ]:
orte0_ttc = max(orte0['Done']) - min(orte0['Executing'])
ccm0_ttc = max(ccm0['Done']) - min(ccm0['Executing'])
aprun0_ttc = max(aprun0['Done']) - min(aprun0['Executing'])
#df = pandas.DataFrame([aprun0_ttc, ccm0_ttc, orte0_ttc], ['aprun', 'ccm', 'orte'])
#df = pandas.DataFrame([[1,2,3]])
df = pd.DataFrame([{'APRUN': aprun0_ttc, 'CCM/SSH': ccm0_ttc, 'ORTE': orte0_ttc}])
ax = df.plot(kind='bar', )
ax.set_ylabel("TTC (s)")
#ax.set_xlabel('Method')
ax.set_title("Time to Completion (TTC)\n10 nodes, 100 x 32 core \"sleep 0\" tasks.\n")
#grid('off')
ax.set_xticklabels("")
ax.set_ylim(0,70)
#savefig('ttc.pdf')

In [ ]:
orte0['Runtime'] = orte0['Done'] - orte0['Executing']
aprun0['Runtime'] = aprun0['Done'] - aprun0['Executing']
ccm0['Runtime'] = ccm0['Done'] - ccm0['Executing']

In [ ]:
ax = aprun0['Runtime'].plot(kind='hist', title='APRUN, 10 nodes, 320 cores, 100x sleep 0, 32 cores')
ax.set_ylabel('Occurrences')
ax.set_xlabel('Runtime (s)')

In [ ]:
ax = ccm0['Runtime'].plot(kind='hist', title='CCM/SSH, 10 nodes, 320 cores, 100x sleep 0, 32 cores')
ax.set_ylabel('Occurrences')
ax.set_xlabel('Runtime (s)')

In [ ]:
ax = orte0['Runtime'].plot(kind='hist', title='ORTE, 10 nodes, 320 cores, 100x sleep 0, 32 cores')
ax.set_ylabel('Occurrences')
ax.set_xlabel('Runtime (s)')

Analysis of Experiment 1

  • Variable CU duration (0, 1, 10, 30, 60, 120, 300, 600 seconds)
  • Fixed backend (ORTE)
  • Fixed CU count (512)
  • Fixed CU cores (1)
  • CU = /bin/sleep
  • Fixed Pilot cores (256)

Goal: Investigate the relative overhead of ORTE in relation to the runtime of the CU
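
One way to make that overhead explicit (a sketch, mirroring the commented-out "optimal runtime" calculation further below; the helper name is illustrative) is to compare the measured TTC against the ideal makespan cu_count * cu_runtime / pilot_cores:

In [ ]:
# Sketch: ideal makespan for an exp1 configuration, as a baseline for the
# measured TTC (cfg is one of the per-session dicts in exp1).
def ideal_makespan(cfg):
    return float(cfg['cu_count']) * cfg['cu_runtime'] / cfg['pilot_cores']

#for sid in select_exp_from_repo(exp1, {'iteration': 0}):
#    print '%s: ideal makespan %.1f s' % (sid, ideal_makespan(exp1[sid]))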


In [49]:
#
# Calculate TTC
#
cu_runtime = [0, 1, 10, 30, 60, 120, 300, 600]
orte_ids = {}
orte_ttc = {}
orte_execq = {}

for runtime in cu_runtime:

    orte_ids[runtime] = select_exp_from_repo(
        exp1,
        {
            'cu_runtime': runtime
        }
    )
    
    orte_ttc[runtime] = []
    orte_execq[runtime] = []
    for sid in orte_ids[runtime]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # TTC
            orte_ttc[runtime].append(df['ewa_complete'].max() - df['a_to_s'].min())

orte_df = pd.DataFrame(orte_ttc)
stddev = orte_df.std()

ax = orte_df.mean().plot(kind='bar', yerr=stddev)
#ax.set_title('TTC to execute 512 x single core CUs of X seconds.')
ax.set_ylabel('TTC (s)')
ax.set_xlabel('CU duration (s)')
mp.pyplot.savefig('../plots/exp1_ttc_var-cu-duration.pdf')



In [50]:
#
# Calculate TTC, in relation to execution time / optimal duration
#
cu_runtime = [0, 1, 10, 30, 60, 120, 300, 600]
orte_ids = {}
orte_ttc = {}
#orte_optimal = {}
orte_run = {}

for runtime in cu_runtime:

    orte_ids[runtime] = select_exp_from_repo(
        exp1,
        {
            'cu_runtime': runtime
        }
    )
    
    orte_ttc[runtime] = []
    #orte_optimal[runtime] = []
    orte_run[runtime] = []
    for sid in orte_ids[runtime]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # TTC
            orte_ttc[runtime].append(df['ewa_complete'].max() - df['a_to_s'].min())
            
            # Cumulative runtime
            orte_run[runtime].append(
                (exp1[sid]['cu_count'] * (df['ewa_complete'] - df['ewo_launch']).mean()) /
                exp1[sid]['pilot_cores'])

            # Optimal runtime
            #orte_optimal[runtime].append(
            #    (exp1[sid]['cu_count'] * exp1[sid]['cu_runtime']) /
            #    exp1[sid]['pilot_cores'])

orte_df = pd.DataFrame()
orte_df['ttc'] = pd.DataFrame(orte_ttc).mean()

#orte_df['optimal'] = pd.DataFrame(orte_optimal).mean()
#orte_df['overhead'] = orte_df['ttc'] - orte_df['optimal']
#orte_df[['optimal', 'overhead']].plot(kind='bar', stacked=True)

orte_df['Execution'] = pd.DataFrame(orte_run).mean()
orte_df['ExecWorker Queue'] = orte_df['ttc'] - orte_df['Execution']

ax = orte_df[['ExecWorker Queue', 'Execution']].plot(kind='bar', stacked=True)

#ax.set_title('TTC of 512 single core CUs with varying runtimes.')
ax.set_ylabel('TTC (s)')
ax.set_xlabel('CU duration (s)')
mp.pyplot.savefig('../plots/exp1_ttc_var-cu-duration_split.pdf')



In [55]:
#
# Calculate efficiency
#
cu_runtime = [0, 1, 10, 30, 60, 120, 300, 600]
orte_ids = {}
orte_ttc = {}
orte_run = {}

for runtime in cu_runtime:

    orte_ids[runtime] = select_exp_from_repo(
        exp1,
        {
            'cu_runtime': runtime
        }
    )
    
    orte_ttc[runtime] = []
    orte_run[runtime] = []
    for sid in orte_ids[runtime]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # TTC
            orte_ttc[runtime].append(df['ewa_complete'].max() - df['a_to_s'].min())
            
            # Cumulative runtime
            orte_run[runtime].append(
                (exp1[sid]['cu_count'] * (df['ewa_complete'] - df['ewo_launch']).mean()) /
                exp1[sid]['pilot_cores'])


orte_df = pd.DataFrame()
orte_df['ttc'] = pd.DataFrame(orte_ttc).mean()

orte_df['Execution'] = pd.DataFrame(orte_run).mean()
orte_df['ExecWorker Queue'] = orte_df['ttc'] - orte_df['Execution']
orte_df['Efficiency'] = orte_df['Execution'] / orte_df['ttc'] * 100

# TODO: error bars
ax = orte_df['Efficiency'].plot(kind='bar')

#ax.set_title('Efficiency of core utilization for single core CUs with various runtimes.')
ax.set_ylabel('Core Utilization Efficiency (%)')
ax.set_xlabel('CU duration (s)')
mp.pyplot.savefig('../plots/exp1_var-cu-duration_eff.pdf')



In [69]:
#
# Calculate Exec Queue overhead
#
cu_runtime = [0, 1, 10, 30, 60, 120, 300, 600]
orte_ids = {}
orte_ttc = {}
orte_execq = {}

for runtime in cu_runtime:

    orte_ids[runtime] = select_exp_from_repo(
        exp1,
        {
            'cu_runtime': runtime
        }
    )
    
    orte_ttc[runtime] = []
    orte_execq[runtime] = []
    for sid in orte_ids[runtime]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # Exec Queue time
            orte_execq[runtime].append((df['ewo_get'] - df['s_to_ewo']).mean())
            
orte_df = pd.DataFrame(orte_execq)
stddev = orte_df.std()

ax = orte_df.mean().plot(kind='bar', yerr=stddev)
#ax.set_title('ExecWorker Queuing time for varying CU duration.')
ax.set_ylabel('Mean ExecWorker Queuing (s)')
ax.set_xlabel('CU duration (s)')
mp.pyplot.savefig('../plots/exp1_var-cu-duration_queueing.pdf')


Analysis of Experiment 2

  • Fixed CU duration (60)
  • Fixed backend (ORTE)
  • Variable CU count (4-1024)
  • Variable CU cores (1-256)
  • CU = /bin/sleep
  • Fixed Pilot cores (256)

Goal: Investigate the relative overhead of small tasks compared to larger tasks
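
As a quick sanity check (a sketch, not part of the original analysis): every exp2 configuration keeps cu_count * cu_cores constant, so each run fills the 256-core pilot the same number of times ("generations").

In [ ]:
# Sanity check sketch: every exp2 run requests the same total core-count
# (cu_count * cu_cores), i.e. the same number of pilot "generations".
for sid in sorted(exp2.keys()):
    cfg = exp2[sid]
    generations = cfg['cu_count'] * cfg['cu_cores'] / float(cfg['pilot_cores'])
    print '%s: %3d cores/CU x %4d CUs = %.0f generations' % \
        (sid, cfg['cu_cores'], cfg['cu_count'], generations)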


In [70]:
#
# Calculate TTC
#
cu_cores = [1, 2, 4, 8, 16, 32, 64, 128, 256]
orte_ids = {}
orte_ttc = {}

for cores in cu_cores:

    orte_ids[cores] = select_exp_from_repo(
        exp2,
        {
            'cu_cores': cores
        }
    )
    
    orte_ttc[cores] = []
    for sid in orte_ids[cores]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            orte_ttc[cores].append(df['ewa_complete'].max() - df['a_to_s'].min())

orte_df = pd.DataFrame(orte_ttc)
stddev = orte_df.std()

ax = orte_df.mean().plot(kind='bar', yerr=stddev)
#ax.set_title('TTC to consume 256 cores (8*32 core nodes),\nfor 4x60 seconds, with varying cores per CU.')
ax.set_ylabel('TTC (s)')
ax.set_xlabel('# cores per CU')
#ax.set_ylim(200, 400)
mp.pyplot.savefig('../plots/exp2_ttc_var-cu-cores.pdf')



In [58]:
#
# Calculate TTC, split up
#
cu_cores = [1, 2, 4, 8, 16, 32, 64, 128, 256]
orte_ids = {}
orte_run = pd.DataFrame()
orte_alloc = pd.DataFrame()
orte_execq = pd.DataFrame()
orte_total = pd.DataFrame()

for cores in cu_cores:

    orte_ids[cores] = select_exp_from_repo(
        exp2,
        {
            'cu_cores': cores,
        }
    )
    
    for sid in orte_ids[cores]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
                
            dfr = df['ewa_complete'] - df['ewo_launch']
            if cores in orte_run:
                orte_run[cores] = pd.concat([orte_run[cores], dfr], ignore_index=True)
            else:
                orte_run[cores] = dfr

            dfq = df['ewo_get'] - df['s_to_ewo']
            if cores in orte_execq:
                orte_execq[cores] = pd.concat([orte_execq[cores], dfq], ignore_index=True)
            else:
                orte_execq[cores] = dfq
            
orte_df = pd.DataFrame()

orte_df['ExecWorker Queue'] = orte_execq.mean()
orte_df['Execution'] = orte_run.mean()

ax = orte_df.plot(kind='bar', stacked=True)

#ax.set_title('CU WorkQueue and Execution time with varying cores per CU.')
ax.set_ylabel('time (s)')
ax.set_xlabel('# cores per CU')
mp.pyplot.savefig('../plots/exp2_var-cu-cores_split.pdf')



In [59]:
#
# Core utilization for varying core counts
#
cu_cores = [1, 2, 4, 8, 16, 32, 64, 128, 256]
orte_ids = {}
orte_run = {}
orte_ttc = {}

for cores in cu_cores:

    orte_ids[cores] = select_exp_from_repo(
        exp2,
        {
            'cu_cores': cores,
        }
    )
    
    orte_ttc[cores] = []
    orte_run[cores] = []

    for sid in orte_ids[cores]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
                
            # Cumulative runtime
            orte_run[cores].append(
                ((df['ewa_complete']-df['ewo_launch'] )).mean() *
                (exp2[sid]['cu_count'] * exp2[sid]['cu_cores']) / exp2[sid]['pilot_cores'] )
                    
            orte_ttc[cores].append(df['ewa_complete'].max() - df['a_to_s'].min())
            
orte_df = pd.DataFrame()

orte_df['ttc'] = pd.DataFrame(orte_ttc).mean()

orte_df['Execution'] = pd.DataFrame(orte_run).mean()
orte_df['Efficiency'] = orte_df['Execution'] / orte_df['ttc'] * 100

# TODO: error bars
ax = orte_df['Efficiency'].plot(kind='bar', stacked=True)

#ax.set_title('Core utilization for varying core counts per CU.')
ax.set_ylabel('Core Utilization Efficiency (%)')
ax.set_xlabel('# cores per CU')
mp.pyplot.savefig("../plots/exp2_var-cu-cores_eff.pdf")


Analysis of Experiment 3

  • Fixed CU duration (0s)
  • Fixed backend (ORTE)
  • Fixed CU count (512)
  • Fixed CU cores (1)
  • CU = /bin/sleep
  • Fixed Pilot cores (256)
  • Variable number of exec workers (1-8)

Goal: Investigate the effect of number of exec workers
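
Beyond the TTC and launch-rate plots below, the speedup relative to the single-worker baseline can be derived from the same per-worker DataFrame; a sketch, assuming orte_df as built in the TTC cell that follows:

In [ ]:
# Sketch: relative speedup over one ExecWorker, from the per-worker mean TTC.
# Assumes orte_df = pd.DataFrame(orte_ttc) as constructed in the next cell.
# mean_ttc = orte_df.mean()
# speedup = mean_ttc[1] / mean_ttc
# speedup.plot(kind='bar')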


In [65]:
#
# Calculate TTC
#
num_workers = range(1,9)
orte_ids = {}
orte_ttc = {}
orte_execq = {}

for workers in num_workers:

    orte_ids[workers] = select_exp_from_repo(
        exp3,
        {
            'number_of_workers': workers
        }
    )
    
    orte_ttc[workers] = []
    orte_execq[workers] = []
    for sid in orte_ids[workers]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # TTC
            orte_ttc[workers].append(df['ewa_complete'].max() - df['a_to_s'].min())

orte_df = pd.DataFrame(orte_ttc)
stddev = orte_df.std()

ax = orte_df.mean().plot(kind='bar', yerr=stddev)
#ax.set_title('TTC to execute 512 single core CUs of 0 seconds.')
ax.set_ylabel('time (s)')
ax.set_xlabel('# ExecWorkers')
mp.pyplot.savefig("../plots/exp3_ttc_var-exec-workers.pdf")



In [66]:
#
# Launch rate
#
num_workers = range(1,9)
orte_ids = {}
orte_ttc = {}
orte_execq = {}

for workers in num_workers:

    orte_ids[workers] = select_exp_from_repo(
        exp3,
        {
            'number_of_workers': workers
        }
    )
    
    orte_ttc[workers] = []
    orte_execq[workers] = []
    for sid in orte_ids[workers]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
            
            # Launch rate: completed CUs per second of TTC
            orte_ttc[workers].append(exp3[sid]['cu_count'] / (df['ewa_complete'].max() - df['a_to_s'].min()))

orte_df = pd.DataFrame(orte_ttc)
stddev = orte_df.std()

ax = orte_df.mean().plot(kind='bar', yerr=stddev)
#ax.set_title('Launch rates for single core CUs of 0 seconds with varying number of ExecWorkers.')
ax.set_ylabel('Launch rate (Executions/s)')
ax.set_xlabel('# ExecWorkers')
mp.pyplot.savefig("../plots/exp3_var-exec-workers_launchrate.pdf")



In [67]:
#
# Calculate CU time, split up
#
num_workers = range(1,9)
orte_ids = {}
orte_run = pd.DataFrame()
orte_execq = pd.DataFrame()

for workers in num_workers:

    orte_ids[workers] = select_exp_from_repo(
        exp3,
        {
            'number_of_workers': workers,
        }
    )
    
    for sid in orte_ids[workers]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))
                
            dfr = df['ewa_complete'] - df['ewo_launch']
            if workers in orte_run:
                orte_run[workers] = pd.concat([orte_run[workers], dfr], ignore_index=True)
            else:
                orte_run[workers] = dfr

            dfq = df['ewo_get'] - df['s_to_ewo']
            if workers in orte_execq:
                orte_execq[workers] = pd.concat([orte_execq[workers], dfq], ignore_index=True)
            else:
                orte_execq[workers] = dfq
            
orte_df = pd.DataFrame()

orte_df['ExecWorker Queue'] = orte_execq.mean()
orte_df['Execution'] = orte_run.mean()

ax = orte_df.plot(kind='bar', stacked=True)

#ax.set_title('CU ExecWorker Queuing and Execution time (s),\nwith varying number of ExecWorkers.')
ax.set_ylabel('time (s)')
ax.set_xlabel('# ExecWorkers')
mp.pyplot.savefig("../plots/exp3_var-exec-workers_split.pdf")



In [68]:
#
# Plot ExecWorker Queuing Delay, for varying number of ExecWorkers, with launches shown over time.
#
num_workers = range(1,9)
orte_ids = {}
orte_execq = pd.DataFrame()

for workers in num_workers:

    orte_ids[workers] = select_exp_from_repo(
        exp3,
        {
            'number_of_workers': workers,
            'iteration': 2 # Use a single iteration only, as these per-CU series are difficult to combine across runs
        }
    )
    
    for sid in orte_ids[workers]:

        norm_sid = normalize_id(sid)
                  
        # Get multiple pilots from session
        for pid in store.get('%s/pilots' % norm_sid)['pid']:
            df = store.get('/%s/prof/%s' % (norm_sid, normalize_id(pid)))

            df['execq'] = df['ewo_get'] - df['s_to_ewo']
 
            dfq = pd.Series(df.sort('s_to_ewo')['execq'])
            dfq.index = range(exp3[sid]['cu_count'])
            orte_execq[workers] = dfq

ax = orte_execq.plot(colormap='Paired')

ax.set_title('Per CU ExecWorker Queuing time,\nfor different number of ExecWorkers.')
ax.set_ylabel('Exec Worker Queuing time (s)')
ax.set_xlabel('CU instances (chronological)')
mp.pyplot.savefig("../plots/exp3_var-exec-workers_chrono.pdf")


Parking Place