In [2]:
import sys
import os.path
import re
from glob import glob
from datetime import datetime
In [3]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
matplotlib.rcParams.update({'font.size': 9})
matplotlib.rcParams.update({'figure.autolayout': True})
from matplotlib import dates
import seaborn as sns
%matplotlib inline
Function to read Single Shared File (SSF) benchmark results from an output file
In [4]:
def readResultsSSF(filename):
    """Parse a Single Shared File (SSF) benchio result file.

    Returns a list of dicts, one per timing measurement. Each dict carries
    the run metadata (JobID, Writers, LocalSize, GlobalSize, TotData), the
    Lustre striping setting, the measured write bandwidth, the source file
    path, and the run date taken from the trailing _YYYYMMDDHHMMSS
    timestamp in the file name.
    """
    # Date of the test is encoded after the last '_' in the file name.
    datestring = filename.split('_')[-1].split('.')[0]
    runtime = datetime.strptime(datestring, "%Y%m%d%H%M%S")

    resdict = {'JobID': 'Unknown'}
    resframe_proto = []

    # Read the file once; both scans below iterate over the cached lines.
    # (The original opened the file twice and leaked the handles on error.)
    with open(filename, 'r') as infile:
        lines = infile.readlines()

    # First scan: run metadata from the header (everything before 'MPI-IO').
    for line in lines:
        if re.search('MPI-IO', line):
            break
        elif re.search('Starting job', line):
            resdict['JobID'] = line.split()[2]
        elif re.search('Running', line):
            resdict['Writers'] = int(line.split()[2])
        elif re.search('Array', line):
            tokens = line.split()
            resdict['LocalSize'] = (int(tokens[4]), int(tokens[6]), int(tokens[8]))
        elif re.search('Global', line):
            tokens = line.split()
            resdict['GlobalSize'] = (int(tokens[4]), int(tokens[6]), int(tokens[8]))
        elif re.search('Total', line):
            resdict['TotData'] = float(line.split()[5])

    # Second scan: individual timing results (everything before 'HDF5').
    timedict = resdict.copy()
    for line in lines:
        if re.search('HDF5', line):
            break
        elif re.search('Writing to', line):
            tokens = line.split()
            if re.match('striped', tokens[2]):
                timedict['Striping'] = -1   # -1 encodes maximum striping
            elif re.match('defstriped', tokens[2]):
                timedict['Striping'] = 4    # default stripe count
        elif re.match(' time', line):
            timedict['Write'] = float(line.split()[6])
            timedict['File'] = os.path.abspath(filename)
            timedict['RunDate'] = runtime
            timedict['Count'] = 1
            resframe_proto.append(timedict)
            # Striping is only reported once per set, so carry it forward
            # for all following measurements in the set.
            # NOTE(review): raises KeyError if a timing line precedes the
            # first 'Writing to' line -- same as the original behaviour.
            curstriping = timedict['Striping']
            timedict = resdict.copy()
            timedict['Striping'] = curstriping

    return resframe_proto
In [5]:
def readResultsFPP(filename):
    """Parse a File Per Process (FPP) benchio result file.

    Returns a list of dicts, one per timing measurement. Each dict carries
    the run metadata (JobID, Writers, LocalSize, GlobalSize, TotData), the
    striping setting (1 for unstriped FPP output), the measured write
    bandwidth, the source file path, and the run date taken from the
    trailing _YYYYMMDDHHMMSS timestamp in the file name.
    """
    # Date of the test is encoded after the last '_' in the file name.
    datestring = filename.split('_')[-1].split('.')[0]
    runtime = datetime.strptime(datestring, "%Y%m%d%H%M%S")

    resdict = {'JobID': 'Unknown'}
    resframe_proto = []

    # Read the file once; both scans below iterate over the cached lines.
    # (The original opened the file twice and leaked the handles on error.)
    with open(filename, 'r') as infile:
        lines = infile.readlines()

    # First scan: run metadata from the header (everything before 'MPI-IO').
    for line in lines:
        if re.search('MPI-IO', line):
            break
        elif re.search('Starting job', line):
            resdict['JobID'] = line.split()[2]
        elif re.search('Running', line):
            resdict['Writers'] = int(line.split()[2])
        elif re.search('Array', line):
            tokens = line.split()
            resdict['LocalSize'] = (int(tokens[4]), int(tokens[6]), int(tokens[8]))
        elif re.search('Global', line):
            tokens = line.split()
            resdict['GlobalSize'] = (int(tokens[4]), int(tokens[6]), int(tokens[8]))
        elif re.search('Total', line):
            resdict['TotData'] = float(line.split()[5])

    # Second scan: individual timing results (everything before 'Finished').
    timedict = resdict.copy()
    for line in lines:
        if re.search('Finished', line):
            break
        elif re.search('Writing to', line):
            tokens = line.split()
            if re.match('unstriped', tokens[2]):
                timedict['Striping'] = 1    # FPP files are unstriped
        elif re.match(' time', line):
            timedict['Write'] = float(line.split()[6])
            timedict['File'] = os.path.abspath(filename)
            timedict['RunDate'] = runtime
            timedict['Count'] = 1
            resframe_proto.append(timedict)
            # Striping is only reported once per set, so carry it forward
            # for all following measurements in the set.
            # NOTE(review): raises KeyError if a timing line precedes the
            # first 'Writing to' line -- same as the original behaviour.
            curstriping = timedict['Striping']
            timedict = resdict.copy()
            timedict['Striping'] = curstriping

    return resframe_proto
In [26]:
# Collect every SSF benchmark result file, in chronological (name) order
indir = '../benchio/results/32/ARCHER/fs3'
ssflist = sorted(glob(os.path.join(indir, 'benchio_res_*')))
In [27]:
# Parse every SSF result file and flatten the per-measurement records
tssfframe = [record
             for fname in ssflist
             for record in readResultsSSF(fname)]
Create the dataframe and restrict to maximum striping results
In [28]:
# Build the SSF dataframe and keep only the maximum-striping (-1) runs
ssfframe = pd.DataFrame(tssfframe)
max_striping = ssfframe['Striping'] == -1
ssfframe = ssfframe.loc[max_striping]
A histogram (see below) clearly shows two different performance regimes:
In [29]:
fig, ax = plt.subplots()
# sns.distplot is deprecated (removed in seaborn 0.14); histplot is the
# replacement for a plain histogram (old kde=False, rug=False behaviour)
sns.histplot(ssfframe['Write'], bins=30, ax=ax)
ax.set_xlabel('Bandwidth / MiB/s')
ax.set_ylabel('Count')
Out[29]:
The normalized cumulative distribution function (CDF, see below) shows that:
In [30]:
# 'normed' was removed from matplotlib (3.1); 'density' is its replacement
plt.hist(ssfframe['Write'].tolist(), bins=30, histtype='step', linewidth=2,
         alpha=0.5, cumulative=True, density=True)
plt.xlabel('Bandwidth / MiB/s')
# With density=True the cumulative y axis is a fraction, not a raw count
plt.ylabel('Cumulative fraction')
Out[30]:
Aggregate the performance results for each day to assist analysis
In [31]:
# Per-day aggregates: bandwidth spread (min/max), central tendency
# (median/mean) and the number of measurements contributing to each day
groupf = {'Write': ['min', 'median', 'max', 'mean'], 'Count': 'sum'}
datessf = ssfframe.groupby(by='RunDate').aggregate(groupf)
In all the following plots the shaded area spans the minimum to maximum bandwidth for that day and the line shows the median bandwidth.
First, we show the full range of data and then zoom in on each of the 6 month periods. There was a gap in data collection from May-July 2017.
In [32]:
fig = plt.figure(1)
ax = plt.subplot(1, 1, 1)
fig.autofmt_xdate()
ax.set_ylabel('Bandwidth / MiB/s')
# plot_date is deprecated (matplotlib 3.9+); plain plot handles datetime
# x-values natively and applies the date locator/formatter automatically
ax.plot(datessf.index.tolist(), datessf['Write', 'median'].tolist(),
        '-', linewidth=0.75)
# Shaded band spans the day's minimum-to-maximum bandwidth
ax.fill_between(datessf.index.tolist(), datessf['Write', 'min'].tolist(),
                datessf['Write', 'max'].tolist(), alpha=0.25)
Out[32]:
In [33]:
# Zoom: Jul 2016 - Jan 2017
start = datetime(2016, 7, 1)
end = datetime(2017, 1, 1)
fig = plt.figure(1)
ax = plt.subplot(1, 1, 1)
fig.autofmt_xdate()
ax.set_ylabel('Bandwidth / MiB/s')
ax.set_xlim([start, end])
# plot_date is deprecated (matplotlib 3.9+); plain plot handles datetimes
ax.plot(datessf.index.tolist(), datessf['Write', 'median'].tolist(),
        '-', linewidth=0.75)
ax.fill_between(datessf.index.tolist(), datessf['Write', 'min'].tolist(),
                datessf['Write', 'max'].tolist(), alpha=0.25)
Out[33]:
In [34]:
# Zoom: Jan 2017 - May 2017
start = datetime(2017, 1, 1)
end = datetime(2017, 5, 1)
fig = plt.figure(1)
ax = plt.subplot(1, 1, 1)
fig.autofmt_xdate()
ax.set_ylabel('Bandwidth / MiB/s')
ax.set_xlim([start, end])
# plot_date is deprecated (matplotlib 3.9+); plain plot handles datetimes
ax.plot(datessf.index.tolist(), datessf['Write', 'median'].tolist(),
        '-', linewidth=0.75)
ax.fill_between(datessf.index.tolist(), datessf['Write', 'min'].tolist(),
                datessf['Write', 'max'].tolist(), alpha=0.25)
Out[34]:
In [35]:
# Zoom: Jul 2017 - Oct 2017
start = datetime(2017, 7, 1)
end = datetime(2017, 10, 1)
fig = plt.figure(1)
ax = plt.subplot(1, 1, 1)
fig.autofmt_xdate()
ax.set_ylabel('Bandwidth / MiB/s')
ax.set_xlim([start, end])
# plot_date is deprecated (matplotlib 3.9+); plain plot handles datetimes
ax.plot(datessf.index.tolist(), datessf['Write', 'median'].tolist(),
        '-', linewidth=0.75)
ax.fill_between(datessf.index.tolist(), datessf['Write', 'min'].tolist(),
                datessf['Write', 'max'].tolist(), alpha=0.25)
Out[35]:
This final period (Jul-Sep 2017) generally shows less variation in the median than the earlier periods, possibly indicating more consistent performance.
In [19]:
# Collect every FPP benchmark result file, in chronological (name) order
indir = '../benchio_fpp/results/32/ARCHER/fs3'
fpplist = sorted(glob(os.path.join(indir, 'benchio_res_*')))
In [20]:
# Parse every FPP result file and flatten the per-measurement records
tfppframe = [record
             for fname in fpplist
             for record in readResultsFPP(fname)]
In [21]:
# Build the FPP dataframe and keep only the unstriped (Striping == 1) runs
fppframe = pd.DataFrame(tfppframe)
unstriped = fppframe['Striping'] == 1
fppframe = fppframe.loc[unstriped]
In [22]:
fig, ax = plt.subplots()
# sns.distplot is deprecated (removed in seaborn 0.14); histplot is the
# replacement for a plain histogram (old kde=False, rug=False behaviour)
sns.histplot(fppframe['Write'], bins=30, ax=ax)
ax.set_xlabel('Bandwidth / MiB/s')
ax.set_ylabel('Count')
Out[22]:
The normalized CDF (below) shows that the performance distribution for FPP is roughly normal, unlike that for SSF.
In [23]:
# 'normed' was removed from matplotlib (3.1); 'density' is its replacement
plt.hist(fppframe['Write'].tolist(), bins=30, histtype='step', linewidth=2,
         alpha=0.5, cumulative=True, density=True)
plt.xlabel('Bandwidth / MiB/s')
# With density=True the cumulative y axis is a fraction, not a raw count
plt.ylabel('Cumulative fraction')
Out[23]:
In [24]:
# Per-day FPP aggregates, using the same aggregation spec as the SSF analysis
datefpp = fppframe.groupby(by='RunDate').aggregate(groupf)
In [25]:
fig = plt.figure(1)
ax = plt.subplot(1, 1, 1)
fig.autofmt_xdate()
ax.set_ylabel('Bandwidth / MiB/s')
# plot_date is deprecated (matplotlib 3.9+); plain plot handles datetime
# x-values natively and applies the date locator/formatter automatically
ax.plot(datefpp.index.tolist(), datefpp['Write', 'median'].tolist(),
        '-', linewidth=0.75)
# Shaded band spans the day's minimum-to-maximum bandwidth
ax.fill_between(datefpp.index.tolist(), datefpp['Write', 'min'].tolist(),
                datefpp['Write', 'max'].tolist(), alpha=0.25)
Out[25]:
In [ ]: