In [110]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import datetime
%cd /Users/dane/src/awrjs
import js.js_pd as js_pd
c_xem = '#063CBF'
c_ana = '#EA3C00'
In [111]:
sim = js_pd.read_and_validate('/Users/dane/data/a/js/output/awr_v14a.csv')
# in this data set there are a couple of points in October that we want to trim off
sim = sim[sim.submitted_date >= '2017-01-01'].copy()
summary = {}
summary['start'] = sim.iloc[0].submitted_date
summary['end'] = sim.iloc[-1].submitted_date
summary['sub_cnt'] = len(sim)
summary['comp_cnt'] = sim.duration_m.count()
summary['users'] = len(sim.user.value_counts())
summary['tot_hrs'] = int(sim.duration_m.sum() / 60.0)
summary['big'] = sim.working_set.max() / 1000.
summary['machines'] = len(sim.host.value_counts())
In [112]:
# fix dates
sim.submitted_date = pd.to_datetime(sim.submitted_date)
sim.start_date = pd.to_datetime(sim.start_date)
sim.wait_m = pd.to_numeric(sim.wait_m)
In [113]:
# encode the submission month as a sortable YYYY.MM string, e.g. 2017.01
sim['sub_month'] = sim.submitted_date.apply(lambda x: "{:.2f}".format(x.year + x.month / 100.))
sim.columns
Out[113]:
In [114]:
# unexpected simulator names occasionally appear in the logs; we only keep the three we care about
print(len(sim))
sim = sim[sim.simulator.isin(['AXIEM', 'Analyst', 'EM_3rd_Party'])]
print(len(sim))
In [115]:
print("""
The log files analyzed contain data from {start} to {end}.
There are a total of {sub_cnt:,} jobs submitted of which {comp_cnt:,} completed.
There are {users} separate users that have run jobs for a total of {tot_hrs:,} hours of
of simulation on {machines} machines.
The largest job required {big} Gb of memory.
""".format(**summary)
)
In [116]:
em_pvt = pd.pivot_table(sim, index=['sub_month'], columns=['simulator'], values=['duration_m'],
                        aggfunc=[len, sum, np.mean], fill_value=0)
em_pvt.head()
num_sims = len(sim.simulator.unique())
num_months = len(sim.sub_month.unique())
fig, ax = plt.subplots(num_sims, 2, figsize=(22, 12))
for i, sim_name in enumerate(sim.simulator.unique()):
    ind = np.arange(num_months)
    # left column: number of jobs per month; right column: total hours per month
    ax[i, 0].bar(ind, em_pvt['len', 'duration_m', sim_name], width=0.35, color=c_xem)
    ax[i, 0].set(xlabel='Month',
                 ylabel='{}'.format(sim_name),
                 xticks=ind,
                 xticklabels=em_pvt['len', 'duration_m', sim_name].index)
    ax[i, 1].bar(ind, em_pvt['sum', 'duration_m', sim_name] / 60., width=0.35, color=c_xem)
    ax[i, 1].set(xlabel='Month',
                 ylabel='{}'.format(sim_name),
                 xticks=ind,
                 xticklabels=em_pvt['sum', 'duration_m', sim_name].index)
ax[0, 0].set_title('Number of Jobs')
ax[0, 1].set_title('Total Hours')
plt.show()
In [117]:
fig, ax = plt.subplots(num_sims, 1, figsize=(14, 12))
for i, sim_name in enumerate(sim.simulator.unique()):
    ind = np.arange(num_months)
    ax[i].bar(ind, em_pvt['mean', 'duration_m', sim_name], width=0.35, color=c_xem)
    ax[i].set(xlabel='Month',
              ylabel='{} Jobs (min)'.format(sim_name),
              xticks=ind,
              xticklabels=em_pvt['mean', 'duration_m', sim_name].index)
ax[0].set_title('Average Duration of Jobs')
plt.show()
In [118]:
bins = [0, 1, 5, 10, 60, 300, 5000]
comp = js_pd.successful_jobs(sim)
axiem, bins = np.histogram(comp[comp.simulator=='AXIEM'].duration_m, bins=bins)
analyst, bins = np.histogram(comp[comp.simulator=='Analyst'].duration_m, bins=bins)
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.subplot(121)
plt.bar(np.arange(len(bins)-1), axiem, .6, color=c_xem)
plt.gca().set_xticklabels(['<1m', '1-5', '5-10', '10-60', '60-300', '>300'])
plt.gca().set_xticks(np.arange(len(bins)-1))
plt.xlabel('Time range in minutes')
plt.ylabel('Number of Jobs')
plt.title('AXIEM Jobs by Duration')
plt.subplot(122)
plt.bar(np.arange(len(bins)-1), analyst, .6, color=c_ana)
plt.gca().set_xticklabels(['<1m', '1-5', '5-10', '10-60', '60-300', '>300'])
plt.gca().set_xticks(np.arange(len(bins)-1))
plt.title('Analyst Jobs by Duration')
plt.xlabel('Time range in minutes')
plt.ylabel('Number of Jobs')
plt.show()
In [119]:
comp_v12 = comp[comp.working_set.notnull()]  # to look at memory we keep only v12+ jobs, which record the working set
bins1 = [0, 500, 1000, 2000, 4000, 8000, 90000]
axiem, bins = np.histogram(comp_v12[comp_v12.simulator=='AXIEM'].working_set, bins=bins1)
bins2 = [0, 1000, 2000, 4000, 8000, 16000, 90000]
analyst, bins = np.histogram(comp_v12[comp_v12.simulator=='Analyst'].working_set, bins=bins2)
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.subplot(121)
plt.bar(np.arange(len(bins1)-1), axiem, .6, color=c_xem)
plt.gca().set_xticklabels(['<500m', '.5-1G', '1-2G', '2-4G', '4-8G', '>8G'])
plt.gca().set_xticks(np.arange(len(bins)-1))
plt.title('AXIEM Jobs by Peak Working Set')
plt.subplot(122)
plt.bar(np.arange(len(bins2)-1), analyst, .6, color=c_ana)
plt.gca().set_xticklabels(['<1G', '1-2G', '2-4G', '4-8G', '8-16G', '>16G'])
plt.gca().set_xticks(np.arange(len(bins)-1))
plt.title('Analyst Jobs by Peak Working Set')
plt.show()
In [120]:
# Look at large memory jobs only
comp_large = comp[comp.working_set >= 8000]  # keep only jobs with a peak working set of at least 8 GB
bins1 = list(range(8000, 21000, 2000))
axiem, bins = np.histogram(comp_large[comp_large.simulator=='AXIEM'].working_set, bins=bins1)
bins2 = list(range(8000, 21000, 2000))
analyst, bins = np.histogram(comp_large[comp_large.simulator=='Analyst'].working_set, bins=bins2)
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.subplot(121)
plt.bar(np.arange(len(bins1)-1), axiem, .6, color=c_xem)
plt.gca().set_xticks(np.arange(len(bins1)-1))
plt.gca().set_xticklabels(['8-10G', '10-12G', '12-14G', '14-16G', '16-18G', '18-20G'])
plt.title('AXIEM Jobs by Peak Working Set (large jobs)')
plt.subplot(122)
plt.bar(np.arange(len(bins2)-1), analyst, .6, color=c_ana)
plt.gca().set_xticks(np.arange(len(bins2)-1))
plt.gca().set_xticklabels(['8-10G', '10-12G', '12-14G', '14-16G', '16-18G', '18-20G'])
plt.title('Analyst Jobs by Peak Working Set (large jobs)')
plt.show()
In [121]:
fig = plt.gcf()
fig.set_size_inches(18,10)
d = comp_v12[ comp_v12.duration_m < 3000 ]
color = d.simulator.apply(lambda x: c_xem if x=='AXIEM' else c_ana)
plt.scatter(d.duration_m, d.working_set/1024, c=color, s=80, alpha=0.5)
plt.ylabel('Peak Working Set (GB)')
plt.xlabel('Runtime (minutes)')
plt.title('Runtime vs Memory by Simulator')
xem_patch = mpatches.Patch(color=c_xem, label='AXIEM')
ana_patch = mpatches.Patch(color=c_ana, label='Analyst')
plt.legend(handles=[xem_patch, ana_patch], bbox_to_anchor=[0.7,0.8])
plt.show()
In [122]:
fig = plt.gcf()
fig.set_size_inches(18,10)
x_ind = comp_v12.submitted_date.apply(lambda x: (x-comp_v12.iloc[0].submitted_date).days)
# note: colors must be computed from comp_v12 (not the truncated frame d) so the lengths match
color = comp_v12.simulator.apply(lambda x: c_xem if x=='AXIEM' else c_ana)
plt.scatter(x_ind, comp_v12.working_set/1024, c=color, s=80, alpha=0.5)
plt.ylabel('Peak Working Set (GB)')
plt.xlabel('Days Since Log Started')
plt.title('Peak Working Set over Time')
# plt.xlim([250,600])
# plt.ylim([0,30])
xem_patch = mpatches.Patch(color=c_xem, label='AXIEM')
ana_patch = mpatches.Patch(color=c_ana, label='Analyst')
plt.legend(handles=[xem_patch, ana_patch], bbox_to_anchor=[0.85,0.95])
plt.show()
In [123]:
fig = plt.gcf()
fig.set_size_inches(12,8)
bins = [0, 1, 5, 30, 60, 240, 20000]
data, bins = np.histogram(comp.wait_m, bins=bins)
plt.bar(np.arange(len(bins)-1), data, .6)
plt.gca().set_xticks(np.arange(len(bins)-1))
plt.gca().set_xticklabels(['<1m', '1-5m', '5-30m', '30-60m', '1-4h', '>4h'])
plt.title('Simulation Wait Times (time from submit to start)')
plt.ylabel('Number of jobs')
plt.show()
print('Total wait time was {:,} hours'.format(int(sim.wait_m.sum()/60.0)))
In [124]:
wait_pvt = comp.pivot_table(index=['sub_month'], values='wait_m', aggfunc=sum)
fig = plt.gcf()
fig.set_size_inches(18,5)
x_ind = np.arange(len(wait_pvt))
plt.bar(x_ind, wait_pvt['wait_m'].values / 60., width=0.8)
plt.title('Total Wait Time per Month')
plt.gca().set(xlabel='Month',
              ylabel='Wait Time (hours)',
              xticks=x_ind,
              xticklabels=wait_pvt.index)
plt.show()
In [125]:
js_pd.jobs_by_type(sim)
Out[125]:
In [126]:
by_user = comp.groupby('user')
by_user_hours = by_user.duration_m.sum() / 60.0
pos = np.arange(len(by_user_hours))+.5
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.barh(pos,by_user_hours, align='center')
plt.yticks(pos, by_user_hours.index)
plt.xlabel('Simulation Time (hours)')
plt.title('Total Simulation Time by User')
plt.show()
In [127]:
by_user = comp.groupby('user')
by_user_count = by_user.user.count()
pos = np.arange(len(by_user_count)) + .5
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.barh(pos, by_user_count, align='center')
plt.yticks(pos, by_user_count.index)
plt.xlabel('Number of Simulation Runs')
plt.title('Simulations by User')
plt.show()
In [128]:
data = comp[~comp.user.isin(['Mark','tab','jomoore','chamilto', 'john', 'ae','jcarrol'])]
x=pd.pivot_table(data, index=['user'], columns='sub_month', values='duration_m', aggfunc=len, fill_value=0)
y=pd.pivot_table(data, index=['user'], columns='sub_month', values='duration_m', aggfunc=sum, fill_value=0)
graphs = len(x.index)
j=1
plt.figure(1, figsize=(14,graphs*3))
for user in x.index:
    plt.subplot(graphs, 2, j)
    j += 1
    plt.plot(x.T[user], linewidth=2)
    plt.title('Jobs Run per Month for {}'.format(user))
    plt.subplot(graphs, 2, j)
    j += 1
    plt.plot(y.T[user], linewidth=2)
    plt.title('Total Sim Time per Month for {}'.format(user))
plt.show()
In [129]:
by_host = comp[comp.host != 'local service'].groupby('host')
by_host_hours = by_host.duration_m.sum() / 60.0
pos = np.arange(len(by_host_hours))+.5
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.barh(pos,by_host_hours, align='center')
plt.yticks(pos, by_host_hours.index)
plt.xlabel('Simulation Time (hours)')
plt.title('Total Simulation Time by Computer')
plt.show()
In [130]:
by_host = comp[(comp.host != 'local service') & (comp.simulator == 'Analyst')].groupby('host')
by_host_count = by_host.host.count()
pos = np.arange(len(by_host_count))+.5
fig = plt.gcf()
fig.set_size_inches(16,6)
plt.barh(pos,by_host_count, align='center')
plt.yticks(pos, by_host_count.index)
plt.xlabel('Number of Simulation Runs')
plt.title('Simulations by Host')
plt.show()
The scatter plot of memory does a good job of tracking the evolution of the large jobs over time, but it does not give much insight into the average job. For that, a box plot is more informative.
Note: since there are a large number of outliers, these graphs are zoomed in (in the Y direction) so that the boxes can be clearly seen.
In [140]:
data = sim[(sim.simulator=='AXIEM')].copy()
data.sub_month = pd.to_numeric(data.sub_month)
fig, (ax1, ax2) = plt.subplots(2,1)
fig.set_size_inches(16,12)
ax = data.boxplot(column='working_set', by='sub_month', ax=ax1, grid=False)
ax.set_xlabel('')
ax.set_ylabel('Memory (MB)')
ax.set_ylim([0,8000])
ax.set_title('Memory and Duration of Jobs over Time')
ax = data.boxplot(column='duration_m', by='sub_month', ax=ax2, grid=False)
ax.set_xlabel('Month (YYYY.MM)')
ax.set_ylabel('Job Duration (minutes)')
ax.set_ylim([0,100])
ax.set_title('')
plt.show()
After a simulation is done, the results must be copied back to the user. Over a typical network this time is small, but it is network dependent. An interesting thing to look at is the average time to copy results back per day, since network problems tend to last longer than a single job.
In this case we group all the successful jobs by day and look at the number of jobs, the average copy time in minutes, and the longest copy time in minutes. Then we keep only the days where the average copy time was over one minute.
In [144]:
piv = pd.pivot_table(comp, index=["start_date"], values=['results_copy_m'],
aggfunc=[len, np.mean, max], fill_value=0)
piv[piv['mean']['results_copy_m'] > 1]
Out[144]:
From this we see that the copy times are insignificant on most days, but there are specific days when something goes wrong. This may be correlated with networking issues. The recommendation here is to collect more data from users and see whether this data tracks with other networking issues or whether it is peculiar to the job scheduler.
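If those anomalous days need to be cross-referenced with network monitoring, they can be exported directly. Below is a minimal sketch that reuses the `piv` table from the previous cell; the output filename is only a placeholder.
In [ ]:
# Export the days whose average copy time exceeded 1 minute so they can be
# compared against network incident logs. The filename is a placeholder.
slow_days = piv[piv['mean']['results_copy_m'] > 1]
slow_days.to_csv('slow_copy_days.csv')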
The raw data is available as a CSV file to allow you to perform additional analysis.
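For example, it can be loaded straight into pandas for further analysis. This is a minimal sketch: the filename is a placeholder for the actual export, and the parse_dates columns assume the same names used in this notebook.
In [ ]:
# Load the raw CSV export; the filename and date column names are assumptions.
raw = pd.read_csv('awr_jobs_export.csv', parse_dates=['submitted_date', 'start_date'])
raw.head()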
This section provides some of the above data in tabular form for those interested.
NOTE: In the tables below, 'len' is an abbreviation for length and denotes the count. For example, in the first table 'len duration_m' denotes the number of jobs. Likewise, 'sum duration_m' denotes the sum of the durations in minutes.
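If the 'len' label is distracting, the aggregate level of the column index can be renamed after the fact. A minimal sketch (purely cosmetic; the data is unchanged):
In [ ]:
# Rename the top level of the column MultiIndex so the table reads 'count'
# instead of 'len'.
tbl = pd.pivot_table(sim, index=['simulator'], values=['duration_m'],
                     aggfunc=[len, sum, np.median], fill_value=0)
tbl.rename(columns={'len': 'count'}, level=0)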
In [ ]:
pd.pivot_table(sim, index=['user'], columns=['simulator'], values=['duration_m'], aggfunc=[len, sum, np.median], fill_value=0)
In [ ]:
pd.pivot_table(sim, index=['host'], columns=['simulator'], values=['duration_m'], aggfunc=[len, sum, np.median], fill_value=0)
In [ ]:
pd.pivot_table(sim, index=['simulator'], values=['duration_m'], aggfunc=[len, sum, np.median], fill_value=0)