In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [7]:
# Space-delimited timing table for the core-count scan; later cells group it by
# its `cores` column.  The path is built from $HOME, like the other loaders in
# this notebook, instead of hard-coding one user's home directory.
df = pd.read_csv(os.path.join(os.environ['HOME'], 'Downloads', 'cores_edison_14.txt'),
                 sep=' ')
df.head()
Out[7]:
In [14]:
# Mean and sample standard deviation of every column, per core count.
# The string aliases are used instead of np.mean / np.std: passing NumPy
# callables to groupby.agg is deprecated, and a literal np.std call would use
# ddof=0 rather than the ddof=1 that groupby applies.
by_cores = df.groupby(['cores'])
avg = by_cores.agg('mean').reset_index()
std = by_cores.agg('std').reset_index()

# Ratio of mean `total_sec`: 3 cores vs 4 cores, then 3 cores vs 6 cores.
total_3 = avg.loc[avg['cores'] == 3, 'total_sec'].values[0]
for other in (4, 6):
    print(total_3 / avg.loc[avg['cores'] == other, 'total_sec'].values[0])
avg.head()
Out[14]:
In [9]:
# Mean time per stage, with the standard deviation as error bar, for each core
# count.  Values are divided by 60 to match the minutes label.
STAGES = ['tims', 'mask_junk', 'srcs',
          'fitblobs', 'coadds', 'writecat']
STAGES += ['total_sec']

import matplotlib as mpl
fontsize = 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize'] = fontsize
mpl.rcParams['font.size'] = fontsize

fig, ax = plt.subplots()
xvals = np.arange(len(STAGES)) + 1
# A small horizontal offset per series keeps the error bars from overlapping.
for core, jitter in zip([3, 4, 6], [-0.1, 0, 0.1]):
    # Each frame is masked with its own `cores` column, so the selection does
    # not rely on avg and std sharing the same row order.
    mean_sec = avg.loc[avg['cores'] == core, STAGES].values[0]
    std_sec = std.loc[std['cores'] == core, STAGES].values[0]
    ax.errorbar(xvals + jitter, mean_sec / 60.,
                yerr=std_sec / 60.,
                fmt='o', label='cores=%d' % core,
                lw=2, capsize=4, ms=5, ls='--')
ax.legend()
ax.set_xticks(xvals)
ax.set_xticklabels(STAGES, rotation=45, ha='right')
ax.set_yscale('log')
ylab = ax.set_ylabel('Wall Time (min)')
xlab = ax.set_xlabel('Tractor Stage')
# The label artists are passed explicitly so the tight bounding box includes them.
plt.savefig('time_per_stage_cores.png', bbox_extra_artists=[xlab, ylab],
            bbox_inches='tight', dpi=100)
In [10]:
# Same per-stage plot as the plain wall-time figure, but every value is
# multiplied by the core count (wall time * cores).
STAGES = ['tims', 'mask_junk', 'srcs',
          'fitblobs', 'coadds', 'writecat']
STAGES += ['total_sec']

import matplotlib as mpl
fontsize = 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize'] = fontsize
mpl.rcParams['font.size'] = fontsize

fig, ax = plt.subplots()
xvals = np.arange(len(STAGES)) + 1
for core, jitter in zip([3, 4, 6], [-0.1, 0, 0.1]):
    factor = core
    # Each frame is masked with its own `cores` column, so the selection does
    # not rely on avg and std sharing the same row order.
    mean_sec = avg.loc[avg['cores'] == core, STAGES].values[0]
    std_sec = std.loc[std['cores'] == core, STAGES].values[0]
    ax.errorbar(xvals + jitter, mean_sec / 60. * factor,
                yerr=std_sec / 60. * factor,
                fmt='o', label='cores=%d' % core,
                lw=2, capsize=4, ms=5, ls='--')
ax.legend()
ax.set_xticks(xvals)
ax.set_xticklabels(STAGES, rotation=45, ha='right')
ax.set_yscale('log')
ylab = ax.set_ylabel('Wall Time (min) * Cores')
xlab = ax.set_xlabel('Tractor Stage')
# The label artists are passed explicitly so the tight bounding box includes them.
plt.savefig('time_per_stage_factor_cores.png', bbox_extra_artists=[xlab, ylab],
            bbox_inches='tight', dpi=100)
In [24]:
# Standard error of the per-stage ratio mean(3 cores) / mean(6 cores), by
# first-order error propagation: the squared relative standard errors of the
# two means add.
avg_3 = avg.loc[avg['cores'] == 3, STAGES].values[0]
avg_6 = avg.loc[avg['cores'] == 6, STAGES].values[0]
std_3 = std.loc[std['cores'] == 3, STAGES].values[0]
std_6 = std.loc[std['cores'] == 6, STAGES].values[0]

# Each mean uses its own sample count, so the result is also right when the two
# groups have different numbers of runs (identical to a single shared N when
# the counts are equal).
n_3 = len(df[df['cores'] == 3])
n_6 = len(df[df['cores'] == 6])
N = n_3  # kept under its old name for any cell that still reads N

stderr_mean_3div6 = avg_3 / avg_6 * np.sqrt((std_3 / avg_3)**2 / n_3
                                            + (std_6 / avg_6)**2 / n_6)
print(stderr_mean_3div6)
In [26]:
# Per-stage ratio of mean time at 3 cores to mean time at 6 cores, with the
# propagated standard error (stderr_mean_3div6) as error bar.
STAGES = ['tims', 'mask_junk', 'srcs',
          'fitblobs', 'coadds', 'writecat']
STAGES += ['total_sec']

import matplotlib as mpl
fontsize = 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize'] = fontsize
mpl.rcParams['font.size'] = fontsize

fig, ax = plt.subplots()
xvals = np.arange(len(STAGES)) + 1
# No `factor = core` here: it was never used by this plot and only worked when
# a loop variable from another cell was still alive in the kernel.
ratio_3div6 = (avg.loc[avg['cores'] == 3, STAGES].values[0]
               / avg.loc[avg['cores'] == 6, STAGES].values[0])
ax.errorbar(xvals,
            ratio_3div6,
            yerr=stderr_mean_3div6,
            fmt='o', label='cores 3/6', lw=2, ms=7, ls='--')
ax.legend(loc='lower right', scatterpoints=1)
ax.set_xticks(xvals)
ax.set_xticklabels(STAGES, rotation=45, ha='right')
ylab = ax.set_ylabel('Wall Time Ratio')
xlab = ax.set_xlabel('Tractor Stage')
# The label artists are passed explicitly so the tight bounding box includes them.
plt.savefig('time_per_stage_ratio_cores.png', bbox_extra_artists=[xlab, ylab],
            bbox_inches='tight', dpi=100)
In [54]:
# Space-delimited timing table for the nobj scan; later cells group it by its
# `nobj` column.  The path is built from $HOME, like the other loaders in this
# notebook, instead of hard-coding one user's home directory.
# NOTE: this rebinds `df`, replacing the core-count table loaded earlier.
df = pd.read_csv(os.path.join(os.environ['HOME'], 'Downloads', 'nobj_runs.txt'),
                 sep=' ')
df.head()
Out[54]:
In [55]:
# Mean and sample standard deviation of every column, per nobj value.
# The string aliases are used instead of np.mean / np.std: passing NumPy
# callables to groupby.agg is deprecated, and a literal np.std call would use
# ddof=0 rather than the ddof=1 that groupby applies.
by_nobj = df.groupby(['nobj'])
avg = by_nobj.agg('mean').reset_index()
std = by_nobj.agg('std').reset_index()
avg.head()
Out[55]:
In [17]:
# Scratch check: the STAGES columns of `std` for one nobj value (row mask is
# built from `avg`).
# NOTE(review): `nobj` is not assigned in this cell; in this notebook it is only
# bound as a loop variable in the nobj plotting cells, so this line depends on
# leftover kernel state — confirm the intended value.
std.loc[avg['nobj'] == nobj,STAGES].values[0]
Out[17]:
In [24]:
# Seed NumPy's global random number generator.
# NOTE(review): no cell visible here draws random numbers — confirm this is
# still needed.
np.random.seed(7)
Out[24]:
In [56]:
# Mean time per stage, with the standard deviation as error bar, for each nobj
# value.  Values are divided by 60 to match the minutes label.
STAGES = ['tims', 'mask_junk', 'srcs',
          'fitblobs', 'coadds', 'writecat']

import matplotlib as mpl
fontsize = 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize'] = fontsize
mpl.rcParams['font.size'] = fontsize

fig, ax = plt.subplots()
xvals = np.arange(len(STAGES)) + 1
# A small horizontal offset per series keeps the error bars from overlapping.
for nobj, jitter in zip([500, 1000, 1500], [-0.1, 0, 0.1]):
    # Each frame is masked with its own `nobj` column, so the selection does
    # not rely on avg and std sharing the same row order.
    mean_sec = avg.loc[avg['nobj'] == nobj, STAGES].values[0]
    std_sec = std.loc[std['nobj'] == nobj, STAGES].values[0]
    ax.errorbar(xvals + jitter, mean_sec / 60.,
                yerr=std_sec / 60.,
                fmt='o', label='nobj=%d' % nobj,
                lw=2, capsize=4, ms=5, ls='--')
ax.legend()
ax.set_xticks(xvals)
ax.set_xticklabels(STAGES, rotation=45, ha='right')
ax.set_yscale('log')
ylab = ax.set_ylabel('Wall Time (min)')
xlab = ax.set_xlabel('Tractor Stage')
# The label artists are passed explicitly so the tight bounding box includes them.
plt.savefig('time_per_stage_nobj.png', bbox_extra_artists=[xlab, ylab],
            bbox_inches='tight', dpi=100)
In [57]:
# nobj=1500 gets 3x and 1.5x more work done than nobj=500 and 1000, so each
# series is scaled by 1500/nobj and further divided by its mean `frac_injected`.
# NOTE(review): the y label below still reads plain 'Wall Time (min)' although
# the plotted values are scaled — confirm the wording.
fig, ax = plt.subplots()
xvals = np.arange(len(STAGES)) + 1
for nobj, jitter in zip([500, 1000, 1500], [-0.1, 0, 0.1]):
    factor = 1500. / nobj / avg.loc[avg['nobj'] == nobj, 'frac_injected'].values[0]
    # Each frame is masked with its own `nobj` column, so the selection does
    # not rely on avg and std sharing the same row order.
    mean_sec = avg.loc[avg['nobj'] == nobj, STAGES].values[0]
    std_sec = std.loc[std['nobj'] == nobj, STAGES].values[0]
    ax.errorbar(xvals + jitter,
                mean_sec / 60. * factor,
                yerr=std_sec / 60. * factor,
                fmt='o', label='nobj=%d' % nobj,
                lw=2, capsize=4, ms=5, ls='--')
ax.legend()
ax.set_xticks(xvals)
ax.set_xticklabels(STAGES, rotation=45, ha='right')
ax.set_yscale('log')
ylab = ax.set_ylabel('Wall Time (min)')
xlab = ax.set_xlabel('Tractor Stage')
# The label artists are passed explicitly so the tight bounding box includes them.
plt.savefig('time_per_stage_factor_nobj.png', bbox_extra_artists=[xlab, ylab],
            bbox_inches='tight', dpi=100)
In [4]:
# Load my_sacct.txt: whitespace-separated, no header row, four named columns.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
a = pd.read_csv(os.path.join(os.environ["HOME"], 'Downloads', 'my_sacct.txt'),
                sep=r'\s+', header=None,
                names=['slurm_id', 'status', 'num_nodes', 'time'])
In [5]:
# Break the colon-separated `time` string apart once and keep its first three
# pieces as (still string-valued) columns.
time_parts = a['time'].str.split(':')
for position, column in enumerate(['multi_hr', 'min', 'sec']):
    a[column] = time_parts.str[position]
In [ ]:
In [6]:
# Tally how many '-'-separated pieces each `multi_hr` string has; more than one
# piece means the field carries a "<prefix>-" in front of the hours.
a['multi_hr'].str.split('-').str.len().value_counts()
Out[6]:
In [8]:
# Rows whose hour field splits into more than one piece on '-'.
piece_count = a['multi_hr'].str.split('-').str.len()
hasExtra24 = piece_count > 1
# Show the prefixed hour fields next to one example of a plain hour field.
(a.loc[hasExtra24, 'multi_hr'], a.loc[~hasExtra24, 'multi_hr'].iloc[0])
Out[8]:
In [15]:
# Start every row at zero extra hours, then add 24 hours per unit of the
# number in front of the '-' for the rows that have one.
a['extra_hrs'] = np.zeros(len(a))
prefix = a.loc[hasExtra24, 'multi_hr'].str.split('-').str[0].astype(float)
a.loc[hasExtra24, 'extra_hrs'] = 24 * prefix
a['extra_hrs'].value_counts()
Out[15]:
In [16]:
# Build the numeric hour column from `multi_hr`, then fold in `extra_hrs`.
plain_rows = ~hasExtra24
a['hrs'] = np.zeros(len(a))
# Plain field such as "05": the whole string is the hour count.
a.loc[plain_rows, 'hrs'] = a.loc[plain_rows, 'multi_hr'].astype(float)
# Prefixed field such as "01-05": only the piece after the '-' is the hour count.
prefixed = a.loc[hasExtra24, 'multi_hr']
a.loc[hasExtra24, 'hrs'] = prefixed.str.split('-').str[1].astype(float)
a.loc[:, 'hrs'] = a['hrs'] + a['extra_hrs']
In [18]:
# Sanity check: the hour totals of the prefixed rows, and the largest hour value
# among the rows without a prefix.
a.loc[hasExtra24,'hrs'], a.loc[~hasExtra24,'hrs'].max()
Out[18]:
In [21]:
# Convert the minute and second strings to floats.
# Whole-column assignment (a[col] = ...) replaces the column and therefore its
# dtype; a.loc[:, col] = ... writes into the existing object column on recent
# pandas and would leave the dtype as object.
for col in ['min', 'sec']:
    a[col] = a[col].astype(float)
In [37]:
def cpu_hrs(nodes, h, m, s, mpp=False, cori=True):
    """Core-hours used by a job, optionally weighted by the MPP factor.

    Args:
        nodes: number of nodes
        h,m,s: hours,min,sec of the job's run time
        mpp: if True, multiply the result by 2.5
        cori: if True use 32 cores per node, otherwise 24

    Returns:
        mpp_factor * cores_per_node * nodes * (h + m/60 + s/3600).  Only
        arithmetic operators are used, so scalars and array-likes such as
        pandas Series both work (elementwise).
    """
    cores_per_node = 32 if cori else 24
    mpp_factor = 2.5 if mpp else 1.
    # The elapsed time is summed first so that the minutes and seconds are
    # scaled by nodes, cores and the MPP factor as well, not only the hours.
    elapsed_hrs = h + m / 60. + s / 3600.
    return mpp_factor * cores_per_node * nodes * elapsed_hrs
# Per-job core-hours with the default cpu_hrs settings, then the grand total in
# millions, plain and multiplied by 2.5.
a['cpu_hrs'] = cpu_hrs(a['num_nodes'], a['hrs'], a['min'], a['sec'])
total_millions = a['cpu_hrs'].sum() / 1e6
print('total cpu hours (M):', total_millions)
print('total MPP hours (M):', total_millions * 2.5)
In [38]:
# 2x2 grid of distributions, one panel per column, filled row by row.
fig, ax = plt.subplots(2, 2, figsize=(6, 6))
names = ['num_nodes', 'hrs', 'min', 'sec']
for name, panel in zip(names, ax.ravel()):
    sns.distplot(a[name], ax=panel)
In [95]:
from glob import glob

# Every path five levels below ~/Downloads/logs, each extended with the file
# name 'log.1158p195'.
# NOTE(review): glob does not promise any particular order, and later cells pair
# these entries by position — confirm the order is what is expected.
run_dirs = glob(os.path.join(os.environ['HOME'], 'Downloads',
                             'logs/*/*/*/*/*'))
logfns = []
for run_dir in run_dirs:
    logfns.append(os.path.join(run_dir, 'log.1158p195'))
logfns
Out[95]:
In [79]:
import re
def _stage_timestamp(text, marker, logfn):
    """Return the timestamp that ends the first line of `text` containing `marker`."""
    hits = re.findall(marker + r'.*?\n', text)
    if not hits:
        # Still an IndexError, as a bare [0] would raise, but it now says which
        # marker is missing from which file.
        raise IndexError('no line matching %r in %s' % (marker, logfn))
    # The last whitespace-separated token is the stamp; fractional seconds are
    # dropped and the ISO 'T' separator becomes a space to fit the format below.
    stamp = hits[0].split()[-1].split('.')[0].replace('T', ' ')
    return pd.to_datetime(stamp, format='%Y-%m-%d %H:%M:%S')


def hours_elapsed_logfn(logfn):
    """returns hours between starting tims and finishing fitblobs

    Full date-times are subtracted, so a run that crosses midnight (or several
    days) is handled.

    Args:
        logfn: path of the log file to read

    Returns:
        float hours between the first 'Running stage tims at' line and the
        first 'Stage fitblobs finished' line.

    Raises:
        IndexError: if either line is missing from the file.
    """
    with open(logfn, 'r') as foo:
        text = foo.read()
    start = _stage_timestamp(text, r'Running stage tims at', logfn)
    end = _stage_timestamp(text, r'Stage fitblobs finished', logfn)
    return (end - start).total_seconds() / 3600.
In [80]:
# Elapsed hours for every log file, in the same order as logfns.
dtime = list(map(hours_elapsed_logfn, logfns))
dtime
Out[80]:
In [89]:
# One row per log file: elapsed hours and the log path.
df = pd.DataFrame(dict(time=dtime,
                       logfns=logfns))
# Run name = first directory below ~/Downloads/logs.  It is taken relative to
# that root so it does not depend on how many components $HOME has (a fixed
# split('/') index of 5 is only right for a two-level home such as /Users/<name>).
logs_root = os.path.join(os.environ['HOME'], 'Downloads', 'logs')
df['name'] = [os.path.relpath(path, logs_root).split(os.sep)[0]
              for path in df['logfns']]
notSorted = df['name'].str.contains('notsorted')
# Runs whose name contains 'notsorted' first, then all the others.
print(df.loc[notSorted, ['name', 'time']])
print(df.loc[~notSorted, ['name', 'time']])
In [96]:
# Derive each simcat FITS path from its log path with three literal
# substitutions, applied in this order.
path_swaps = [('/Downloads/logs', '/Downloads/simcats'),
              ('/logs/', '/obiwan/'),
              ('log.1158p195', 'simcat-elg-1158p195.fits')]
simcatfns = []
for log_path in logfns:
    simcat_path = log_path
    for old, new in path_swaps:
        simcat_path = simcat_path.replace(old, new)
    simcatfns.append(simcat_path)
simcatfns
Out[96]:
In [99]:
from astrometry.util.fits import fits_table

# Length of each simcat table, in the same order as simcatfns.
num_srcs = [len(table) for table in map(fits_table, simcatfns)]
num_srcs
Out[99]:
In [103]:
# One row per log file: elapsed hours, the log path and the simcat table length.
df = pd.DataFrame(dict(time=dtime,
                       logfns=logfns,
                       num_srcs=num_srcs))
# Run name = first directory below ~/Downloads/logs.  It is taken relative to
# that root so it does not depend on how many components $HOME has (a fixed
# split('/') index of 5 is only right for a two-level home such as /Users/<name>).
logs_root = os.path.join(os.environ['HOME'], 'Downloads', 'logs')
df['name'] = [os.path.relpath(path, logs_root).split(os.sep)[0]
              for path in df['logfns']]
notSorted = df['name'].str.contains('notsorted')
# Runs whose name contains 'notsorted' first, then all the others.
print(df.loc[notSorted, ['name', 'time', 'num_srcs']])
print(df.loc[~notSorted, ['name', 'time', 'num_srcs']])
In [114]:
# Seconds per source: hours divided by the source count, times 3600.
df['time_per_src'] = df['time'].div(df['num_srcs']).mul(3600)
# 'notsorted' runs first, then the rest.
for rows in (notSorted, ~notSorted):
    print(df.loc[rows, ['name', 'time_per_src']])
In [121]:
# Elementwise ratio of seconds-per-source: runs without 'notsorted' in the name
# over runs with it.  The two groups are paired purely by position.
# NOTE(review): assumes both groups have the same length and matching order —
# confirm against how logfns was built.
df.loc[~notSorted,'time_per_src'].values/df.loc[notSorted,'time_per_src'].values
Out[121]:
In [122]:
# Plot the positional ratio of seconds-per-source (runs without 'notsorted' in
# the name over runs with it) against the x values 300, 600 and 1200.
rate_other = df.loc[~notSorted, 'time_per_src'].values
rate_notsorted = df.loc[notSorted, 'time_per_src'].values
plt.plot([300, 600, 1200],
         rate_other / rate_notsorted)
Out[122]:
In [40]:
# Scratch: parse `start` and `end` as 'YYYY-MM-DD HH:MM:SS' date-times and show both.
# NOTE(review): both names must already exist in the kernel; this cell does not
# define them — confirm where they are expected to come from.
start= pd.to_datetime(start, format='%Y-%m-%d %H:%M:%S')
end= pd.to_datetime(end, format='%Y-%m-%d %H:%M:%S')
start,end
Out[40]:
In [43]:
# Scratch: difference between the current `end` and `start` values.
dt= (end - start)
In [70]:
# Scratch: the difference `dt` expressed in seconds.
dt.total_seconds()
Out[70]:
In [51]:
# Worked example: two stamps one day and one minute apart, difference in hours.
stamp_format = '%Y-%m-%d %H:%M:%S'
start = pd.to_datetime('2017-12-11 00:10:10', format=stamp_format)
end = pd.to_datetime('2017-12-12 00:11:10', format=stamp_format)
elapsed = end - start
elapsed.total_seconds() / 3600.
Out[51]:
In [53]:
# Four identical rows holding the current `start` and `end` values.
n_rows = 4
df = pd.DataFrame({'start': [start] * n_rows,
                   'end': [end] * n_rows})
In [54]:
# Elementwise difference in seconds.  A Series of timedeltas exposes
# total_seconds() through the .dt accessor; calling it on the Series itself
# raises AttributeError.
# NOTE(review): start - end is negative for these rows, while the scalar cells
# compute end - start — confirm which sign is wanted.
(df['start'] - df['end']).dt.total_seconds()
In [125]:
from glob import glob

# Read one Slurm output file in full.
fn = os.path.join(os.environ['HOME'], 'Downloads',
                  'slurm-8934400.out')
with open(fn, 'r') as slurm_file:
    text = slurm_file.read()

# First "Logging to:" line -> its last space-separated token with the newline
# removed, cut at the first '.', and with every 'T' turned into a space.
first_hit = re.findall(r'Logging to:.*?\n', text)[0]
last_token = first_hit.split(' ')[-1].replace('\n', '')
start = last_token.split('.')[0].replace('T', ' ')
In [142]:
def add_fits(text):
    """Return `text` with the '.fits' extension appended."""
    return ''.join([text, '.fits'])
def trac_fns(slurm_fn):
    """Build tractor file names from the "Logging to:" lines of a text file.

    Args:
        slurm_fn: path of the file to scan

    Returns:
        numpy object array with one entry per "Logging to:" line: the text after
        the prefix, stripped, with every 'logs' replaced by 'tractor', every
        'log.' replaced by 'tractor-', and '.fits' appended.  Empty when the
        file has no such line.
    """
    with open(slurm_fn, 'r') as foo:
        text = foo.read()
    # dtype=object keeps the .str accessor usable when nothing matched.
    log_lines = pd.Series(re.findall(r'Logging to:.*?\n', text), dtype=object)
    # regex= is spelled out on every replace: recent pandas treats the pattern
    # as a literal string by default, which would leave the two regex patterns
    # (\s and \.) unmatched.
    cleaned = (log_lines
               .str.replace(r'Logging to:\s', '', regex=True)
               .str.strip()
               .str.replace('logs', 'tractor', regex=False)
               .str.replace(r'log\.', 'tractor-', regex=True)
               )
    return (cleaned + '.fits').values
# Tractor file names for the Slurm output file `fn`.
# NOTE(review): this rebinds `a`, which earlier cells use for the my_sacct.txt
# DataFrame — re-running those cells afterwards needs the table reloaded.
a=trac_fns(fn)
In [144]:
# (The only content left in this cell was a lone "]", which is a SyntaxError on
# its own; the rest of the expression is not recoverable from this export.)
Out[144]:
In [ ]: