In [1]:
import matplotlib.pyplot as plt
#%matplotlib inline
import numpy as np
import os.path
import pandas as pd
import seaborn as sns
#from MyML.helper.plotting import save_fig
In [2]:
# Global plot styling and default single-figure dimensions (inches).
sns.set_style("whitegrid")
fig_width = 12
fig_height = 8
In [ ]:
%matplotlib qt
In [3]:
# Raise saved-figure resolution to 600 DPI (publication quality).
import matplotlib
matplotlib.rcParams['savefig.dpi'] = 600.0
In [4]:
plt.ioff()
In [5]:
# Build the "Tableau 20" palette and install it as the seaborn default.
# Raw RGB values in the 0-255 range.
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# matplotlib expects colors in the [0, 1] range.
tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]
# Reorder: all the strong colors first, then their pale companions.
tableau = tableau20[::2] + tableau20[1::2]
tableau = sns.color_palette(tableau, 20)
sns.palplot(tableau)
sns.set_palette(tableau, n_colors=20)
In [6]:
results_path = '/home/chiroptera/QCThesis/datasets/gauss10e6_overlap/results/results_kmin.csv'
In [7]:
results_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\gauss10_overlap\\results_kmin.csv'
In [8]:
# Load the benchmark results and list the available columns.
res = pd.read_csv(results_path)
# print() call form runs on both Python 2 and 3 (the original `print col`
# statement is a SyntaxError under Python 3).
for col in res.columns:
    print(col)
In [9]:
# Derived per-run features. NOTE: mutates `res` in place, so this cell is not
# idempotent if re-run on an already-processed frame.
# Pre-allocated CSR row capacity: 3x the biggest cluster size.
res['csr_max_row'] = res.biggest_cluster * 3
# True when a CSR row hit its pre-allocated capacity (only meaningful when a
# matrix type is recorded).
res['csr_topped'] = (res['csr_max_row'] == res['max_degree']) & res.type_mat
# Drop runs with no sample count recorded.
res = res[np.logical_not(res.n_samples.isnull())]
#res.max_assoc = res.max_assoc.apply(lambda s: int(s.strip('()').split(',')[0]))
#res['final density'] = res.n_assocs * 1.0 / (res.n_samples ** 2)
res['samples_per_kmin']=res['n_samples'] / res['kmin']
res['assocs_per_samples'] = res['n_assocs'] / res['n_samples']
In [10]:
def sp_lin_area(n_s, n_e, val_s, val_e):
    """Fraction of the unit square kept after cutting a linear taper.

    The taper drops linearly from ``val_s`` at ``n_s`` to ``val_e`` at
    ``n_e``; the removed region is the triangle between ``n_s`` and ``n_e``
    plus the rectangle to the right of ``n_e``. Returns ``1 - cut_area``.
    """
    tri = (n_e - n_s) * (val_s - val_e) / 2.0
    r_rect = (1.0 - n_e) * (val_s - val_e)
    cut_area = tri + r_rect
    return 1 - cut_area


def lin_max_n_assocs(n, bgs, factor=3, n_s=0.05, n_e=1.0, val_s=1.0, val_e=0.05):
    """Pre-allocated association count for the 'sparse condensed linear' scheme.

    n    : number of samples
    bgs  : biggest cluster size
    The keyword defaults reproduce the original hard-coded profile
    (capacity factor 3, linear taper from 100% at 5% of rows down to 5%),
    but are now parameters so other profiles can be evaluated.
    """
    return n * bgs * factor * sp_lin_area(n_s, n_e, val_s, val_e)
# Boolean masks selecting each co-association matrix representation.
full_idx = res.type_mat == 'full'
full_condensed_idx = res.type_mat == 'full condensed'
sp_complete_idx = res.type_mat == 'sparse complete'
sp_condensed_const_idx = res.type_mat == 'sparse condensed const'
sp_lin_idx = res.type_mat == 'sparse condensed linear'
sp_const_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const'])
sp_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const', 'sparse condensed linear'])
# maximum number of associations allowed / pre-allocated
res['max_n_assocs'] = 0
# full: n^2 entries; full condensed: upper triangle n*(n-1)/2.
res.loc[full_idx,'max_n_assocs'] = np.int64(res.n_samples[full_idx] **2)
res.loc[full_condensed_idx,'max_n_assocs'] = np.int64(res.n_samples[full_condensed_idx] * (res.n_samples[full_condensed_idx] - 1) / 2)
# constant-capacity sparse schemes: 3 * biggest cluster per row.
res.loc[sp_const_idx,'max_n_assocs'] = np.int64(res.n_samples[sp_const_idx] * res.biggest_cluster[sp_const_idx] * 3)
# linear-taper sparse scheme: capacity shrinks linearly over the rows.
res.loc[sp_lin_idx,'max_n_assocs'] = np.int64(lin_max_n_assocs(res.n_samples[sp_lin_idx], res.biggest_cluster[sp_lin_idx]))
# actual memory used to store the associations
res['mem'] = res.max_n_assocs
# CSR stores data + indices, hence the (1+4) weighting for sparse formats.
res.loc[sp_idx,'mem'] = res.max_n_assocs[sp_idx] * (1+4) # data + indices
# Densities are all relative to a full n^2 matrix.
res['assoc density'] = 1.0 * res.n_assocs / (res.n_samples ** 2)
res['density'] = 1.0 * res.max_n_assocs / (res.n_samples ** 2)
res['mem_density'] = 1.0 * res.mem / (res.n_samples ** 2)
In [11]:
# Average repeated runs: group by (rule, n_samples, type_mat) and take the
# per-group mean of the measurement columns.
by_rule_n = res.groupby(by=["rule", "n_samples", "type_mat"])
# NOTE(review): by_rule_n.mean() would be the idiomatic (and faster)
# equivalent of apply(np.mean) here.
rule_n_mean = by_rule_n.apply(np.mean)
# Keep only the measurement columns of interest.
rule_n_mean = rule_n_mean[['kmin','kmax','t_ensemble','biggest_cluster',
'mean_degree','std_degree','min_degree',
'max_degree','n_assocs','n_max_degree', 'accuracy_CI', 'accuracy_CI_disk',
't_build', 't_sl', 't_sl_disk', 't_accuracy_CI',
't_accuracy_CI_disk', 'sl_clusts', 'sl_clusts_disk',
'csr_max_row', 'csr_topped', 'samples_per_kmin', 'max_n_assocs',
'mem', 'assoc density', 'density', 'mem_density','assocs_per_samples']]
# Flatten the group keys back into columns.
rule_n_mean = rule_n_mean.reset_index()
In [12]:
# Co-association build time: one subplot per rule, one line per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig1 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig1.suptitle('Coassoc build time: comparisson between matrix type for each rule', size=16)
fig = fig1
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [13]:
# Co-association build time: one subplot per matrix type, one line per rule.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig2 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)
fig = fig2
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
# Loop names fixed to match what they hold (outer = matrix type, inner =
# rule); the original had them swapped, which was misleading.
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [18]:
# Standalone build-time figures, one per rule (axes kept for later saving).
build_time_rule = dict()
par_list = res['rule'].unique()
fig_dict = build_time_rule
for rule, grp in rule_n_mean.groupby(by='rule'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[rule] = ax
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Combination execution time, {}={}'.format('rule', par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [19]:
# Standalone build-time figures, one per matrix type (axes kept for saving).
build_time_type_mat = dict()
par_list = res['type_mat'].unique()
fig_dict = build_time_type_mat
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[type_mat] = ax
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Combination execution time, {}={}'.format('type_mat', par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [20]:
# Save the build-time figures in three formats, then release them.
img_path = '/home/chiroptera/QCThesis/results/EAC/build_time/'
# NOTE(review): hardcoded absolute path overrides the Linux path above.
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\build_time\\'
for name, ax in build_time_type_mat.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)  # free figure memory
del build_time_type_mat
for name, ax in build_time_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del build_time_rule
In [23]:
# In-memory Single-Link time: one subplot per rule, one line per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig3 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig3
fig.suptitle("Single-Link clustering time: comparison between matrix types for each rule", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        # dashed lines for the condensed variants so overlapping curves stay visible
        trace = '--' if type_mat in ('full condensed', 'sparse condensed linear') else '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], trace, label=type_mat)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [24]:
# In-memory Single-Link time: one subplot per matrix type, one line per rule.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig4 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig4
fig.suptitle("Single-Link clustering time: comparison between rules for each matrix type", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
# Loop names fixed to match what they hold (outer = matrix type, inner = rule).
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [25]:
# Standalone in-memory SL figures, one per matrix type (axes kept for saving).
sl_mem_time_type_mat = dict()
fig_dict = sl_mem_time_type_mat
par_list = res['type_mat'].unique()
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[type_mat] = ax
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    # BUG FIX: the original formatted the title with `par1_name`, which is not
    # defined until a later cell — on a fresh run this cell raised NameError.
    ax.set_title('Single-Link (main memory) execution time, {}={}'.format('type_mat', par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [26]:
# Standalone in-memory SL figures, one per rule (axes kept for saving).
sl_mem_time_rule = dict()
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
fig_dict = sl_mem_time_rule
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (main memory) execution time, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [27]:
# Save the in-memory Single-Link figures, then release them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_mem_time/'
# NOTE(review): hardcoded absolute path overrides the Linux path above.
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_mem_time\\'
for name, ax in sl_mem_time_type_mat.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)  # free figure memory
del sl_mem_time_type_mat
for name, ax in sl_mem_time_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del sl_mem_time_rule
In [28]:
# Disk-based Single-Link time: one subplot per rule, one line per matrix type.
par_list = res.rule.unique()
rows = 2
cols = 2
fig5 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig5
fig.suptitle('Disk-based Single-Link clustering time: comparison between matrix type for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        if grp2.t_sl_disk.isnull().all():
            continue  # matrix types with no disk-based timings at all
        idx = grp2.n_samples >= 5e2
        ax = ax_par_dict[rule]
        # dashed line keeps the overlapping condensed-linear curve visible
        trace = '--' if type_mat == 'sparse condensed linear' else '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=type_mat)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [29]:
# Disk-based Single-Link time: one subplot per sparse matrix type, one line per rule.
par_list = ['sparse complete', 'sparse condensed const', 'sparse condensed linear']
rows = 2
cols = 2
fig6 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig6
fig.suptitle('Disk-based Single-Link clustering time: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        # NOTE(review): .any() here vs .all() in the previous cell — groups with
        # even one missing disk timing are skipped entirely. This skip must stay
        # before the ax lookup: the full matrix types are not in ax_par_dict.
        if grp2.t_sl_disk.isnull().any():
            continue
        idx = grp2.n_samples >= 5e2
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [30]:
# Standalone disk-based SL figures, one per matrix type (axes kept for saving).
sl_disk_time_type_mat = dict()
fig_dict = sl_disk_time_type_mat
par1_name, par2_name = 'type_mat', 'rule'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        if grp2.t_sl_disk.isnull().any():
            continue  # skip groups without complete disk timings
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], '-', label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (disk) execution time, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [31]:
# Standalone disk-based SL figures, one per rule (axes kept for saving).
sl_disk_time_rule = dict()
fig_dict = sl_disk_time_rule
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        if grp2.t_sl_disk.isnull().any():
            continue  # skip groups without complete disk timings
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        # dashed line keeps the overlapping condensed-linear curve visible
        trace = '--' if par2 == 'sparse condensed linear' else '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (disk) execution time, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [32]:
# Save the disk-based Single-Link figures, then release them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_disk_time/'
# NOTE(review): hardcoded absolute path overrides the Linux path above.
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_disk_time\\'
for name, ax in sl_disk_time_type_mat.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)  # free figure memory
del sl_disk_time_type_mat
for name, ax in sl_disk_time_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del sl_disk_time_rule
In [36]:
# Compare SLINK (full condensed) against SL-MST / SL-MST-Disk (sparse
# condensed linear) in one figure per rule.
sl_time_rule = dict()
fig_dict = sl_time_rule
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    slink = grp.query('type_mat == "full condensed"')
    idx = slink.n_samples >= 1e3
    ax.plot(slink.n_samples[idx], slink.t_sl[idx], label='SLINK')
    sl_mst = grp.query('type_mat == "sparse condensed linear"')
    idx = sl_mst.n_samples >= 1e3
    ax.plot(sl_mst.n_samples[idx], sl_mst.t_sl[idx], label='SL-MST')
    ax.plot(sl_mst.n_samples[idx], sl_mst.t_sl_disk[idx], label='SL-MST-Disk')
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link execution time, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="lower right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [37]:
# Save the combined Single-Link comparison figures, then release them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_time/'
# NOTE(review): hardcoded absolute path overrides the Linux path above.
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_time\\'
for name, ax in sl_time_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)  # free figure memory
del sl_time_rule
In [38]:
# Allocated-association density (vs full matrix): one subplot per matrix type.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig7 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig7
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix type for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.density[idx], '-', label=rule)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Relative density")
    ax.set_title(par)
    ax.legend(loc="upper right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [39]:
# Allocated-association density (vs full matrix): one subplot per rule.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig8 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig8
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix type for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        # simplified from any(map(lambda x: type_mat == x, [...])) on a 1-item list
        trace = '--' if type_mat == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=type_mat)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Relative density")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() returns (xmin, xmax, ymin, ymax); the old ax.get_axes() is deprecated.
    ymin, ymax = ax.axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
In [45]:
# NOTE(review): this cell saves figures built by cells BELOW (In[40]-In[44]);
# the notebook only works when executed out of order — run those cells first,
# or move this cell after them.
img_path = '/home/chiroptera/QCThesis/results/EAC/allocated_density/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\allocated_density\\'
for name, ax in allocated_density_type_mat.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del allocated_density_type_mat
for name, ax in allocated_density_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del allocated_density_rule
img_path = '/home/chiroptera/QCThesis/results/EAC/mem_density/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\mem_density\\'
for name, ax in mem_density_type_mat.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
# NOTE(review): mem_density_type_mat is the only dict not deleted after
# saving — likely an oversight; left unchanged to preserve behavior.
for name, ax in mem_density_rule.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del mem_density_rule
img_path = '/home/chiroptera/QCThesis/results/EAC/assoc_density/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\assoc_density\\'
for name, ax in assoc_density.items():
    f = ax.get_figure()
    for ext in ('eps', 'pdf', 'png'):
        f.savefig('{}{}.{}'.format(img_path, name, ext), bbox_inches='tight')
    plt.close(f)
del assoc_density
In [ ]:
%matplotlib qt
In [ ]:
plt.ioff()
In [40]:
# Memory density (vs full matrix) figures, one per rule.
mem_density_rule = dict()
fig_dict = mem_density_rule
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        # simplified from any(map(lambda x: par2 == x, [...])) on a 1-item list
        trace = '.-' if par2 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.loc[idx, 'mem_density'], trace, label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: the original formatted the title with `par1` (the stale last
    # key of the loop above), so every figure got the same title; `par` is
    # this figure's own key.
    ax.set_title('Memory used relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() returns (xmin, xmax, ymin, ymax); ax.get_axes() is deprecated.
    ymin, ymax = ax.axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [41]:
# Memory density (vs full matrix) figures, one per matrix type.
mem_density_type_mat = dict()
fig_dict = mem_density_type_mat
par1_name, par2_name = 'type_mat', 'rule'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    # line style depends only on the outer key, so hoist it out of the inner loop
    trace = '.-' if par1 == 'sparse condensed const' else '-'
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx, 'mem_density'], trace, label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: the original used `par1` (stale last loop key) in the title, so
    # every figure got the same title; `par` is this figure's own key.
    ax.set_title('Memory used relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() returns (xmin, xmax, ymin, ymax); ax.get_axes() is deprecated.
    ymin, ymax = ax.axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [42]:
# Actual association density (vs full matrix) figures — only for the two
# condensed sparse schemes.
assoc_density = dict()
fig_dict = assoc_density
par1_name, par2_name = 'type_mat', 'rule'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != 'sparse condensed const' and par1 != 'sparse condensed linear':
        continue
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    # line style depends only on the outer key, so hoist it out of the inner loop
    trace = '.-' if par1 == 'sparse condensed const' else '-'
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx, 'assoc density'], trace, label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: the original used `par1` (stale last loop key) in the title, so
    # every figure got the same title; `par` is this figure's own key.
    ax.set_title('Density of associations relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() returns (xmin, xmax, ymin, ymax); ax.get_axes() is deprecated.
    ymin, ymax = ax.axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [43]:
# rules in different plots
allocated_density_type_mat = dict()
fig_dict = allocated_density_type_mat
par1_name, par2_name = 'type_mat', 'rule'
#par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)
for par1, grp in rule_n_mean.groupby(by=par1_name):
fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[par1] = ax
for par2, grp2 in grp.groupby(by=par2_name):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
trace = '-'
if any(map(lambda x: par1==x,['sparse condensed const'])):
trace = '.-'
ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=par2)
for par, ax in fig_dict.iteritems():
ax.set_xlabel("# samples",fontsize=16)
ax.set_ylabel("Density", fontsize=16)
ax.set_title('Density of allocated associations relative to full matrix, {}={}'.format(par1_name,par), fontsize=20)
ax.legend(loc="upper right", prop={'size':16})#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
ymin,ymax = ax.get_axes().axis()[2:]
ax.set_ylim((ymin,ymax+0.1))
#,fontsize=16
#,prop={'size':16}
for tick in ax.xaxis.get_major_ticks():
tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
tick.label.set_fontsize(14)
In [44]:
# Allocated-association density (vs full matrix) figures, one per rule.
allocated_density_rule = dict()
fig_dict = allocated_density_rule
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        # simplified from any(map(lambda x: par2 == x, [...])) on a 1-item list
        trace = '.-' if par2 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=par2)
# dict.items() runs on both Python 2 and 3 (iteritems is Python 2 only).
for par, ax in fig_dict.items():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    ax.set_title('Density of allocated associations relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size': 16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() returns (xmin, xmax, ymin, ymax); ax.get_axes() is deprecated.
    ymin, ymax = ax.axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
    # tick_params replaces the deprecated per-tick `tick.label` loops.
    ax.tick_params(axis='both', which='major', labelsize=14)
In [ ]:
# Relative memory usage: one subplot per matrix type, one curve per rule.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig9 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig9
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix type for each rule', size=16)
ax_par_dict = {}
for i, key in enumerate(par_list):
    ax_par_dict[key] = fig.add_subplot(rows, cols, i + 1)
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    target_ax = ax_par_dict[type_mat]
    for rule, rule_grp in grp.groupby(by='rule'):
        big_enough = rule_grp.n_samples >= 1e3  # ignore datasets smaller than 1000
        target_ax.plot(rule_grp.n_samples[big_enough],
                       rule_grp.mem_density[big_enough], '-', label=rule)
# common decoration for every subplot
for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel('Memory ratio')
    ax.set_title(par)
    ax.legend(loc="upper right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [ ]:
# Relative memory usage: one subplot per rule, one curve per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig10 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig10
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix type for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i + 1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        trace = '-'
        # BUG FIX: the original `type_mat in ('sparse condensed const')` tested
        # substring membership in a plain string (parentheses alone do not make
        # a tuple), so any substring such as 'sparse condensed' also matched.
        if type_mat == 'sparse condensed const':
            trace = '--'
        ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Memory ratio")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
In [ ]:
# Figure 11: memory ratio of the sparse condensed linear matrix (left axis)
# plotted together with the K_min value chosen by each rule (right axis).
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
# darker palette for the K_min curves on the secondary axis
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()  # secondary y-axis holding K_min
for rule, grp in rule_n_mean.groupby(by='rule'):
    idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
    idx2 = grp.type_mat == 'sparse condensed linear'
    idx = np.logical_and(idx, idx2)
    trace = '-'
    ax.plot(grp.n_samples[idx], grp.mem_density[idx], trace, label=rule)
    ax_s.plot(grp.n_samples[idx], grp.kmin[idx], trace, label=rule)
# reference guides: vertical markers at 250k and 500k samples, horizontal
# markers at memory ratios 0.1 and 1 (parity with the full matrix)
ax.plot([250000,250000],[0,10],'-.k')
ax.plot([500000,500000],[0,10],'-.k')
ax.plot([0,10e6],[0.1,0.1],'-.k')
ax.plot([0,10e6],[1,1],'-.k')
ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax_s.set_ylabel('$K_{min}$')
# legends placed outside the axes so the two legend boxes do not overlap
ax.legend(loc=(1.1, 0.0), title="Memory ratio") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax_s.set_xscale("log")
ax.set_yscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title="$K_{min}$")
This plot is interesting to analyze: we can clearly see the effect of $K_{min}$ on memory usage. Three of the rules show quadratic growth, while the "sk=300" rule grows linearly. It is clear that a smaller $K_{min}$ translates into higher memory consumption. As the "sk=300" rule's curve crosses those of the other rules, its corresponding memory usage becomes lower than theirs.
In [46]:
# Number of allocated associations per sample (sparse condensed linear).
# NOTE(review): this cell reuses the name `fig11`, shadowing the memory-ratio
# figure created earlier — rename if both figures must stay reachable.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
for rule, grp in rule_n_mean.groupby(by='rule'):
    idx = grp.n_samples >= 1e3  # ignore datasets smaller than 1000
    idx2 = grp.type_mat == 'sparse condensed linear'
    idx = np.logical_and(idx, idx2)
    ax.plot(grp.n_samples[idx], grp.assocs_per_samples[idx], '-', label=rule)
ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel('No. associations per sample', fontsize=16)
ax.set_title('Number of associations per sample', size=20)
ax.legend(loc='lower right', title="Rule", prop={'size':16})
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
# FIX: the output directory is loop-invariant — assign it once instead of
# re-assigning it on every iteration of the format loop
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
for fig_format in ('eps','png','pdf'):
    fig.savefig('{}assocs_per_sample.{}'.format(img_path,fig_format), bbox_inches='tight')
#plt.close(fig)
In [ ]:
# For every cardinality, check whether all (rule, matrix-type) variants reach
# the same consensus accuracy (CI), considering both the in-memory and the
# disk-based single-link results. `same_accuracy[n]` ends up False if any
# non-null score in the group differs from the first non-null score seen;
# `n_samples_accuracy[n]` keeps that first score as the representative value.
same_accuracy = {n_samples:True for n_samples in res.n_samples.unique()}
n_samples_accuracy = {n_samples:0 for n_samples in res.n_samples.unique()}
for n_samples, grp in rule_n_mean.groupby(by='n_samples'):
    #print n_samples,grp[['accuracy_CI','accuracy_CI_disk']]
    #print grp.accuracy_CI
    #print grp.accuracy_CI == grp.accuracy_CI[0]
    first = True
    first_score = 0
    # np.nditer walks every element of the two-column accuracy array,
    # i.e. both accuracy variants for every row of the group
    for score in np.nditer(grp[['accuracy_CI','accuracy_CI_disk']].values):
        if pd.notnull(score):  # skip variants that were not run (NaN)
            if first:
                first_score = score
                n_samples_accuracy[n_samples] = score
                first = False
            if score != first_score:
                same_accuracy[n_samples] = False
                break
In [ ]:
print 'All accuracies are the same:'
print '----------------------------'
print '# samples\tSame'
for item in sorted(same_accuracy.iteritems(), key=lambda x: x[0]):
print '{}\t\t{}\t{}'.format(int(item[0]), item[1], n_samples_accuracy[item[0]])
Accuracies are the same throughout the whole spectrum of cardinality, with the exception of the first 3 sets. The reason for this is that the 'sk=300' rule yields a very low $K_{min}$ for low cardinality. In these cases the number of clusters in the ensemble is lower than the true number, which is undesirable in EAC.
In [47]:
# Accuracy (CI) of the sk=300 rule with the sparse condensed linear matrix
# as cardinality grows, using the disk-based single-link results.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig12 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig12
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
idx = (res.n_samples >= 1e3) & (res.type_mat == 'sparse condensed linear') & (res.rule == 'sk=300,th=30%')
ax.plot(res.n_samples[idx], res.accuracy_CI_disk[idx], '-', label='sk=300,th=30%')
ax.set_xlabel("# samples", fontsize=16)
# BUG FIX: the y-axis shows accuracy (CI), not memory; the old label
# 'Memory ratio' was copy-pasted from a memory plot.
ax.set_ylabel('Accuracy (CI)', fontsize=16)
ax.set_title('Accuracy (CI) evolution with number of samples', size=20)
ax.legend(prop={'size':16}, loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")
# widen the upper y-limit slightly so the curve is not clipped at the top
t_axis = plt.axis()
t_axis = (t_axis[0], t_axis[1], t_axis[2], t_axis[3] + 0.01)
plt.axis(t_axis)
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
# output directory is loop-invariant; assign once
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
for fig_format in ('eps','png','pdf'):
    fig.savefig('{}accuracy.{}'.format(img_path,fig_format), bbox_inches='tight')
#plt.close(fig)
In [48]:
# Figure 13: number of allocated associations (solid, left axis) and the
# pre-allocated maximum number of associations (dash-dot, right axis) per
# rule, for the sparse condensed linear matrix.
# NOTE(review): the suptitle below speaks of memory usage and K_min and looks
# copy-pasted from fig11 — confirm whether it should describe associations.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig13 = plt.figure(figsize=(fig_width*2 * cols, fig_height*2 * rows))
fig = fig13
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=20)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()  # secondary y-axis for the maximum number of associations
for rule, grp in rule_n_mean.groupby(by='rule'):
    idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
    idx2 = grp.type_mat == 'sparse condensed linear'
    idx = np.logical_and(idx, idx2)
    trace = '-'
    ax.plot(grp.n_samples[idx], grp.n_assocs[idx], trace, label=rule)
    ax_s.plot(grp.n_samples[idx], grp.max_n_assocs[idx], '-.', label=rule)
ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel('# assocs.', fontsize=16)
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True, which="both")
# legends are placed outside the axes so the two boxes do not overlap
ax.legend(loc=(1.1, 0.0), title='Number of associations',fontsize=16,prop={'size':16}) #loc=(1.1, 0.0))
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
#ax_s.set_ylabel('$K_{min}$')
ax_s.set_xscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title='Maximum number of assocs.',fontsize=16)
Out[48]:
In [49]:
# Ratio between the maximum number of associations of any sample and the
# size of the biggest cluster, per rule (sparse condensed linear matrix).
rows = 1
cols = 1
fig14 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig14
ax = fig.add_subplot(rows, cols, 1)
for rule_name, rule_grp in rule_n_mean.groupby(by="rule"):
    mask = (rule_grp.n_samples >= 1e3) & (rule_grp.type_mat == 'sparse condensed linear')
    ratio = rule_grp.max_degree[mask] / rule_grp.biggest_cluster[mask]
    ax.plot(rule_grp.n_samples[mask], ratio, label=rule_name)
ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel("max # assocs / biggest cluster size", fontsize=16)
ax.set_title("Relationship between max. num. assocs and biggest cluster size per rule", fontsize=20)
ax.legend(loc="lower right", prop={'size':16})
ax.grid(True, which="both")
ax.set_xscale("log")
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
for fig_format in ('eps', 'png', 'pdf'):
    fig.savefig('{}max_assocs_bgs.{}'.format(img_path, fig_format), bbox_inches='tight')
plt.close(fig)
In [64]:
# Rename result files whose names contain characters LaTeX cannot handle
# (spaces, '%', ',') to safe underscore equivalents.
import glob
import os

# Ordered (old, new) substitution pairs. Only the FIRST matching pair is
# applied to each file, mirroring the original if/elif chain; order matters
# because 'sparse condensed const'/'linear' share a prefix with other names.
RENAME_RULES = [
    ('sk=300,th=30%', 'sk=300'),
    ('sk=sqrt_2,th=30%', 'sk=sqrt_2'),
    ('full condensed', 'full_condensed'),
    ('sparse complete', 'sparse_complete'),
    ('sparse condensed const', 'sparse_condensed_const'),
    ('sparse condensed linear', 'sparse_condensed_linear'),
]
for d in glob.glob('C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\*'):
    for f in glob.glob(d + '\\*'):
        for old, new in RENAME_RULES:
            if old in f:
                os.rename(f, f.replace(old, new))
                break  # first match only, like the original elif chain
In [50]:
# K_min evolution with cardinality, one curve per rule. Only the sparse
# condensed linear group is plotted: kmin does not depend on the matrix type.
kmin_per_rule = dict()
fig_dict = kmin_per_rule
title = '$K_{min}$ evolution per rule'
#par1_name : diff plots
#par2_name : different curves in plot
par1_name, par2_name = 'type_mat', 'rule'
#par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != 'sparse condensed linear':
        continue  # one matrix type is enough; kmin is the same for all
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx, 'kmin'], label=par2)
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples", fontsize=16)
    # BUG FIX: the y-axis shows K_min, not time; the old label "Time [s]"
    # was copy-pasted from a timing plot.
    ax.set_ylabel("$K_{min}$", fontsize=16)
    ax.set_title('{}, {}={}'.format(title, par1_name, par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    # output directory is loop-invariant within the format loop
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    for fig_format in ('eps','png','pdf'):
        fig.savefig('{}kmin_evolution.{}'.format(img_path,fig_format), bbox_inches='tight')
    #plt.close(fig)
In [51]:
# Compare single-link implementations (SLINK vs. SL-MST vs. SL-MST-Disk) for
# the '2sqrt' rule only, across the matrix types that support each method.
sl_different_methods = dict()
fig_dict = sl_different_methods
title = 'SLINK vs SL-MST vs SL-MST-Disk'
#par1_name : diff plots
#par2_name : different curves in plot
#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != '2sqrt':
        continue  # only the 2sqrt rule is analyzed here
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        # 'full' and 'sparse condensed const' have no comparable SL timing
        if par2 == 'full' or par2 == 'sparse condensed const':
            continue
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'t_sl'], label=par2)
        # 'full condensed' has no disk-based SL variant
        if par2 == 'full condensed':
            continue
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'t_sl_disk'], label=par2)
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}={}'.format(title,par1_name,par), fontsize=20)
    # NOTE(review): these hard-coded labels rely on the curves being added in
    # groupby (alphabetical) order of type_mat — full condensed (SLINK), then
    # sparse complete (mem + disk), then sparse condensed linear (mem + disk).
    # Verify the labels still match if matrix types are added or filtered.
    ax.legend(['SLINK','SL-MST complete','SL-MST-Disk complete','SL-MST condensed','SL-MST-Disk condensed'],loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for fig_format in ('eps','png','pdf'):
        #fig.savefig('/home/chiroptera/QCThesis/results/EAC/sl_time/slink_vs_sl-mst.{}'.format(fig_format), bbox_inches='tight')
        img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_time\\'
        fig.savefig('{}slink_vs_sl-mst.{}'.format(img_path,fig_format), bbox_inches='tight')
    #plt.close(fig)
In [ ]:
sns.palplot(tableau)
In [58]:
# Load ensemble-generation times and merge them into rule_n_mean as a new
# 't_ensemble' column, matched on (n_samples, rule).
ensemble_time_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\gauss10_overlap\\ensemble_results_kmin.csv'
res_t_ensemble = pd.read_csv(ensemble_time_path)
res_t_ensemble = res_t_ensemble[['n_samples','rule','kmin','kmax','t_ensemble']].dropna(axis=0)
res_t_ensemble = res_t_ensemble.query('n_samples >= 5e2')
for key, grp1 in rule_n_mean.groupby(by=['n_samples','rule']):
    n, rule = key
    # FIX: the group already carries the target row labels; re-querying
    # rule_n_mean with a formatted string was redundant and fragile (float
    # formatting of n could fail to match).
    t = res_t_ensemble.query('n_samples=={} & rule=="{}"'.format(n,rule))['t_ensemble'].values[0]
    rule_n_mean.loc[grp1.index, 't_ensemble'] = t
In [59]:
# Ensemble production time per rule (a single figure; one curve per rule).
ensemble_time = dict()
fig_dict = ensemble_time
title = 'Production execution time'
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[0] = ax
for par1, grp in rule_n_mean.groupby(by=par1_name):
    ax.plot(grp.n_samples, grp.loc[:,'t_ensemble'], label='{}'.format(par1))
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    # BUG FIX: the old '{}, {}{}'.format(title,'','') rendered the title with
    # a dangling ", " suffix
    ax.set_title(title, fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    # output directory is loop-invariant; assign once
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    for fig_format in ('eps','png','pdf'):
        fig.savefig('{}ensemble_time.{}'.format(img_path,fig_format), bbox_inches='tight')
In [60]:
# Total EAC runtime for the sparse condensed linear matrix: ensemble
# production + coassoc build + single-link, with in-memory (total_mem) and
# disk-based (total_disk) SL variants. Row-wise .sum skips NaN entries.
timing_cols = ['rule','n_samples','t_build','t_ensemble','t_sl','t_sl_disk']
total = rule_n_mean.query('type_mat == "sparse condensed linear"').loc[:, timing_cols]
total['total_mem'] = total[['t_build','t_ensemble','t_sl']].sum(axis=1)
total['total_disk'] = total[['t_build','t_ensemble','t_sl_disk']].sum(axis=1)
In [61]:
# Total execution time (ensemble + build + in-memory SL-MST) per rule.
ensemble_time = dict()
fig_dict = ensemble_time
title = 'Total execution with SL-MST'
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[0] = ax
for par1, grp in total.groupby(by=par1_name):
    grp = grp.dropna()  # drop cardinalities missing any timing component
    ax.plot(grp.n_samples, grp.loc[:,'total_mem'], label='{}'.format(par1))
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    # BUG FIX: the old '{}, {}{}'.format(title,'','') rendered the title with
    # a dangling ", " suffix
    ax.set_title(title, fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    # output directory is loop-invariant; assign once
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    for fig_format in ('eps','png','pdf'):
        fig.savefig('{}total_time_sl-mst.{}'.format(img_path,fig_format), bbox_inches='tight')
In [62]:
# Total execution time (ensemble + build + disk-based SL-MST) per rule.
ensemble_time = dict()
fig_dict = ensemble_time
title = 'Total execution with SL-MST-Disk'
par1_name, par2_name = 'rule', 'type_mat'
par_list = res[par1_name].unique()
fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
# BUG FIX: this line used `fig_dict[par1] = ax`, but `par1` is only defined
# inside the loop below — on a fresh kernel it raised NameError and
# otherwise silently reused a leftover value from a previous cell. Use the
# constant key 0, as the sibling cells do.
fig_dict[0] = ax
for par1, grp in total.groupby(by=par1_name):
    ax.plot(grp.n_samples, grp.loc[:,'total_disk'], label='{}'.format(par1))
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples", fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    # BUG FIX: the old '{}, {}{}'.format(title,'','') rendered the title with
    # a dangling ", " suffix
    ax.set_title(title, fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    # output directory is loop-invariant; assign once
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    for fig_format in ('eps','png','pdf'):
        fig.savefig('{}total_time_sl-mst-disk.{}'.format(img_path,fig_format), bbox_inches='tight')
In [ ]:
# NOTE(review): `rule_x` is not defined in any visible cell of this notebook —
# presumably one of the kmin rule functions; confirm it exists before re-running.
n_array = res.n_samples.unique()
x_kmin = map(rule_x, n_array)
In [ ]:
500 **2
In [ ]:
500 * 499 / 2
In [ ]:
res[res.n_samples == 500][['n_samples', 'type_mat', 'n_assocs', 'round']]