In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os.path
import pandas as pd
import seaborn as sns
from MyML.helper.plotting import save_fig
In [2]:
sns.set_style("whitegrid")
fig_width = 8
fig_height = 6
In [3]:
results_path = '/home/chiroptera/QCThesis/datasets/gauss10e6_overlap/results_kmin.csv'
In [4]:
res = pd.read_csv(results_path)
for col in res.columns:
print col
In [5]:
res['csr_max_row'] = res.biggest_cluster * 3
res['csr_topped'] = (res['csr_max_row'] == res['max_degree']) & res.type_mat
res = res[np.logical_not(res.n_samples.isnull())]
#res.max_assoc = res.max_assoc.apply(lambda s: int(s.strip('()').split(',')[0]))
#res['final density'] = res.n_assocs * 1.0 / (res.n_samples ** 2)
res['samples_per_kmin']=res['n_samples'] / res['kmin']
res['assocs_per_samples'] = res['n_assocs'] / res['n_samples']
In [6]:
def sp_lin_area(n_s, n_e, val_s, val_e):
    # fraction of the constant pre-allocation kept when the per-row allocation
    # decays linearly from val_s (at row fraction n_s) to val_e (at n_e):
    # 1 minus the cut-away area (triangle + right rectangle) in the unit square
    tri = (n_e - n_s) * (val_s - val_e) / 2.0
    r_rect = (1.0 - n_e) * (val_s - val_e)
    cut_area = tri + r_rect
    return 1 - cut_area

def lin_max_n_assocs(n, bgs):
    # pre-allocation of the "sparse condensed linear" scheme: the constant
    # scheme (3 * biggest cluster per row) scaled by the linear cut above
    return n * bgs * 3 * sp_lin_area(0.05, 1.0, 1.0, 0.05)
full_idx = res.type_mat == 'full'
full_condensed_idx = res.type_mat == 'full condensed'
sp_complete_idx = res.type_mat == 'sparse complete'
sp_condensed_const_idx = res.type_mat == 'sparse condensed const'
sp_lin_idx = res.type_mat == 'sparse condensed linear'
sp_const_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const'])
sp_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const', 'sparse condensed linear'])
# maximum number of associations allowed / pre-allocated
res['max_n_assocs'] = 0
res.loc[full_idx,'max_n_assocs'] = np.int64(res.n_samples[full_idx] **2)
res.loc[full_condensed_idx,'max_n_assocs'] = np.int64(res.n_samples[full_condensed_idx] * (res.n_samples[full_condensed_idx] - 1) / 2)
res.loc[sp_const_idx,'max_n_assocs'] = np.int64(res.n_samples[sp_const_idx] * res.biggest_cluster[sp_const_idx] * 3)
res.loc[sp_lin_idx,'max_n_assocs'] = np.int64(lin_max_n_assocs(res.n_samples[sp_lin_idx], res.biggest_cluster[sp_lin_idx]))
# actual memory used to store the associations
res['mem'] = res.max_n_assocs
res.loc[sp_idx,'mem'] = res.max_n_assocs[sp_idx] * (1+4) # data + indices
res['density'] = 1.0 * res.max_n_assocs / (res.n_samples ** 2)
res['mem_density'] = 1.0 * res.mem / (res.n_samples ** 2)
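As a quick illustration of the four pre-allocation schemes above, the formulas can be evaluated for a single cardinality. The values of n and the biggest cluster size below are made up for illustration only, not taken from the results.
In [ ]:
# illustrative check of the pre-allocated sizes (hypothetical n and cluster size)
n, bgs = 1000, 50
print 'full            : {}'.format(n ** 2)
print 'full condensed  : {}'.format(n * (n - 1) / 2)
print 'sparse const    : {}'.format(n * bgs * 3)
print 'sparse linear   : {}'.format(int(lin_max_n_assocs(n, bgs)))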
In [7]:
by_rule_n = res.groupby(by=["rule", "n_samples", "type_mat"])
rule_n_mean = by_rule_n.apply(np.mean)
rule_n_mean = rule_n_mean[['kmin','kmax','t_ensemble','biggest_cluster',
'mean_degree','std_degree','min_degree',
'max_degree','n_assocs','n_max_degree', 'accuracy_CI', 'accuracy_CI_disk',
't_build', 't_sl', 't_sl_disk', 't_accuracy_CI',
't_accuracy_CI_disk', 'sl_clusts', 'sl_clusts_disk',
'csr_max_row', 'csr_topped', 'samples_per_kmin', 'max_n_assocs',
'mem', 'density', 'mem_density','assocs_per_samples']]
rule_n_mean = rule_n_mean.reset_index()
In [8]:
# rules in different plots
par_list = res['rule'].unique()
rows = 2
cols = 2
fig1 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig1.suptitle('Coassoc build time: comparison between matrix types for each rule', size=16)
fig = fig1
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [9]:
# rules in different plots
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig2 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig2.suptitle('Coassoc build time: comparison between rules for each matrix type', size=16)
fig = fig2
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore data sets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [10]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig3 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig3
fig.suptitle("Single-Link clustering time: comparison between matrix types for each rule", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
if type_mat in ('full condensed', 'sparse condensed linear'):
trace = '--'
ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [11]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig4 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig4
fig.suptitle("Single-Link clustering time: comparison between rules for each matrix type", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore data sets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [12]:
# rules in different plots
par_list = res.rule.unique()
rows = 2
cols = 2
fig5 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig5
fig.suptitle('Disk-based Single-Link clustering time: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
if grp2.t_sl_disk.isnull().all():
continue
idx = grp2.n_samples >= 5e2
ax = ax_par_dict[rule]
trace = '--' if type_mat == 'sparse condensed linear' else '-'
ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [13]:
# rules in different plots
par_list = ['sparse complete', 'sparse condensed const', 'sparse condensed linear']
rows = 2
cols = 2
fig6 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig6
fig.suptitle('Disk-based Single-Link clustering time: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
if grp2.t_sl_disk.isnull().any():
continue
idx = grp2.n_samples >= 5e2
ax = ax_par_dict[type_mat]
ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [14]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig7 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig7
fig.suptitle('Association density relative to a full complete matrix: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[type_mat]
trace = '-'
ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Relative density")
ax.set_title(par)
ax.legend(loc="upper right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [15]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig8 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig8
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
        if type_mat == 'sparse condensed const':
            trace = '--'
ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Relative density")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [16]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig9 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig9
fig.suptitle('Memory used relative to a full complete matrix: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[type_mat]
trace = '-'
ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax.set_title(par)
ax.legend(loc="upper right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [17]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig10 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig10
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
        if type_mat == 'sparse condensed const':
            trace = '--'
ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Memory ratio")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [18]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.mem_density[idx], trace, label=rule)
ax_s.plot(grp.n_samples[idx], grp.kmin[idx], trace, label=rule)
ax.plot([250000,250000],[0,10],'-.k')
ax.plot([500000,500000],[0,10],'-.k')
ax.plot([0,10e7],[0.1,0.1],'-.k')
ax.plot([0,10e7],[1,1],'-.k')
ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax_s.set_ylabel('$K_{min}$')
ax.legend(loc=(1.1, 0.0), title="Memory ratio") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax_s.set_xscale("log")
ax.set_yscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title="$K_{min}$")
This plot clearly shows the effect of $K_{min}$ on memory usage. Three of the rules exhibit roughly quadratic growth, while the "sk=300" rule grows linearly; a smaller $K_{min}$ translates into higher memory consumption. As the $K_{min}$ of the "sk=300" rule crosses that of the other rules, its memory usage drops below theirs.
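A minimal sketch of this memory argument, assuming the biggest cluster holds roughly $n / K_{min}$ samples; the fixed $K_{min}$ of the first rule below is purely illustrative and not one of the actual rules used in the experiments.
In [ ]:
def est_mem(n, kmin):
    # rough size of the largest cluster for a given kmin (illustrative assumption)
    biggest_cluster = n / float(kmin)
    # pre-allocation of the sparse condensed linear scheme (see In [6]),
    # with the same data + indices memory proxy used for the 'mem' column
    return lin_max_n_assocs(n, biggest_cluster) * (1 + 4)

for n in (1e4, 1e5, 1e6):
    fixed_kmin = 100.0   # roughly constant kmin -> memory grows ~quadratically with n
    sk_kmin = n / 300.0  # kmin proportional to n (sk=300) -> memory grows linearly with n
    print 'n = {:.0e}   fixed kmin: {:.2e}   sk=300: {:.2e}'.format(n, est_mem(n, fixed_kmin), est_mem(n, sk_kmin))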
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
fig.suptitle('Number of associations per sample', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.assocs_per_samples[idx], trace, label=rule)
#ax.plot([250000,250000],[0,10],'-.k')
#ax.plot([500000,500000],[0,10],'-.k')
#ax.plot([0,10e7],[0.1,0.1],'-.k')
#ax.plot([0,10e7],[1,1],'-.k')
ax.set_xlabel("Data set size [# samples]")
ax.set_ylabel('No. associations per sample')
ax.legend(loc='lower right', title="Rule") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [ ]:
# for each cardinality, check whether every non-null CI accuracy (in memory and
# from disk) is identical across all rules and matrix types
same_accuracy = {n_samples: True for n_samples in res.n_samples.unique()}
n_samples_accuracy = {n_samples: 0 for n_samples in res.n_samples.unique()}
for n_samples, grp in rule_n_mean.groupby(by='n_samples'):
#print n_samples,grp[['accuracy_CI','accuracy_CI_disk']]
#print grp.accuracy_CI
#print grp.accuracy_CI == grp.accuracy_CI[0]
first = True
first_score = 0
for score in np.nditer(grp[['accuracy_CI','accuracy_CI_disk']].values):
if pd.notnull(score):
if first:
first_score = score
n_samples_accuracy[n_samples] = score
first = False
if score != first_score:
same_accuracy[n_samples] = False
break
In [ ]:
print 'Are all accuracies the same for each cardinality?'
print '--------------------------------------------------'
print '# samples\tSame\tScore'
for n_samples, same in sorted(same_accuracy.iteritems(), key=lambda x: x[0]):
    print '{}\t\t{}\t{}'.format(int(n_samples), same, n_samples_accuracy[n_samples])
Accuracies are the same throughout the whole spectrum of cardinality, with the exception of the first 3 sets. The reason is that the 'sk=300' rule yields a very low $K_{min}$ at low cardinality; in these cases the number of clusters in the ensemble is lower than the true number of clusters, which is undesirable in EAC.
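The mean $K_{min}$ actually used by each rule on the smallest data sets can be read back from the results to confirm this; the 2000-sample threshold below is only meant to cover the first few cardinalities.
In [ ]:
# mean kmin per rule for the smallest data sets; 'sk=300' collapses to a very small
# kmin here, which may fall below the true number of clusters
small = rule_n_mean[rule_n_mean.n_samples <= 2000]
print small.groupby(['rule', 'n_samples']).kmin.mean()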
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig12 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig12
fig.suptitle('Accuracy (CI) evolution with number of samples', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
idx = (res.n_samples >= 1e3) & (res.type_mat == 'sparse condensed linear') & (res.rule == 'sk=300,th=30%')
ax.plot(res.n_samples[idx], res.accuracy_CI_disk[idx], '-', label='sk=300,th=30%')
#ax.plot([0,0,2500000,2500000],[0,1.1,0,1.1],'.k')
#ax.plot([500000,500000],[0,1],'-.k')
ax.set_xlabel("# samples")
ax.set_ylabel('Accuracy (CI)')
ax.legend(loc="best") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
#ax.set_yscale("log")
t_axis = plt.axis()
t_axis = (t_axis[0], t_axis[1], t_axis[2], t_axis[3] + 0.01)
plt.axis(t_axis)
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig13 = plt.figure(figsize=(fig_width*2 * cols, fig_height*2 * rows))
fig = fig13
fig.suptitle('Number of associations vs. pre-allocated maximum (sparse condensed linear)', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.n_assocs[idx], trace, label=rule)
ax_s.plot(grp.n_samples[idx], grp.max_n_assocs[idx], '-.', label=rule)
ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel('# assocs.', fontsize=16)
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True, which="both")
ax.legend(loc=(1.1, 0.0), title='Number of associations',fontsize='x-large') #loc=(1.1, 0.0))
#ax_s.set_ylabel('$K_{min}$')
ax_s.set_xscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title='Maximum number of assocs.')
In [ ]:
# ratio between the maximum row degree and the biggest cluster size, per rule
rows = 1
cols = 1
fig14 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
sns.set_palette(sns.color_palette("deep", 6))
ax = fig14.add_subplot(rows, cols, 1)
for key,grp in rule_n_mean.groupby(by="rule"):
idx = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
ax.plot(grp.n_samples[idx], grp.max_degree[idx] / grp.biggest_cluster[idx], label=key)
ax.set_xlabel("# samples")
ax.set_ylabel("max # assocs / biggest cluster size")
ax.set_title("Relationship between max. num. assocs and biggest cluster size per rule")
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
In [ ]:
n_array = res.n_samples.unique()
# NOTE: rule_x is assumed to be one of the kmin rule functions defined elsewhere;
# it is not defined in this notebook
x_kmin = map(rule_x, n_array)
In [ ]:
500 ** 2  # entries in a full 500 x 500 co-association matrix
In [ ]:
500 * 499 / 2  # entries in the condensed form (upper triangle, no diagonal)
In [ ]:
# number of associations actually stored for the 500-sample runs, per matrix type
res[res.n_samples == 500][['n_samples', 'type_mat', 'n_assocs', 'round']]