In [1]:
import matplotlib.pyplot as plt
#%matplotlib inline
import numpy as np
import os.path
import pandas as pd
import seaborn as sns
#from MyML.helper.plotting import save_fig

In [2]:
# Global plot styling: white grid background and default figure size (inches).
sns.set_style("whitegrid")
fig_width = 12   # default width used by every plotting cell below
fig_height = 8   # default height

In [ ]:
%matplotlib qt

In [3]:
import matplotlib
# High-resolution output for saved figures (600 dpi for publication-quality EPS/PNG).
matplotlib.rcParams['savefig.dpi'] = 600.0

In [4]:
plt.ioff()

In [5]:
# "Tableau 20" palette given as 0-255 RGB triples.
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]

# Convert every triple to the [0, 1] floats matplotlib expects.
tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]

# Reorder: the ten strong colors first, their pale partners afterwards,
# so consecutive lines in a plot get visually distinct colors.
tableau = tableau20[::2] + tableau20[1::2]
tableau = sns.color_palette(tableau, 20)
sns.palplot(tableau)
sns.set_palette(tableau, n_colors=20)

In [6]:
results_path = '/home/chiroptera/QCThesis/datasets/gauss10e6_overlap/results/results_kmin.csv'

In [7]:
results_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\gauss10_overlap\\results_kmin.csv'

In [8]:
# Load the benchmark results and list the available columns.
res = pd.read_csv(results_path)
for col in res.columns:
    # print(col) works under both Python 2 and Python 3; the original
    # Python-2-only statement form `print col` is a SyntaxError on Python 3.
    print(col)


Unnamed: 0
n_samples
rule
kmin
kmax
t_ensemble
t_build
t_sl
t_accuracy_CI
t_accuracy_H
t_sl_disk
t_store
t_accuracy_CI_disk
t_accuracy_H_disk
biggest_cluster
type_mat
n_assocs
n_max_degree
min_degree
max_degree
mean_degree
std_degree
accuracy_CI
accuracy_H
sl_clusts
accuracy_CI_disk
accuracy_H_disk
sl_clusts_disk
round
disk

In [9]:
# Derived columns used throughout the analysis.
res['csr_max_row'] = res.biggest_cluster * 3  # pre-allocated CSR row capacity
# NOTE(review): `& res.type_mat` ANDs a boolean Series with the string column
# type_mat — presumably relies on non-empty strings being truthy; verify intent.
res['csr_topped'] = (res['csr_max_row'] == res['max_degree']) & res.type_mat
# Drop rows whose n_samples is missing (incomplete benchmark runs).
res = res[np.logical_not(res.n_samples.isnull())]
#res.max_assoc = res.max_assoc.apply(lambda s: int(s.strip('()').split(',')[0]))
#res['final density'] = res.n_assocs * 1.0 / (res.n_samples ** 2)
res['samples_per_kmin']=res['n_samples'] / res['kmin']
res['assocs_per_samples'] = res['n_assocs'] / res['n_samples']

In [10]:
def sp_lin_area(n_s, n_e, val_s, val_e):
    """Fraction of the unit square kept after removing a linear cut.

    The cut is a triangle spanning [n_s, n_e] on the x axis, dropping from
    val_s to val_e, plus the rectangle to the right of n_e at the same drop.

    Returns 1 minus the removed area.
    """
    drop = val_s - val_e
    removed = (n_e - n_s) * drop / 2.0 + (1.0 - n_e) * drop
    return 1 - removed


def lin_max_n_assocs(n, bgs):
    """Pre-allocated association count for the 'sparse condensed linear'
    scheme: n * biggest_cluster * 3, scaled by the area kept by the
    linear-decay allocation rule (5% to 100% of n, 100% down to 5% value)."""
    return n * bgs * 3 * sp_lin_area(0.05, 1.0, 1.0, 0.05)

# Boolean masks, one per co-association matrix representation.
full_idx = res.type_mat == 'full'
full_condensed_idx = res.type_mat == 'full condensed'
sp_complete_idx = res.type_mat == 'sparse complete'
sp_condensed_const_idx = res.type_mat == 'sparse condensed const'
sp_lin_idx = res.type_mat == 'sparse condensed linear'

# Grouped masks: constant-size sparse schemes, and all sparse schemes.
sp_const_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const'])
sp_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const', 'sparse condensed linear'])

# maximum number of associations allowed / pre-allocated
res['max_n_assocs'] = 0
res.loc[full_idx,'max_n_assocs'] = np.int64(res.n_samples[full_idx] **2)
res.loc[full_condensed_idx,'max_n_assocs'] = np.int64(res.n_samples[full_condensed_idx] * (res.n_samples[full_condensed_idx] - 1) / 2)
res.loc[sp_const_idx,'max_n_assocs'] = np.int64(res.n_samples[sp_const_idx] * res.biggest_cluster[sp_const_idx] * 3)
res.loc[sp_lin_idx,'max_n_assocs'] = np.int64(lin_max_n_assocs(res.n_samples[sp_lin_idx], res.biggest_cluster[sp_lin_idx]))

# actual memory used to store the associations
# Sparse (CSR-like) storage pays a 5x factor: value + index bookkeeping.
res['mem'] = res.max_n_assocs
res.loc[sp_idx,'mem'] = res.max_n_assocs[sp_idx] * (1+4) # data + indices

# Densities are all expressed relative to a full n^2 matrix.
res['assoc density'] = 1.0 * res.n_assocs / (res.n_samples ** 2)
res['density'] = 1.0 * res.max_n_assocs / (res.n_samples ** 2)
res['mem_density'] = 1.0 * res.mem / (res.n_samples ** 2)

In [11]:
# Average every metric over benchmark rounds for each (rule, n_samples, type_mat) cell.
by_rule_n = res.groupby(by=["rule", "n_samples", "type_mat"])
rule_n_mean = by_rule_n.apply(np.mean)
# Keep only the numeric columns of interest for plotting.
rule_n_mean = rule_n_mean[['kmin','kmax','t_ensemble','biggest_cluster',
                           'mean_degree','std_degree','min_degree',
                           'max_degree','n_assocs','n_max_degree', 'accuracy_CI', 'accuracy_CI_disk',
                           't_build', 't_sl', 't_sl_disk', 't_accuracy_CI',
                           't_accuracy_CI_disk', 'sl_clusts', 'sl_clusts_disk',
                           'csr_max_row', 'csr_topped', 'samples_per_kmin', 'max_n_assocs',
                           'mem', 'assoc density', 'density', 'mem_density','assocs_per_samples']]
# Flatten the MultiIndex back into ordinary columns for easy groupby/plotting.
rule_n_mean = rule_n_mean.reset_index()

Build time


In [12]:
# Build time: one subplot per rule, one line per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig1 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
# (typo fix: "comparisson" -> "comparison" in the rendered title)
fig1.suptitle('Coassoc build time: comparison between matrix type for each rule', size=16)

fig = fig1

# One axes per rule, laid out on a rows x cols grid.
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}

sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat)

# .items() instead of the Python-2-only .iteritems() keeps this cell
# working under both Python 2 and Python 3.
for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [13]:
# Build time: one subplot per matrix type, one line per rule.
# (Renamed the loop variables: the original called the type_mat group `rule`
# and vice versa, which was misleading. Also fixed the suptitle typo and the
# Python-2-only .iteritems().)
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig2 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig2.suptitle('Coassoc build time: comparison between rules for each matrix type', size=16)

fig = fig2

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}

sns.set_palette(sns.color_palette("deep", 6))

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule)

for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

For saving: the same build-time plots, rendered as standalone figures and written to disk.


In [18]:
# rules in different plots
# Build time again, but one standalone figure per rule (for saving to disk);
# axes are collected in build_time_rule so the save cell below can find them.

build_time_rule = dict()

par_list = res['rule'].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)

fig_dict = build_time_rule

for rule, grp in rule_n_mean.groupby(by='rule'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[rule] = ax
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat,)

# NOTE(review): dict.iteritems() and tick.label are Python-2/matplotlib-1.x
# idioms; use .items() / tick.label1 when porting.
for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Combination execution time, {}={}'.format('rule',par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [19]:
# rules in different plots
# Same as the previous cell, but one standalone figure per matrix type.

build_time_type_mat = dict()

par_list = res['type_mat'].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)

fig_dict = build_time_type_mat

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[type_mat] = ax
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule,)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Combination execution time, {}={}'.format('type_mat',par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [20]:
# Save the build-time figures as EPS/PDF/PNG and free them.
# NOTE(review): the Linux img_path is immediately shadowed by the Windows one.
img_path = '/home/chiroptera/QCThesis/results/EAC/build_time/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\build_time\\'
for name, ax in build_time_type_mat.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del build_time_type_mat

for name, ax in build_time_rule.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del build_time_rule

Single-Link clustering time: main memory


In [23]:
# Single-Link (main-memory) clustering time: one subplot per rule.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig3 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig3
fig.suptitle("Single-Link clustering time: comparison between matrix types for each rule", size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        # dashed lines for the condensed variants so overlapping curves stay readable
        trace = '-'
        if type_mat in ('full condensed', 'sparse condensed linear'):
            trace = '--'
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], trace, label=type_mat)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [24]:
# Single-Link time: one subplot per matrix type, one line per rule.
# NOTE(review): the loop variables are swapped relative to their names —
# `rule` actually holds the type_mat group and vice versa.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig4 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig4
fig.suptitle("Single-Link clustering time: comparison between rules for each matrix type", size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}

sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='type_mat'):
    for type_mat, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=type_mat)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

For saving: standalone Single-Link (memory) figures written to disk.


In [25]:
# Single-Link (memory) time: one standalone figure per matrix type,
# one line per rule, collected for the save cell below.

sl_mem_time_type_mat = dict()
fig_dict = sl_mem_time_type_mat
par_list = res['type_mat'].unique()

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[type_mat] = ax
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=rule)

for par, ax in fig_dict.items():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    # BUG FIX: the original formatted the title with `par1_name`, which is not
    # defined in this cell (it only appears in a cell further down), so a fresh
    # top-to-bottom run raised NameError. Use the literal grouping key instead.
    ax.set_title('Single-Link (main memory) execution time, {}={}'.format('type_mat', par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [26]:
# rules in different plots
# Single-Link (memory) time: one standalone figure per rule, one line per type_mat.

sl_mem_time_rule = dict()
#par1_name, par2_name = 'type_mat', 'rule'
# Parameterized version: par1 selects the figure, par2 selects the line.
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)

fig_dict = sl_mem_time_rule


for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (main memory) execution time, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)


C:\Users\Isabella\Miniconda\lib\site-packages\matplotlib\pyplot.py:424: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

In [27]:
# Save the Single-Link (memory) figures and free them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_mem_time/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_mem_time\\'
for name, ax in sl_mem_time_type_mat.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del sl_mem_time_type_mat
    
for name, ax in sl_mem_time_rule.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del sl_mem_time_rule

Single-Link clustering time: disk


In [28]:
# rules in different plots
# Disk-based Single-Link time: one subplot per rule, one line per matrix type.
par_list = res.rule.unique()
rows = 2
cols = 2
fig5 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig5
fig.suptitle('Disk-based Single-Link clustering time: comparison between matrix type for each rule', size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        # skip matrix types that never ran the disk-based variant
        if grp2.t_sl_disk.isnull().all():
            continue
        idx = grp2.n_samples >= 5e2
        ax = ax_par_dict[rule]
        trace = '--' if type_mat == 'sparse condensed linear' else '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=type_mat)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [29]:
# rules in different plots
# Disk-based Single-Link time: one subplot per (sparse) matrix type.
# NOTE(review): ax_par_dict only has the three sparse types; the outer loop
# visits all types and relies on the isnull() `continue` to skip the full
# matrices — a KeyError would occur if a full type ever had disk timings.
par_list = ['sparse complete', 'sparse condensed const', 'sparse condensed linear']
rows = 2
cols = 2
fig6 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig6

fig.suptitle('Disk-based Single-Link clustering time: comparison between rules for each matrix type', size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        if grp2.t_sl_disk.isnull().any():
            continue
        idx = grp2.n_samples >= 5e2
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], label=rule)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Time [s]")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

For saving: standalone Single-Link (disk) figures written to disk.


In [30]:
# rules in different plots
# Disk-based Single-Link time: one standalone figure per matrix type.
# NOTE(review): for the full matrix types every rule is skipped (null disk
# times), leaving figures with no labelled lines — hence the "No labelled
# objects found" UserWarning seen in the output below.

sl_disk_time_type_mat = dict()
fig_dict = sl_disk_time_type_mat
par1_name, par2_name = 'type_mat', 'rule'
#par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)

for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        if grp2.t_sl_disk.isnull().any():
            continue
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        trace = '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (disk) execution time, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)


C:\Users\Isabella\Miniconda\lib\site-packages\matplotlib\axes\_axes.py:475: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [31]:
# rules in different plots
# Disk-based Single-Link time: one standalone figure per rule.

sl_disk_time_rule = dict()
fig_dict = sl_disk_time_rule
#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)


for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        if grp2.t_sl_disk.isnull().any():
            continue        
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        trace = '--' if par2 == 'sparse condensed linear' else '-'
        ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link (disk) execution time, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [32]:
# Save the disk-based Single-Link figures and free them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_disk_time/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_disk_time\\'
for name, ax in sl_disk_time_type_mat.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del sl_disk_time_type_mat

for name, ax in sl_disk_time_rule.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del sl_disk_time_rule

SLINK vs SL-MST vs SL-MST-Disk


In [36]:
# rules in different plots
# Compare the three Single-Link implementations (SLINK on the full condensed
# matrix, SL-MST and SL-MST-Disk on the sparse condensed linear matrix),
# one figure per rule.

sl_time_rule = dict()
fig_dict = sl_time_rule
#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)


for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    
    fc = grp.query('type_mat == "full condensed"')
    idx = fc.n_samples >= 1e3
    ax.plot(fc.n_samples[idx], fc.t_sl[idx], label='SLINK')
    
    fc = grp.query('type_mat == "sparse condensed linear"')
    idx = fc.n_samples >= 1e3
    ax.plot(fc.n_samples[idx], fc.t_sl[idx], label='SL-MST')
    ax.plot(fc.n_samples[idx], fc.t_sl_disk[idx], label='SL-MST-Disk')    

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('Single-Link execution time, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [37]:
# Save the SLINK / SL-MST / SL-MST-Disk comparison figures and free them.
img_path = '/home/chiroptera/QCThesis/results/EAC/sl_time/'
img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_time\\'
for name, ax in sl_time_rule.iteritems():
    f = ax.get_figure()
    f.savefig('{}{}.eps'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.pdf'.format(img_path,name),bbox_inches='tight')
    f.savefig('{}{}.png'.format(img_path,name),bbox_inches='tight')
    plt.close(f)
del sl_time_rule

Sparsity: maximum number of associations


In [38]:
# Pre-allocated association density: one subplot per matrix type, one line per rule.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig7 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig7
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix type for each rule', size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[type_mat]
        trace = '-'
        ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=rule)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Relative density")
    ax.set_title(par)
    ax.legend(loc="upper right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [39]:
# Pre-allocated association density: one subplot per rule, one line per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig8 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig8
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix type for each rule', size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        # dashed line for the const-condensed scheme (replaces the original's
        # convoluted `any(map(lambda ...))` single-element membership test)
        trace = '--' if type_mat == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=type_mat)


for par, ax in ax_par_dict.items():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Relative density")
    ax.set_title(par)
    ax.legend(loc="lower right")#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # Axes.get_axes() (deprecated in matplotlib 1.5, later removed) simply
    # returned the axes itself; call axis() directly to read (ymin, ymax).
    ymin,ymax = ax.axis()[2:]
    ax.set_ylim((ymin,ymax+0.1))

for saving


In [45]:
# Save all density figures (EPS/PDF/PNG) and free them.
# NOTE(review): this cell consumes dicts (allocated_density_*, mem_density_*,
# assoc_density) that are defined in cells placed *below* it in the notebook;
# a fresh Restart-&-Run-All raises NameError here. Consider moving this cell
# after those definitions.

def save_and_close(fig_dict, base_path):
    """Save each axes' figure as .eps/.pdf/.png under base_path and close it."""
    for name, ax in fig_dict.items():
        f = ax.get_figure()
        f.savefig('{}{}.eps'.format(base_path, name), bbox_inches='tight')
        f.savefig('{}{}.pdf'.format(base_path, name), bbox_inches='tight')
        f.savefig('{}{}.png'.format(base_path, name), bbox_inches='tight')
        plt.close(f)

img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\allocated_density\\'
save_and_close(allocated_density_type_mat, img_path)
del allocated_density_type_mat

save_and_close(allocated_density_rule, img_path)
del allocated_density_rule

img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\mem_density\\'
save_and_close(mem_density_type_mat, img_path)
# BUG FIX: the original never deleted mem_density_type_mat, unlike every
# other dict in this cell, leaking the saved figures' axes references.
del mem_density_type_mat

save_and_close(mem_density_rule, img_path)
del mem_density_rule

img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\assoc_density\\'
save_and_close(assoc_density, img_path)
del assoc_density

In [ ]:
%matplotlib qt

In [ ]:
plt.ioff()

In [40]:
# Memory density: one standalone figure per rule, one line per matrix type.

mem_density_rule= dict()
fig_dict = mem_density_rule
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()

for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        # dot-dash trace singles out the const-condensed scheme
        trace = '.-' if par2 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'mem_density'], trace, label=par2)

for par, ax in fig_dict.items():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: the original formatted the title with `par1`, the stale loop
    # variable left over from the figure-building loop, so every figure got
    # the same (last) title. Use `par`, the key of the figure being styled.
    ax.set_title('Memory used relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    # ax.axis() replaces the deprecated ax.get_axes().axis()
    ymin,ymax = ax.axis()[2:]
    ax.set_ylim((ymin,ymax+0.1))
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [41]:
# Memory density: one standalone figure per matrix type, one line per rule.

mem_density_type_mat = dict()
fig_dict = mem_density_type_mat
par1_name, par2_name = 'type_mat', 'rule'

par_list = res[par1_name].unique()


for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        # NOTE(review): the condition is on par1 (the figure's type_mat), so
        # every line of the 'sparse condensed const' figure is dot-dashed —
        # kept as in the original; confirm this is intended.
        trace = '.-' if par1 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'mem_density'], trace, label=par2)

for par, ax in fig_dict.items():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: original used the stale `par1` from the previous loop, giving
    # every figure the same title; `par` is the figure actually being styled.
    ax.set_title('Memory used relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ymin,ymax = ax.axis()[2:]
    ax.set_ylim((ymin,ymax+0.1))
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [42]:
# Actual association density: one figure per matrix type, but only for the
# two condensed sparse schemes (the others are uninformative here).

assoc_density = dict()
fig_dict = assoc_density
par1_name, par2_name = 'type_mat', 'rule'

par_list = res[par1_name].unique()

for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != 'sparse condensed const' and par1 != 'sparse condensed linear':
        continue
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax

    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        # per-figure style: all lines dot-dashed on the const-condensed figure
        trace = '.-' if par1 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'assoc density'], trace, label=par2)

for par, ax in fig_dict.items():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    # BUG FIX: original used the stale `par1` from the previous loop, giving
    # every figure the same title; `par` is the figure actually being styled.
    ax.set_title('Density of associations relative to full matrix, {}={}'.format(par1_name, par), fontsize=20)
    ax.legend(loc="upper right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ymin,ymax = ax.axis()[2:]
    ax.set_ylim((ymin,ymax+0.1))
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [43]:
# rules in different plots
# Allocated-association density: one standalone figure per matrix type.
# (This cell correctly uses `par` in the title loop, unlike its siblings.)

allocated_density_type_mat = dict()
fig_dict = allocated_density_type_mat
par1_name, par2_name = 'type_mat', 'rule'
#par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)

for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        trace = '-'
        if any(map(lambda x: par1==x,['sparse condensed const'])):
            trace = '.-'        
        ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    ax.set_title('Density of allocated associations relative to full matrix, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="upper right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ymin,ymax = ax.get_axes().axis()[2:]
    ax.set_ylim((ymin,ymax+0.1))
    #,fontsize=16
    #,prop={'size':16}
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

In [44]:
# Allocated-association density relative to the full matrix: one figure
# per rule, one curve per matrix type.

allocated_density_rule = dict()
fig_dict = allocated_density_rule
#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()

for par1, grp in rule_n_mean.groupby(by=par1_name):
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        keep = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        # dotted markers distinguish the const variant
        trace = '.-' if par2 == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[keep], grp2.density[keep], trace, label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Density", fontsize=16)
    ax.set_title('Density of allocated associations relative to full matrix, {}={}'.format(par1_name,par), fontsize=20)
    ax.legend(loc="upper right", prop={'size':16})
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ymin, ymax = ax.get_axes().axis()[2:]
    ax.set_ylim((ymin, ymax + 0.1))
    for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)

Sparsity: memory


In [ ]:
# Relative memory usage: one subplot per matrix type, one curve per rule.
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig9 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig9
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix type for each rule', size=16)

# one axes per matrix type, laid out on a rows x cols grid
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}

sns.set_palette(sns.color_palette("deep", 6))

for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    ax = ax_par_dict[type_mat]
    for rule, grp2 in grp.groupby(by='rule'):
        keep = grp2.n_samples >= 1e3  # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[keep], grp2.mem_density[keep], '-', label=rule)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel('Memory ratio')
    ax.set_title(par)
    ax.legend(loc="upper right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

In [ ]:
# Relative memory usage: one subplot per rule, one curve per matrix type.
par_list = res['rule'].unique()
rows = 2
cols = 2
fig10 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig10
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix type for each rule', size=16)

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}


sns.set_palette(sns.color_palette("deep", 6))

for rule, grp in rule_n_mean.groupby(by='rule'):
    for type_mat, grp2 in grp.groupby(by='type_mat'):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax = ax_par_dict[rule]
        # BUG FIX: `type_mat in ('sparse condensed const')` tested substring
        # membership in a plain string (parentheses without a comma do not
        # make a tuple), so any substring such as 'sparse' would also match.
        # Use equality instead.
        trace = '--' if type_mat == 'sparse condensed const' else '-'
        ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=type_mat)

for par, ax in ax_par_dict.iteritems():
    ax.set_xlabel("# samples")
    ax.set_ylabel("Memory ratio")
    ax.set_title(par)
    ax.legend(loc="lower right")
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")

Memory / $K_{min}$ relationship


In [ ]:
# Memory ratio (left axis) and $K_{min}$ (right axis) of the sparse
# condensed linear matrix, one curve per rule.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig11
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=16)

sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()  # secondary y axis for kmin

for rule, grp in rule_n_mean.groupby(by='rule'):
    # keep only the sparse condensed linear rows with >= 1000 samples
    keep = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
    ax.plot(grp.n_samples[keep], grp.mem_density[keep], '-', label=rule)
    ax_s.plot(grp.n_samples[keep], grp.kmin[keep], '-', label=rule)

# vertical guides at 250k and 500k samples
ax.plot([250000,250000],[0,10],'-.k')
ax.plot([500000,500000],[0,10],'-.k')

# horizontal guides at memory ratios 0.1 and 1
ax.plot([0,10e6],[0.1,0.1],'-.k')
ax.plot([0,10e6],[1,1],'-.k')

ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax_s.set_ylabel('$K_{min}$')

ax.legend(loc=(1.1, 0.0), title="Memory ratio")
ax.grid(True, which="both")
ax.set_xscale("log")
ax_s.set_xscale("log")
ax.set_yscale("log")
ax_s.set_yscale("log")

ax_s.legend(loc=(1.1, 0.8), title="$K_{min}$")

Analyzing this plot is interesting: we can clearly see the effect of $K_{min}$ on memory usage. Three of the rules show quadratic growth, while the "sk=300" rule grows linearly. It is clear that a smaller $K_{min}$ translates into higher memory consumption. We can see that as the "sk=300" rule crosses the other rules, its corresponding memory usage becomes lower than that of those rules.


In [46]:
# Number of associations per sample for the sparse condensed linear matrix,
# one curve per rule; saved in eps/png/pdf.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig11

ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']

for rule, grp in rule_n_mean.groupby(by='rule'):
    # keep only the sparse condensed linear rows with >= 1000 samples
    keep = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
    ax.plot(grp.n_samples[keep], grp.assocs_per_samples[keep], '-', label=rule)

ax.set_xlabel("# samples",fontsize=16)
ax.set_ylabel('No. associations per sample',fontsize=16)
ax.set_title('Number of associations per sample', size=20)

ax.legend(loc='lower right', title="Rule",prop={'size':16})
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)

for fig_format in ('eps','png','pdf'):
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}assocs_per_sample.{}'.format(img_path,fig_format), bbox_inches='tight')

Accuracy


In [ ]:
# For each cardinality, check whether every (rule, matrix-type) combination
# reached the same accuracy, considering both the in-memory and the
# disk-based CI scores.
same_accuracy = {n_samples:True for n_samples in res.n_samples.unique()}
# first non-null accuracy observed for each cardinality (reference value)
n_samples_accuracy = {n_samples:0 for n_samples in res.n_samples.unique()}
for n_samples, grp in rule_n_mean.groupby(by='n_samples'):
    #print n_samples,grp[['accuracy_CI','accuracy_CI_disk']]
    #print grp.accuracy_CI
    #print grp.accuracy_CI == grp.accuracy_CI[0]
    first = True
    first_score = 0
    # iterate element-wise over both accuracy columns of this group
    for score in np.nditer(grp[['accuracy_CI','accuracy_CI_disk']].values):
        if pd.notnull(score):
            if first:
                # the first valid score becomes the reference for this group
                first_score = score
                n_samples_accuracy[n_samples] = score
                first = False
            if score != first_score:
                # any deviation marks this cardinality as inconsistent
                same_accuracy[n_samples] = False
                break

In [ ]:
print 'All accuracies are the same:'
print '----------------------------'
print '# samples\tSame'
for item in sorted(same_accuracy.iteritems(), key=lambda x: x[0]):
    print '{}\t\t{}\t{}'.format(int(item[0]), item[1], n_samples_accuracy[item[0]])

Accuracies are the same throughout the whole spectrum of cardinality, with the exception of the first 3 sets. The reason for this is that the 'sk=300' rule yields a very low $K_{min}$ for low cardinality. In these cases the number of clusters in the ensemble is lower than the true number, which is undesirable in EAC.


In [47]:
# Accuracy (CI) vs. number of samples for the sk=300 rule on the sparse
# condensed linear matrix; saved in eps/png/pdf.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig12 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig12


sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']

idx = (res.n_samples >= 1e3) & (res.type_mat == 'sparse condensed linear') & (res.rule == 'sk=300,th=30%')
trace='-'
ax.plot(res.n_samples[idx], res.accuracy_CI_disk[idx], trace, label='sk=300,th=30%')

ax.set_xlabel("# samples", fontsize=16)
# BUG FIX: the y-label said 'Memory ratio' (copy-paste from the memory
# plots) but this axis shows the consistency-index accuracy.
ax.set_ylabel('Accuracy (CI)', fontsize=16)
ax.set_title('Accuracy (CI) evolution with number of samples', size=20)
ax.legend(prop={'size':16}, loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")
# nudge the top of the y range so the highest point is not clipped
t_axis = plt.axis()
t_axis = (t_axis[0], t_axis[1], t_axis[2], t_axis[3] + 0.01)
plt.axis(t_axis)
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)

for fig_format in ('eps','png','pdf'):
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}accuracy.{}'.format(img_path,fig_format), bbox_inches='tight')

In [48]:
# Number of associations (left axis, solid) vs. maximum number of
# associations (right axis, dash-dot) of the sparse condensed linear
# matrix, one curve per rule.
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig13 = plt.figure(figsize=(fig_width*2 * cols, fig_height*2 * rows))

fig = fig13
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=20)

sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()  # secondary axis for the maximum association counts

for rule, grp in rule_n_mean.groupby(by='rule'):
    # keep only the sparse condensed linear rows with >= 1000 samples
    keep = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
    ax.plot(grp.n_samples[keep], grp.n_assocs[keep], '-', label=rule)
    ax_s.plot(grp.n_samples[keep], grp.max_n_assocs[keep], '-.', label=rule)

ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel('# assocs.', fontsize=16)
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True, which="both")
ax.legend(loc=(1.1, 0.0), title='Number of associations',fontsize=16,prop={'size':16})
for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)

ax_s.set_xscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title='Maximum number of assocs.',fontsize=16)


Out[48]:
<matplotlib.legend.Legend at 0x25009550>

In [49]:
# Ratio between the maximum number of associations and the biggest cluster
# size, one curve per rule (sparse condensed linear only); saved to disk.
rows = 1
cols = 1
fig14 = plt.figure(figsize=(fig_width * cols, fig_height * rows))

fig = fig14

ax = fig.add_subplot(rows, cols, 1)
for key, grp in rule_n_mean.groupby(by="rule"):
    keep = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
    ratio = grp.max_degree[keep] / grp.biggest_cluster[keep]
    ax.plot(grp.n_samples[keep], ratio, label=key)

ax.set_xlabel("# samples",fontsize=16)
ax.set_ylabel("max # assocs / biggest cluster size",fontsize=16)
ax.set_title("Relationship between max. num. assocs and biggest cluster size per rule",fontsize=20)
ax.legend(loc="lower right",prop={'size':16})
ax.grid(True, which="both")
ax.set_xscale("log")
for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)

for fig_format in ('eps','png','pdf'):
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}max_assocs_bgs.{}'.format(img_path,fig_format), bbox_inches='tight')
plt.close(fig)

delete bad names from images


In [64]:
# Rename result files whose rule/matrix names contain characters LaTeX
# cannot handle. Only the first matching pattern is applied per file,
# in the order listed (longer/more specific patterns first).
import glob
import os

renames = [
    ('sk=300,th=30%', 'sk=300'),
    ('sk=sqrt_2,th=30%', 'sk=sqrt_2'),
    ('full condensed', 'full_condensed'),
    ('sparse complete', 'sparse_complete'),
    ('sparse condensed const', 'sparse_condensed_const'),
    ('sparse condensed linear', 'sparse_condensed_linear'),
]

for d in glob.glob('C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\*'):
    for f in glob.glob(d + '\\*'):
        for old, new in renames:
            if old in f:
                os.rename(f, f.replace(old, new))
                break

Kmin evolution per rule


In [50]:
# rules in different plots

kmin_per_rule = dict()
fig_dict = kmin_per_rule
title = '$K_{min}$ evolution per rule'


#par1_name : diff plots
#par2_name : different curves in plot

par1_name, par2_name = 'type_mat', 'rule'
#par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)



for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != 'sparse condensed linear':
        continue
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'kmin'], label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}={}'.format(title,par1_name,par), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #fontsize=16
    #prop={'size':16
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)     

for fig_format in ('eps','png','pdf'):
    #fig.savefig('/home/chiroptera/QCThesis/results/EAC/kmin_evolution.{}'.format(fig_format), bbox_inches='tight')
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}kmin_evolution.{}'.format(img_path,fig_format), bbox_inches='tight')    
#plt.close(fig)

SL time per rule


In [51]:
# rules in different plots

sl_different_methods = dict()
fig_dict = sl_different_methods
title = 'SLINK vs SL-MST vs SL-MST-Disk'


#par1_name : diff plots
#par2_name : different curves in plot

#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)



for par1, grp in rule_n_mean.groupby(by=par1_name):
    if par1 != '2sqrt':
        continue
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax = fig.add_subplot(111)
    fig_dict[par1] = ax
    for par2, grp2 in grp.groupby(by=par2_name):
        if par2 == 'full' or par2 == 'sparse condensed const':
            continue
        idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'t_sl'], label=par2)
        if par2 == 'full condensed':
            continue
        ax.plot(grp2.n_samples[idx], grp2.loc[idx,'t_sl_disk'], label=par2)

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}={}'.format(title,par1_name,par), fontsize=20)
    ax.legend(['SLINK','SL-MST complete','SL-MST-Disk complete','SL-MST condensed','SL-MST-Disk condensed'],loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #fontsize=16
    #prop={'size':16
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)     
    
for fig_format in ('eps','png','pdf'):
    #fig.savefig('/home/chiroptera/QCThesis/results/EAC/sl_time/slink_vs_sl-mst.{}'.format(fig_format), bbox_inches='tight')
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\sl_time\\'
    fig.savefig('{}slink_vs_sl-mst.{}'.format(img_path,fig_format), bbox_inches='tight')    
#plt.close(fig)

In [ ]:
# Preview the Tableau-20 palette defined in the setup cells.
sns.palplot(tableau)

need ensemble time


In [58]:
# Load the separately-measured ensemble production times and merge them
# into rule_n_mean as the 't_ensemble' column.
ensemble_time_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\gauss10_overlap\\ensemble_results_kmin.csv'
res_t_ensemble = pd.read_csv(ensemble_time_path)
res_t_ensemble = res_t_ensemble[['n_samples','rule','kmin','kmax','t_ensemble']].dropna(axis=0)
res_t_ensemble = res_t_ensemble.query('n_samples >= 5e2')

for (n, rule), grp1 in rule_n_mean.groupby(by=['n_samples','rule']):
    # single measured ensemble time for this (cardinality, rule) pair
    t = res_t_ensemble.query('n_samples=={} & rule=="{}"'.format(n, rule))['t_ensemble'].values[0]
    rule_n_mean.loc[grp1.index, 't_ensemble'] = t

Ensemble time


In [59]:
# rules in different plots

ensemble_time = dict()
fig_dict = ensemble_time
title = 'Production execution time'

#par1_name : diff plots
#par2_name : different curves in plot

#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)



fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[0] = ax   
for par1, grp in rule_n_mean.groupby(by=par1_name):

    ax.plot(grp.n_samples, grp.loc[:,'t_ensemble'], label= '{}'.format(par1))
    #build = grp.query('type_mat == "sparse condensed linear"').loc[:,['n_samples','t_build']]
    #ax.plot(build.loc[:,'n_samples'], build.loc[:,'t_build'], label= 'build time {}'.format(par1))

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}{}'.format(title,'',''), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #fontsize=16
    #prop={'size':16
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)     
    
for fig_format in ('eps','png','pdf'):
    #fig.savefig('/home/chiroptera/QCThesis/results/EAC/ensemble_time.{}'.format(fig_format), bbox_inches='tight')
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}ensemble_time.{}'.format(img_path,fig_format), bbox_inches='tight')

Total Time


In [60]:
total = rule_n_mean.query('type_mat == "sparse condensed linear"').loc[:,['rule','n_samples','t_build','t_ensemble','t_sl','t_sl_disk']]
total['total_mem'] = total[['t_build','t_ensemble','t_sl']].sum(axis=1)
total['total_disk'] = total[['t_build','t_ensemble','t_sl_disk']].sum(axis=1)

In [61]:
# rules in different plots

ensemble_time = dict()
fig_dict = ensemble_time
title = 'Total execution with SL-MST'

#par1_name : diff plots
#par2_name : different curves in plot

#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)



fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[0] = ax
for par1, grp in total.groupby(by=par1_name):
    grp= grp.dropna()
    ax.plot(grp.n_samples, grp.loc[:,'total_mem'], label= '{}'.format(par1))
    #build = grp.query('type_mat == "sparse condensed linear"').loc[:,['n_samples','t_build']]
    #ax.plot(build.loc[:,'n_samples'], build.loc[:,'t_build'], label= 'build time {}'.format(par1))

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}{}'.format(title,'',''), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #fontsize=16
    #prop={'size':16
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14) 
    
for fig_format in ('eps','png','pdf'):
    #fig.savefig('/home/chiroptera/QCThesis/results/EAC/total_time_sl-mst.{}'.format(fig_format), bbox_inches='tight')
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}total_time_sl-mst.{}'.format(img_path,fig_format), bbox_inches='tight')

In [62]:
# rules in different plots

ensemble_time = dict()
fig_dict = ensemble_time
title = 'Total execution with SL-MST-Disk'

#par1_name : diff plots
#par2_name : different curves in plot

#par1_name, par2_name = 'type_mat', 'rule'
par1_name, par2_name = 'rule', 'type_mat'

par_list = res[par1_name].unique()
#fig2.suptitle('Coassoc build time: comparisson between rules for each matrix type', size=16)



fig = plt.figure(figsize=(fig_width, fig_height))
ax = fig.add_subplot(111)
fig_dict[par1] = ax
for par1, grp in total.groupby(by=par1_name):

    ax.plot(grp.n_samples, grp.loc[:,'total_disk'], label= '{}'.format(par1))
    #build = grp.query('type_mat == "sparse condensed linear"').loc[:,['n_samples','t_build']]
    #ax.plot(build.loc[:,'n_samples'], build.loc[:,'t_build'], label= 'build time {}'.format(par1))

for par, ax in fig_dict.iteritems():
    ax.set_xlabel("# samples",fontsize=16)
    ax.set_ylabel("Time [s]", fontsize=16)
    ax.set_title('{}, {}{}'.format(title,'',''), fontsize=20)
    ax.legend(loc="lower right", prop={'size':16})#loc=(1.1, 0.0))
    ax.grid(True, which="both")
    ax.set_xscale("log")
    ax.set_yscale("log")
    #fontsize=16
    #prop={'size':16
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(14)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(14)     
    
for fig_format in ('eps','png','pdf'):
    #fig.savefig('/home/chiroptera/QCThesis/results/EAC/total_time_sl-mst-disk.{}'.format(fig_format), bbox_inches='tight')
    img_path = 'C:\\Users\\Isabella\\Downloads\\working_data\\working_data\\results\\eac\\'
    fig.savefig('{}total_time_sl-mst-disk.{}'.format(img_path,fig_format), bbox_inches='tight')

Finding out if there are sparse co-association matrices with fewer associations than full co-association matrices


In [ ]:
# Expected kmin for every cardinality in the results.
# NOTE(review): `rule_x` is not defined anywhere in this notebook view —
# confirm an earlier cell defines it before running.
n_array = res.n_samples.unique()
x_kmin = map(rule_x, n_array)

In [ ]:
# number of entries of a full 500x500 co-association matrix
500 **2

In [ ]:
# number of entries of the condensed (upper-triangular, no diagonal) matrix for 500 samples
500 * 499 / 2

In [ ]:
# inspect the association counts per matrix type for the 500-sample datasets
res[res.n_samples == 500][['n_samples', 'type_mat', 'n_assocs', 'round']]