In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os.path
import pandas as pd
import seaborn as sns
from MyML.helper.plotting import save_fig
In [2]:
sns.set_style("whitegrid")
fig_width = 8
fig_height = 6
In [3]:
results_path = '/home/chiroptera/QCThesis/datasets/gauss10e6_overlap/results_kmin.csv'
In [4]:
res = pd.read_csv(results_path)
for col in res.columns:
print col
In [5]:
res['csr_max_row'] = res.biggest_cluster * 3
res['csr_topped'] = (res['csr_max_row'] == res['max_degree']) & res.type_mat
res = res[np.logical_not(res.n_samples.isnull())]
#res.max_assoc = res.max_assoc.apply(lambda s: int(s.strip('()').split(',')[0]))
#res['final density'] = res.n_assocs * 1.0 / (res.n_samples ** 2)
res['samples_per_kmin']=res['n_samples'] / res['kmin']
res['assocs_per_samples'] = res['n_assocs'] / res['n_samples']
In [6]:
def sp_lin_area(n_s, n_e, val_s, val_e):
    # fraction of the constant pre-allocation kept when the per-row allocation
    # decays linearly from val_s (at row fraction n_s) to val_e (at n_e):
    # 1 minus the cut-away area (triangle + right rectangle) in the unit square
    tri = (n_e - n_s) * (val_s - val_e) / 2.0
    r_rect = (1.0 - n_e) * (val_s - val_e)
    cut_area = tri + r_rect
    return 1 - cut_area

def lin_max_n_assocs(n, bgs):
    # pre-allocation of the "sparse condensed linear" scheme: the constant
    # scheme (3 * biggest cluster per row) scaled by the linear cut above
    return n * bgs * 3 * sp_lin_area(0.05, 1.0, 1.0, 0.05)
full_idx = res.type_mat == 'full'
full_condensed_idx = res.type_mat == 'full condensed'
sp_complete_idx = res.type_mat == 'sparse complete'
sp_condensed_const_idx = res.type_mat == 'sparse condensed const'
sp_lin_idx = res.type_mat == 'sparse condensed linear'
sp_const_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const'])
sp_idx = res.type_mat.isin(['sparse complete', 'sparse condensed const', 'sparse condensed linear'])
# maximum number of associations allowed / pre-allocated
res['max_n_assocs'] = 0
res.loc[full_idx,'max_n_assocs'] = np.int64(res.n_samples[full_idx] **2)
res.loc[full_condensed_idx,'max_n_assocs'] = np.int64(res.n_samples[full_condensed_idx] * (res.n_samples[full_condensed_idx] - 1) / 2)
res.loc[sp_const_idx,'max_n_assocs'] = np.int64(res.n_samples[sp_const_idx] * res.biggest_cluster[sp_const_idx] * 3)
res.loc[sp_lin_idx,'max_n_assocs'] = np.int64(lin_max_n_assocs(res.n_samples[sp_lin_idx], res.biggest_cluster[sp_lin_idx]))
# actual memory used to store the associations
res['mem'] = res.max_n_assocs
res.loc[sp_idx,'mem'] = res.max_n_assocs[sp_idx] * (1+4) # data + indices
res['density'] = 1.0 * res.max_n_assocs / (res.n_samples ** 2)
res['mem_density'] = 1.0 * res.mem / (res.n_samples ** 2)
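As a quick illustration of the four pre-allocation schemes above, the formulas can be evaluated for a single cardinality. The values of n and the biggest cluster size below are made up for illustration only, not taken from the results.
In [ ]:
# illustrative check of the pre-allocated sizes (hypothetical n and cluster size)
n, bgs = 1000, 50
print 'full            : {}'.format(n ** 2)
print 'full condensed  : {}'.format(n * (n - 1) / 2)
print 'sparse const    : {}'.format(n * bgs * 3)
print 'sparse linear   : {}'.format(int(lin_max_n_assocs(n, bgs)))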
In [7]:
by_rule_n = res.groupby(by=["rule", "n_samples", "type_mat"])
rule_n_mean = by_rule_n.apply(np.mean)
rule_n_mean = rule_n_mean[['kmin','kmax','t_ensemble','biggest_cluster',
'mean_degree','std_degree','min_degree',
'max_degree','n_assocs','n_max_degree', 'accuracy_CI', 'accuracy_CI_disk',
't_build', 't_sl', 't_sl_disk', 't_accuracy_CI',
't_accuracy_CI_disk', 'sl_clusts', 'sl_clusts_disk',
'csr_max_row', 'csr_topped', 'samples_per_kmin', 'max_n_assocs',
'mem', 'density', 'mem_density','assocs_per_samples']]
rule_n_mean = rule_n_mean.reset_index()
In [8]:
# rules in different plots
par_list = res['rule'].unique()
rows = 2
cols = 2
fig1 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig1.suptitle('Coassoc build time: comparison between matrix types for each rule', size=16)
fig = fig1
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [9]:
# rules in different plots
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig2 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig2.suptitle('Coassoc build time: comparison between rules for each matrix type', size=16)
fig = fig2
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore data sets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_build[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [10]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig3 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig3
fig.suptitle("Single-Link clustering time: comparison between matrix types for each rule", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
if type_mat in ('full condensed', 'sparse condensed linear'):
trace = '--'
ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [11]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig4 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig4
fig.suptitle("Single-Link clustering time: comparison between rules for each matrix type", size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
    for rule, grp2 in grp.groupby(by='rule'):
        idx = grp2.n_samples >= 1e3  # ignore data sets smaller than 1000
        ax = ax_par_dict[type_mat]
        ax.plot(grp2.n_samples[idx], grp2.t_sl[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [12]:
# rules in different plots
par_list = res.rule.unique()
rows = 2
cols = 2
fig5 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig5
fig.suptitle('Disk-based Single-Link clustering time: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
if grp2.t_sl_disk.isnull().all():
continue
idx = grp2.n_samples >= 5e2
ax = ax_par_dict[rule]
trace = '--' if type_mat == 'sparse condensed linear' else '-'
ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [13]:
# rules in different plots
par_list = ['sparse complete', 'sparse condensed const', 'sparse condensed linear']
rows = 2
cols = 2
fig6 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig6
fig.suptitle('Disk-based Single-Link clustering time: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
if grp2.t_sl_disk.isnull().any():
continue
idx = grp2.n_samples >= 5e2
ax = ax_par_dict[type_mat]
ax.plot(grp2.n_samples[idx], grp2.t_sl_disk[idx], label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Time [s]")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [14]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig7 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig7
fig.suptitle('Association density relative to a full complete matrix: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[type_mat]
trace = '-'
ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Relative density")
ax.set_title(par)
ax.legend(loc="upper right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [15]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig8 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig8
fig.suptitle('Association density relative to a full complete matrix: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
        if type_mat == 'sparse condensed const':
            trace = '--'
ax.plot(grp2.n_samples[idx], grp2.density[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Relative density")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [16]:
par_list = res['type_mat'].unique()
rows = 3
cols = 2
fig9 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig9
fig.suptitle('Memory used relative to a full complete matrix: comparison between rules for each matrix type', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for type_mat, grp in rule_n_mean.groupby(by='type_mat'):
for rule, grp2 in grp.groupby(by='rule'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[type_mat]
trace = '-'
ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=rule)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax.set_title(par)
ax.legend(loc="upper right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [17]:
par_list = res['rule'].unique()
rows = 2
cols = 2
fig10 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig10
fig.suptitle('Memory used relative to a full complete matrix: comparison between matrix types for each rule', size=16)
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
sns.set_palette(sns.color_palette("deep", 6))
for rule, grp in rule_n_mean.groupby(by='rule'):
for type_mat, grp2 in grp.groupby(by='type_mat'):
idx = grp2.n_samples >= 1e3 # ignore datasets smaller than 1000
ax = ax_par_dict[rule]
trace = '-'
        if type_mat == 'sparse condensed const':
            trace = '--'
ax.plot(grp2.n_samples[idx], grp2.mem_density[idx], trace, label=type_mat)
for par, ax in ax_par_dict.iteritems():
ax.set_xlabel("# samples")
ax.set_ylabel("Memory ratio")
ax.set_title(par)
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [18]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
fig.suptitle('Relative memory usage of sparse condensed linear and $K_{min}$ evolution', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.mem_density[idx], trace, label=rule)
ax_s.plot(grp.n_samples[idx], grp.kmin[idx], trace, label=rule)
ax.plot([250000,250000],[0,10],'-.k')
ax.plot([500000,500000],[0,10],'-.k')
ax.plot([0,10e7],[0.1,0.1],'-.k')
ax.plot([0,10e7],[1,1],'-.k')
ax.set_xlabel("# samples")
ax.set_ylabel('Memory ratio')
ax_s.set_ylabel('$K_{min}$')
ax.legend(loc=(1.1, 0.0), title="Memory ratio") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax_s.set_xscale("log")
ax.set_yscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title="$K_{min}$")
This plot clearly shows the effect of $K_{min}$ on memory usage. Three of the rules exhibit roughly quadratic growth, while the "sk=300" rule grows linearly; a smaller $K_{min}$ translates into higher memory consumption. As the $K_{min}$ of the "sk=300" rule crosses that of the other rules, its memory usage drops below theirs.
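A minimal sketch of this memory argument, assuming the biggest cluster holds roughly $n / K_{min}$ samples; the fixed $K_{min}$ of the first rule below is purely illustrative and not one of the actual rules used in the experiments.
In [ ]:
def est_mem(n, kmin):
    # rough size of the largest cluster for a given kmin (illustrative assumption)
    biggest_cluster = n / float(kmin)
    # pre-allocation of the sparse condensed linear scheme (see In [6]),
    # with the same data + indices memory proxy used for the 'mem' column
    return lin_max_n_assocs(n, biggest_cluster) * (1 + 4)

for n in (1e4, 1e5, 1e6):
    fixed_kmin = 100.0   # roughly constant kmin -> memory grows ~quadratically with n
    sk_kmin = n / 300.0  # kmin proportional to n (sk=300) -> memory grows linearly with n
    print 'n = {:.0e}   fixed kmin: {:.2e}   sk=300: {:.2e}'.format(n, est_mem(n, fixed_kmin), est_mem(n, sk_kmin))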
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig11 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig11
fig.suptitle('Number of associations per sample', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.assocs_per_samples[idx], trace, label=rule)
#ax.plot([250000,250000],[0,10],'-.k')
#ax.plot([500000,500000],[0,10],'-.k')
#ax.plot([0,10e7],[0.1,0.1],'-.k')
#ax.plot([0,10e7],[1,1],'-.k')
ax.set_xlabel("Data set size [# samples]")
ax.set_ylabel('No. associations per sample')
ax.legend(loc='lower right', title="Rule") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
ax.set_yscale("log")
In [ ]:
# for each cardinality, check whether every non-null CI accuracy (in memory and
# from disk) is identical across all rules and matrix types
same_accuracy = {n_samples: True for n_samples in res.n_samples.unique()}
n_samples_accuracy = {n_samples: 0 for n_samples in res.n_samples.unique()}
for n_samples, grp in rule_n_mean.groupby(by='n_samples'):
#print n_samples,grp[['accuracy_CI','accuracy_CI_disk']]
#print grp.accuracy_CI
#print grp.accuracy_CI == grp.accuracy_CI[0]
first = True
first_score = 0
for score in np.nditer(grp[['accuracy_CI','accuracy_CI_disk']].values):
if pd.notnull(score):
if first:
first_score = score
n_samples_accuracy[n_samples] = score
first = False
if score != first_score:
same_accuracy[n_samples] = False
break
In [ ]:
print 'Are all accuracies the same for each cardinality?'
print '--------------------------------------------------'
print '# samples\tSame\tScore'
for n_samples, same in sorted(same_accuracy.iteritems(), key=lambda x: x[0]):
    print '{}\t\t{}\t{}'.format(int(n_samples), same, n_samples_accuracy[n_samples])
Accuracies are the same throughout the whole spectrum of cardinality, with the exception of the first 3 sets. The reason is that the 'sk=300' rule yields a very low $K_{min}$ at low cardinality; in these cases the number of clusters in the ensemble is lower than the true number of clusters, which is undesirable in EAC.
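The mean $K_{min}$ actually used by each rule on the smallest data sets can be read back from the results to confirm this; the 2000-sample threshold below is only meant to cover the first few cardinalities.
In [ ]:
# mean kmin per rule for the smallest data sets; 'sk=300' collapses to a very small
# kmin here, which may fall below the true number of clusters
small = rule_n_mean[rule_n_mean.n_samples <= 2000]
print small.groupby(['rule', 'n_samples']).kmin.mean()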
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig12 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
fig = fig12
fig.suptitle('Accuracy (CI) evolution with number of samples', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
idx = (res.n_samples >= 1e3) & (res.type_mat == 'sparse condensed linear') & (res.rule == 'sk=300,th=30%')
ax.plot(res.n_samples[idx], res.accuracy_CI_disk[idx], '-', label='sk=300,th=30%')
#ax.plot([0,0,2500000,2500000],[0,1.1,0,1.1],'.k')
#ax.plot([500000,500000],[0,1],'-.k')
ax.set_xlabel("# samples")
ax.set_ylabel('Accuracy (CI)')
ax.legend(loc="best") #loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
#ax.set_yscale("log")
t_axis = plt.axis()
t_axis = (t_axis[0], t_axis[1], t_axis[2], t_axis[3] + 0.01)
plt.axis(t_axis)
In [ ]:
par_list = ['sparse condensed linear']
rows = 1
cols = 1
fig13 = plt.figure(figsize=(fig_width*2 * cols, fig_height*2 * rows))
fig = fig13
fig.suptitle('Number of associations vs. pre-allocated maximum (sparse condensed linear)', size=16)
sns.set_palette(sns.color_palette("deep", 6))
ax_par_dict = {key: fig.add_subplot(rows, cols, i+1) for i, key in enumerate(par_list)}
ax = ax_par_dict['sparse condensed linear']
sns.set_palette(sns.color_palette("dark", 10))
ax_s = ax.twinx()
for rule, grp in rule_n_mean.groupby(by='rule'):
idx = grp.n_samples >= 1e3 # ignore datasets smaller than 1000
idx2 = grp.type_mat == 'sparse condensed linear'
idx = np.logical_and(idx, idx2)
trace = '-'
ax.plot(grp.n_samples[idx], grp.n_assocs[idx], trace, label=rule)
ax_s.plot(grp.n_samples[idx], grp.max_n_assocs[idx], '-.', label=rule)
ax.set_xlabel("# samples", fontsize=16)
ax.set_ylabel('# assocs.', fontsize=16)
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True, which="both")
ax.legend(loc=(1.1, 0.0), title='Number of associations',fontsize='x-large') #loc=(1.1, 0.0))
#ax_s.set_ylabel('$K_{min}$')
ax_s.set_xscale("log")
ax_s.set_yscale("log")
ax_s.legend(loc=(1.1, 0.8), title='Maximum number of assocs.')
In [ ]:
# ratio between the maximum row degree and the biggest cluster size, per rule
rows = 1
cols = 1
fig14 = plt.figure(figsize=(fig_width * cols, fig_height * rows))
sns.set_palette(sns.color_palette("deep", 6))
ax = fig14.add_subplot(rows, cols, 1)
for key,grp in rule_n_mean.groupby(by="rule"):
idx = (grp.n_samples >= 1e3) & (grp.type_mat == 'sparse condensed linear')
ax.plot(grp.n_samples[idx], grp.max_degree[idx] / grp.biggest_cluster[idx], label=key)
ax.set_xlabel("# samples")
ax.set_ylabel("max # assocs / biggest cluster size")
ax.set_title("Relationship between max. num. assocs and biggest cluster size per rule")
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")
In [ ]:
n_array = res.n_samples.unique()
# NOTE: rule_x is assumed to be one of the kmin rule functions defined elsewhere;
# it is not defined in this notebook
x_kmin = map(rule_x, n_array)
In [ ]:
500 ** 2  # entries in a full 500 x 500 co-association matrix
In [ ]:
500 * 499 / 2  # entries in the condensed form (upper triangle, no diagonal)
In [ ]:
# number of associations actually stored for the 500-sample runs, per matrix type
res[res.n_samples == 500][['n_samples', 'type_mat', 'n_assocs', 'round']]