In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
sns.set_style("whitegrid")

In [4]:
from MyML.helper.plotting import save_fig

In [5]:
folder = "/home/chiroptera/QCThesis/experiments/study kmin/"
filename = "results_kmin_100k.csv"

In [6]:
res = pd.read_csv(folder + filename)
for col in res.columns:
    print(col)


Unnamed: 0
n_samples
rule
kmin
kmax
t_ensemble
biggest_cluster
type_mat
t_build
n_assocs
max_assoc
t_sl
accuracy
round

In [7]:
# drop incomplete rows (no n_samples recorded)
res = res[res.n_samples.notnull()]
# max_assoc is stored as a stringified tuple; keep only its first element as an int
res['max_assoc'] = res.max_assoc.apply(lambda s: int(s.strip('()').split(',')[0]))
# fraction of the full n x n co-association matrix that is actually stored
res['sparsity'] = res.n_assocs * 1.0 / (res.n_samples ** 2)
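
The max_assoc column comes out of the CSV as a stringified tuple, which is why it is stripped and split above. A minimal illustration of the parsing, assuming the stored value looks like "(count, index)" (the exact tuple contents are an assumption):

# hypothetical raw cell value; only the first element (the count) is kept
raw = "(102, 37)"
max_assoc_value = int(raw.strip('()').split(',')[0])   # -> 102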

In [8]:
# average the metrics over rounds for each (rule, n_samples) pair
by_rule_n = res.groupby(by=["rule","n_samples"])
rule_n_mean = by_rule_n.mean()
rule_n_mean = rule_n_mean[['kmin','kmax','t_ensemble','biggest_cluster','t_build','max_assoc','sparsity']]
rule_n_mean = rule_n_mean.reset_index()

In [9]:
rule_n_mean


Out[9]:
rule n_samples kmin kmax t_ensemble biggest_cluster t_build max_assoc sparsity
0 2sqrt 500 24 46 0.449143 103.0 0.003593 102.0 0.175640
1 2sqrt 1000 32 64 0.614484 163.0 0.007613 179.0 0.140068
2 2sqrt 5000 72 142 1.653651 244.2 0.193588 553.0 0.064680
3 2sqrt 10000 100 200 3.284222 357.6 0.679371 825.0 0.048116
4 2sqrt 25000 160 318 9.912761 514.6 2.944074 1290.4 0.029989
5 2sqrt 50000 224 448 25.169035 631.6 8.518460 1645.0 0.020584
6 2sqrt 100000 318 634 65.957036 885.8 25.149985 2302.6 0.014636
7 sk=300,th=30% 500 2 3 0.256407 333.0 0.015492 499.0 0.906096
8 sk=300,th=30% 1000 4 5 0.333140 654.0 0.036897 722.2 0.569142
9 sk=300,th=30% 5000 17 22 0.601839 1353.6 0.685861 1352.6 0.211443
10 sk=300,th=30% 10000 34 44 1.275257 1695.6 1.757256 1708.0 0.156986
11 sk=300,th=30% 25000 84 109 4.562427 1136.4 6.138902 2757.2 0.065943
12 sk=300,th=30% 50000 167 217 15.033801 1053.8 14.082020 2673.4 0.033254
13 sk=300,th=30% 100000 334 434 53.708112 946.0 30.026679 2467.4 0.016464
14 sk=sqrt/2,th=30% 500 46 60 0.554201 73.2 0.004065 85.4 0.116782
15 sk=sqrt/2,th=30% 1000 67 87 0.795285 60.4 0.007561 133.4 0.080113
16 sk=sqrt/2,th=30% 5000 143 186 2.325367 123.2 0.126228 327.4 0.038199
17 sk=sqrt/2,th=30% 10000 200 260 4.727282 185.0 0.458040 432.0 0.027395
18 sk=sqrt/2,th=30% 25000 317 412 14.629016 246.0 1.921412 657.2 0.017175
19 sk=sqrt/2,th=30% 50000 451 586 37.425763 358.8 5.502441 917.2 0.012107
20 sk=sqrt/2,th=30% 100000 633 823 101.051155 486.4 16.198583 1288.2 0.008585
21 sqrt 500 12 23 0.365377 153.8 0.020901 167.2 0.268864
22 sqrt 1000 16 32 0.461489 244.2 0.010415 243.2 0.199118
23 sqrt 5000 36 71 1.009844 762.0 0.344361 869.2 0.126275
24 sqrt 10000 50 100 1.929875 836.4 1.146886 1643.8 0.094650
25 sqrt 25000 80 159 5.414094 1369.2 5.308476 2668.4 0.059584
26 sqrt 50000 112 224 13.590840 1474.0 15.921081 3479.8 0.041310
27 sqrt 100000 159 317 34.770353 2030.4 48.424562 4843.4 0.029708
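
As a quick sanity check on the sparsity column (defined above as n_assocs / n_samples**2), the implied average number of stored associations can be recovered from this table; e.g. for the 2sqrt rule at 100000 samples, 0.014636 * 1e10 is roughly 1.5e8 associations. A short sketch, assuming rule_n_mean is the frame computed above:

# implied number of associations per rule at n = 100000
at_100k = rule_n_mean[rule_n_mean.n_samples == 100000]
print(at_100k.sparsity * at_100k.n_samples ** 2)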

In [46]:
fig1 = plt.figure(figsize=(16,6))
sns.set_palette(sns.color_palette("deep", 6))

# left panel: all rules, n >= 1e3
ax = fig1.add_subplot(121)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.sparsity[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("sparsity")
ax.set_title("Sparsity evolution with several rules (n >= 10^3)")
ax.legend(loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")

    
# right panel: same curves, zoomed in on n >= 1e4
ax = fig1.add_subplot(122)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e4
    ax.plot(grp.n_samples[idx], grp.sparsity[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("sparsity")
ax.set_title("Sparsity evolution with several rules (n >= 10^4)")
ax.legend(loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")



In [47]:
fig2 = plt.figure(figsize=(16,6))
sns.set_palette(sns.color_palette("deep", 6))

ax = fig2.add_subplot(121)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.max_assoc[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("max. # assocs.")
ax.set_title("Max. # assocs. evolution with several rules")
ax.legend(loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")



In [48]:
fig3 = plt.figure(figsize=(16,6))
sns.set_palette(sns.color_palette("deep", 6))

ax = fig3.add_subplot(121)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.t_ensemble[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("time to generate ensemble [s]")
ax.set_title("Time to generate ensembles")
ax.legend(loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")
#ax.set_yscale("log")
    
ax = fig3.add_subplot(122)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.t_build[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("time to build matrix [s]")
ax.set_title("Time to build matrices")
ax.legend(loc="best")
ax.grid(True, which="both")
ax.set_xscale("log")



In [41]:
# sparsity (left axis) and samples per cluster (right axis) on twin y axes
fig4 = plt.figure(figsize=(16,6))

sns.set_palette(sns.color_palette("dark", 10))
ax = fig4.add_subplot(121)
sns.set_palette(sns.color_palette("deep", 10))
ax_s = ax.twinx()
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.sparsity[idx], label=key)
    ax_s.plot(grp.n_samples[idx], grp.n_samples[idx] / grp.kmin[idx], label=key)

ax_s.set_ylabel("# samples / Kmin")
#ax_s.legend(loc=(1.1, 0.8), title="max_assocs")


ax.set_xlabel("# samples")
ax.set_ylabel("sparsity")
ax.set_title("Sparsity and samples per cluster evolution")
#ax.legend(loc=(1.1, 0.0), title="sparsity")
ax.grid(True, which="both")
ax.set_xscale("log")

sns.set_palette(sns.color_palette("dark", 10))
ax = fig4.add_subplot(122)
sns.set_palette(sns.color_palette("deep", 10))
ax_s = ax.twinx()
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e4
    ax.plot(grp.n_samples[idx], grp.sparsity[idx], label=key)    
    ax_s.plot(grp.n_samples[idx], grp.n_samples[idx] / grp.kmin[idx], label=key)

ax_s.set_ylabel("# samples / Kmin")    
ax_s.legend(loc=(1.1, 0.8), title="samples / kmin")

ax.set_xlabel("# samples")
ax.set_ylabel("sparsity")
ax.set_title("Sparsity and samples per cluster evolution")
ax.legend(loc=(1.1, 0.0), title="sparsity")
ax.grid(True, which="both")
ax.set_xscale("log")



In [42]:
# interleave the dark and deep palettes: dark[0], deep[0], dark[1], deep[1], ...
dark_deep_palette = list()
for (col1,col2) in zip(sns.color_palette("dark", 6), sns.color_palette("deep", 6)):
    dark_deep_palette.append(col1)
    dark_deep_palette.append(col2)

sns.palplot(dark_deep_palette)
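
The interleaving puts dark[i] right before deep[i], so that in the next figure the two lines drawn for each rule (max. assocs, then biggest cluster) take adjacent, related colors from the cycle. An equivalent construction with itertools, shown only as an alternative:

from itertools import chain
dark_deep_palette = list(chain.from_iterable(zip(sns.color_palette("dark", 6), sns.color_palette("deep", 6))))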



In [43]:
# max. # assocs and biggest cluster size per rule, drawn on the same axis with paired dark/deep colors
fig5 = plt.figure(figsize=(18,6))

sns.set_palette(dark_deep_palette, n_colors=len(dark_deep_palette))
ax = fig5.add_subplot(121)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.max_assoc[idx], label=key + " max assoc")
    ax.plot(grp.n_samples[idx], grp.biggest_cluster[idx], label=key + " clust")

ax_s.set_ylabel("biggest cluster")    
ax_s.legend(loc=(1.1, 0.8), title="biggest cluster")

ax.set_xlabel("# samples")
ax.set_ylabel("max # assocs / biggest cluster size")
ax.set_title("Max. num. assocs and biggest cluster size evolution per rule")
ax.legend(loc="upper left")#(1.1, 0.0), title="Legend")
ax.grid(True, which="both")
ax.set_xscale("log")



In [44]:
# ratio of max. # assocs to biggest cluster size per rule
fig6 = plt.figure(figsize=(18,6))

sns.set_palette(sns.color_palette("deep", 6))
ax = fig6.add_subplot(121)
for key,grp in rule_n_mean.groupby(by="rule"):
    idx = grp.n_samples >= 1e3
    ax.plot(grp.n_samples[idx], grp.max_assoc[idx] / grp.biggest_cluster[idx], label=key)

ax.set_xlabel("# samples")
ax.set_ylabel("max # assocs / biggest cluster size")
ax.set_title("Relationship between max. num. assocs and biggest cluster size per rule")
ax.legend(loc="lower right")#loc=(1.1, 0.0))
ax.grid(True, which="both")
ax.set_xscale("log")



In [49]:
save_fig(fig1,folder + "sparsity")
save_fig(fig2,folder + "max_assocs")
save_fig(fig3,folder + "times")
save_fig(fig4,folder + "sparsity_kmin")
save_fig(fig5,folder + "max_assoc_cluster_size")
save_fig(fig6,folder + "max_assoc_div_cluster_size")
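
save_fig comes from the local MyML package. If that helper is not available, a minimal stand-in under the assumption that it simply writes the figure to disk (format and DPI here are guesses, not MyML's actual defaults):

# hypothetical fallback with the same call signature
def save_fig(fig, path, fmt="png", dpi=200):
    fig.savefig(path + "." + fmt, dpi=dpi, bbox_inches="tight")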