This notebook generates every plot and numerical result mentioned or alluded to in Section 4 of "Algorithms for the Pagination Problem, a Bin Packing with Overlapping Items".
In [ ]:
from collections import OrderedDict
INPUT_PATH = "gauss/"
(MIN_PREFIX, MAX_PREFIX) = ("C015", "C055") # for instance filenames
OUTPUT_PATH = "plots/"
WINDOW = 150 # size of the subsets of instances used as a moving window
SOLVER_NAMES = OrderedDict([
("GeneticGroup", "Grouping GA"),
("GeneticStandard", "Standard GA"),
("OverloadAndRemove", "Overload-and-Remove"),
("OverloadAndRemovePresort", "Overload-and-Remove (with presort)"),
("BestFusion", "Best Fusion"),
("FirstFit", "First Fit"),
])
EXCLUDED_SOLVER_NAMES = {"OverloadAndRemovePresort"} # excluded from certain plots
solvers = ["solvers" + name for name in SOLVER_NAMES.keys()]
times = ["times" + name for name in SOLVER_NAMES.keys()]
In [ ]:
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import Locator
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
In [ ]:
!pip install seaborn
In [ ]:
import seaborn as sns
sns.set_style("white")
sns.set_context("paper", font_scale=2)
sns.set_palette(sns.color_palette("Set1", 5))
In [ ]:
def plot_linear_regression(x, y):
    # Fit a degree-1 (linear) least-squares polynomial and draw the regression line
    fit = np.polyfit(x, y, deg=1)
    plt.plot(x, fit[0] * x + fit[1])
    # Pearson correlation coefficient between the two series, rounded to 3 decimals
    correlation = round(x.corr(y), 3)
    print("Pearson:", correlation)
    return correlation
In [ ]:
!pip install pandas --upgrade
Create a DataFrame from all the JSON files whose name lies between MIN_PREFIX and MAX_PREFIX.
In [ ]:
import os, json
df = []
indexes = []
for filename in os.listdir(INPUT_PATH):
if not filename.endswith("json") or not MIN_PREFIX <= filename <= MAX_PREFIX:
continue
with open(os.path.join(INPUT_PATH, filename)) as f:
instances = json.loads(f.read())
indexes.extend([(filename, discriminant) for discriminant in range(len(instances))])
for instance in instances:
for (k, v) in list(instance.items()):
            if isinstance(v, dict): # flatten any sub-dict by key concatenation (e.g. "times" + "FirstFit")
for (sub_key, sub_value) in v.items():
instance[k + sub_key] = sub_value
del instance[k]
df.extend(instances)
df = pd.DataFrame(df, index=pd.MultiIndex.from_tuples(indexes, names=("filename", "i")))
df["best"] = df[["pageCount", "cplexOpt", "cplexUB"]].min(axis = 1) # add a column for the best known pagination size
df["cardinality"] = df["tiles"].apply(lambda tiles: sum(len(tile) for tile in tiles))
df_sorted_by_multiplicity = df.sort_values(by="avgMultiplicity") # for use with a moving window
print(df.info())
df.describe()
Out[ ]:
In [ ]:
print("There are a %s instances." % len(df))
Conjecture 1. The statistical difficulty of a given instance can be approximated by the difference between the average and the minimal number of pages in the paginations calculated by the various solvers.
Note that this measure of statistical difficulty is intrinsically correlated with the size of the best pagination:
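As a toy illustration of this measure (the page counts below are hypothetical, not taken from the dataset):

page_counts = [11, 12, 11, 13, 11, 14]  # hypothetical sizes returned by six solvers for one instance
difficulty = sum(page_counts) / len(page_counts) - min(page_counts)
print(difficulty)  # 1.0: on average, the solvers need one page more than the best of them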
In [ ]:
x = df[solvers].mean(axis=1) - df["best"]
y = df["best"]
plt.xlabel("Statistical difficulty")
plt.ylabel("Best pagination size")
plt.scatter(x, y, marker="o", s=1)
_ = plot_linear_regression(x, y)
The dispersion of the pagination sizes could have been measured in several other ways, for instance with the standard deviation (below).
In [ ]:
x = df["avgMultiplicity"]
y = df[solvers].std(axis=1)
axes = plt.gca()
axes.set_xlim([0, 70])
plot_linear_regression(x, y)
plt.scatter(x, y, marker="o", s=1)
plt.xlabel("Average multiplicity")
plt.ylabel("Average standard deviation")
plt.grid()
plt.show()
In [ ]:
result = df.groupby(round(2 * (df[solvers].mean(axis=1) - df["best"]))/2).size()
result.plot(kind="bar")
plt.yscale("symlog")
plt.xlabel("Statistical difficulty")
plt.ylabel("Number of instances (sym-log scale)")
plt.show()
print("Number of instances per statistical difficulty:\n", result)
print("Average statistical difficulty: %.02f" % (df[solvers].mean(axis=1) - df["best"]).mean())
print("Median statistical difficulty: %.02f" % (df[solvers].mean(axis=1) - df["best"]).median())
Conjecture 2. The statistical difficulty of a given random instance is strongly correlated with the density of its shared symbols, or average multiplicity.
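If the avgMultiplicity column were not precomputed in the JSON files, it could presumably be recovered from the tiles. The sketch below assumes that the multiplicity of a symbol is the number of tiles containing it, and that avgMultiplicity is the mean multiplicity over the distinct symbols of the instance (i.e., total symbol occurrences divided by distinct symbol count); this definition is an assumption made for illustration only.

def average_multiplicity(tiles):
    # Total number of symbol occurrences divided by the number of distinct symbols
    symbols = set(symbol for tile in tiles for symbol in tile)
    return sum(len(tile) for tile in tiles) / len(symbols)

print(average_multiplicity([[1, 2], [2, 3], [1, 2, 4]]))  # 7 occurrences / 4 symbols = 1.75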
In [ ]:
plt.figure(figsize=(10,5))
x = df["avgMultiplicity"]
y = df[solvers].mean(axis=1) - df["best"]
axes = plt.gca()
axes.set_xlim([0, 70])
axes.set_ylim([-1, 9.5])
plot_linear_regression(x, y)
plt.scatter(x, y, marker="o", s=1)
plt.xlabel("Average multiplicity")
plt.ylabel("Average range (statistical difficulty)")
plt.grid()
plt.savefig(os.path.join(OUTPUT_PATH, "difficulty_by_multiplicity.pdf"), bbox_inches='tight')
In [ ]:
plt.figure(figsize=(20, 10))
df["bitSize"] = df["symbolCount"] * df["tileCount"]
for (i, column) in enumerate(["symbolCount", "bitSize", "tileCount", "cardinality"], 1):
plt.subplot(2, 2, i)
x = df[column]
y = df[solvers].mean(axis=1) - df["best"]
if i in [1, 3]:
plt.ylabel("Average range (statistical difficulty)")
plt.scatter(x, y, marker="o", s=1)
correlation = plot_linear_regression(x, y)
plt.xlabel("%s (r = %s)" % (column, correlation))
plt.show()
In [ ]:
plt.figure(figsize=(10,6))
range_width = 2
ranges = np.arange(1, df["avgMultiplicity"].max() + range_width, range_width)
slices = pd.cut(df["avgMultiplicity"], ranges)
instances_per_slice = df.groupby(slices).size()
instances_per_slice.plot(kind="bar", width=0.9, color="#ffffbf")
cplex_instances = df[df["cplexOpt"].notnull() | df["cplexLB"].notnull() | df["cplexUB"].notnull()]
cplex_slices = pd.cut(cplex_instances["avgMultiplicity"], ranges)
cplex_instances.groupby(cplex_slices).size().plot(kind="bar", width=0.7, color='#abdda4')
cplex_solved_instances = df[df["cplexOpt"].notnull()]
cplex_solved_slices = pd.cut(cplex_solved_instances["avgMultiplicity"], ranges)
cplex_solved_instances.groupby(cplex_solved_slices).size().plot(kind="bar", width=0.5, color="#2b83ba")
plt.xlabel("Ranges of average multiplicity")
plt.ylabel("Number of instances (sym-log scale)")
plt.yscale('symlog')
axes = plt.gca()
axes.set_ylim(0, 3000)
plt.tick_params(axis='x', which='both', bottom=False, top=False)
axes.yaxis.grid(True)
plt.legend(["All instances", "Submitted to CPLEX", "Solved to optimality by CPLEX"])
plt.savefig(os.path.join(OUTPUT_PATH, "count_by_multiplicity.pdf"), bbox_inches='tight')
In [ ]:
range_width = 1
ranges = np.arange(1, df["avgMultiplicity"].max() + range_width, range_width)
slices = pd.cut(df["avgMultiplicity"], ranges)
instances_per_slice = df.groupby(slices).size()
for start in (4, 23, 53):
    n = instances_per_slice.iloc[range_width * (start - 1)]
print("There are %d instances whose average multiplicity lies between %s and %s." % (n, start, start + range_width))
In [ ]:
(a, b) = (1, 9)
rate = 100.0 * instances_per_slice.iloc[a-1:b-1].sum() / len(df)
print("%0.2f %% of the instances have an average multiplicity between %s and %s." % (rate, a, b))
In [ ]:
cplex_instances = df[df["cplexOpt"].notnull() | df["cplexLB"].notnull() | df["cplexUB"].notnull()]
print("%s instances (%.2f %%) submitted to CPLEX." % (len(cplex_instances), 100.0 * len(cplex_instances)/len(df)))
In [ ]:
print("CPLEX's success in less than one hour: %s instances (%.1f %%)." % (df["cplexOpt"].count(), 100.0 * df["cplexOpt"].count() / len(cplex_instances)))
In [ ]:
for above in (13, 20):
    cplex_instances_above = cplex_instances[cplex_instances["avgMultiplicity"] > above]
print("CPLEX's success in less than one hour above an average multiplicity of %s: %.1f %%." % (above, 100.0 * cplex_instances_above["cplexOpt"].count() / len(cplex_instances_above)))
In [ ]:
cplex_results = df[df["cplexOpt"].notnull() | df["cplexUB"].notnull()][["cplexOpt","cplexUB","pageCount"]]
print("All the %s instances for which CPLEX has found either a solution, either an upper bound:" % len(cplex_results))
cplex_results
Out[ ]:
In [ ]:
x = pd.Series.rolling(df_sorted_by_multiplicity["avgMultiplicity"], WINDOW, center=True).mean()
plt.figure(figsize=(10,5))
axes = plt.gca()
axes.set_xlim([2, 52])
for time in times:
solver_name = time[len("times"):]
if solver_name in EXCLUDED_SOLVER_NAMES:
continue
y = pd.Series.rolling(df_sorted_by_multiplicity[time], WINDOW, center=True).mean()
plt.plot(x, y, label=SOLVER_NAMES[solver_name])
plt.yscale('log')
plt.xlabel("Average multiplicity (rolling mean on %s instances)" % WINDOW)
plt.ylabel("Execution time (seconds, log scale)")
plt.grid()
plt.savefig(os.path.join(OUTPUT_PATH, "speed_by_multiplicity.pdf"), bbox_inches='tight')
plt.legend(loc=7) # legend not plotted for the paper version
plt.show()
In [ ]:
contents = [
df[times].min().map('{:,.2f}'.format),
df[times].max().map('{:,.2f}'.format),
df[times].mean().map('{:,.2f}'.format),
df[times].std().map('{:,.2f}'.format)
]
digest = pd.DataFrame(contents, index = ["min", "max", "mean", "std"])
digest.columns = list(SOLVER_NAMES.values())
print("Basic aggregations on execution times (in seconds):")
digest
Out[ ]:
The outcomes are plotted at $y = \frac{\mathrm{best~size}}{\mathrm{size}}$, with $y=1$ corresponding to the best known solution (which is either the optimal or the best feasible solution found by CPLEX, or the smallest approximation calculated for the given instance). For example, if the best known pagination of an instance has 8 pages and a given solver produces 10, that solver scores $y = 0.8$ on this instance.
In [ ]:
x = pd.Series.rolling(df_sorted_by_multiplicity["avgMultiplicity"], WINDOW, center=True).mean()
plt.figure(figsize=(10,7))
axes = plt.gca()
axes.set_xlim([2, 52])
axes.set_ylim([0.74, 1.01])
axes.spines['right'].set_visible(False)
axes.spines['top'].set_visible(False)
for solver in solvers:
solver_name = solver[len("solvers"):]
if solver_name in EXCLUDED_SOLVER_NAMES:
continue
ratio = df_sorted_by_multiplicity["best"] / df_sorted_by_multiplicity[solver]
y = pd.Series.rolling(ratio, WINDOW, center=True).mean()
plt.plot(x, y, label=SOLVER_NAMES[solver_name])
plt.xlabel("Average multiplicity (rolling mean on %s instances)" % WINDOW)
plt.ylabel("Average pagination size vs. best known result")
plt.grid()
# move the legend to an empty place
legend = plt.legend(loc=7)
plt.draw()
bb = legend.legendPatch.get_bbox().transformed(axes.transAxes.inverted())
bb.set_points([[bb.x0 - 0.02, bb.y0 + 0.2], [bb.x1 - 0.02, bb.y1 + 0.2]])
legend.set_bbox_to_anchor(bb)
plt.savefig(os.path.join(OUTPUT_PATH, "relative_size_by_multiplicity.pdf"), bbox_inches='tight')
The column pageCount gives the smallest pagination size found by the various heuristics:
In [ ]:
assert len(df[df["pageCount"] != df[solvers].min(axis=1)]) == 0
Hence, the optimal value found by CPLEX may be smaller than this one:
In [ ]:
suboptimal_instances_1 = df[df["cplexOpt"] < df["pageCount"]][["cplexOpt", "pageCount"] + solvers]
suboptimal_instances_1.columns = ["cplexOpt", "pageCount"] + list(SOLVER_NAMES.values())
print("The optimal solution is better than the best approximation for these %s instances:" % len(suboptimal_instances_1))
suboptimal_instances_1
Out[ ]:
It may happen that the upper bound found by CPLEX is less than the best page count found by the heuristics. In this case, we know that there exists a better pagination (although CPLEX cannot prove its optimality):
In [ ]:
suboptimal_instances_2 = df[df["cplexUB"] < df["pageCount"]][["cplexUB", "pageCount"] + solvers]
suboptimal_instances_2.columns = ["cplexUB", "pageCount"] + list(SOLVER_NAMES.values())
print("For %s more instances, we know that the best approximation is not optimal:" % len(suboptimal_instances_2))
suboptimal_instances_2
Out[ ]:
The column best gives the minimum pagination size found by the heuristics and CPLEX (including its upper bound):
In [ ]:
df[df["best"] < df["pageCount"]][["best", "pageCount"]]
Out[ ]:
In [ ]:
count = len(suboptimal_instances_1) + len(suboptimal_instances_2)
print("All in all, ILP improved on the heuristics in %s cases" % count, end=" ")
print("(%.02f %% of the %s selected instances)." % (100.0 * count / len(cplex_instances), len(cplex_instances)))
In [ ]:
prefix = ["avgMultiplicity", "pageCount"]
columns = [
"solversGeneticGroup",
"solversGeneticStandard",
"solversOverloadAndRemove",
"solversOverloadAndRemovePresort"
]
bad_gga = df[df["pageCount"] < df["solversGeneticGroup"]][prefix + columns]
for column in columns[1:]:
bad_gga[column] = bad_gga[column][bad_gga[column] < bad_gga["solversGeneticGroup"]]
bad_gga.columns = prefix + [SOLVER_NAMES[column[len("solvers"):]] for column in columns]
print("In %.02f %% of the cases," % (100.0 - 100.0 * len(bad_gga) / len(df)),)
print("Grouping GA was the best heuristics, except on these %s instances" % len(bad_gga), end=" ")
print("(greater values erased for clarity, sorted by increasing average multiplicity).")
bad_gga.sort_values(by="avgMultiplicity").fillna("")
Out[ ]:
In [ ]:
for column in bad_gga.columns[len(prefix) + 1:]:
count = bad_gga[column].count()
print("%s produced a better pagination than Grouping GA on %s instances (%.03f %%)." % (column, count, (100.0 * count / len(df))))