Investigating a Potential Correlation Between the Scalar Complexity of a Matrix and Its Compressed Size


In [1]:
import numpy

def pattern_complexity(pattern):
    """Return the reciprocal of the mean pairwise inner product of the columns."""
    scalar_complexity = 0.0
    nodes_in = pattern.shape[1]
    for i in xrange(nodes_in - 1):
        for j in xrange(i + 1, nodes_in):
            scalar_complexity += numpy.inner(pattern[:, i], pattern[:, j])
    # normalise by the number of vector products, i.e.,
    # nodes_in * (nodes_in - 1) / 2
    norm = float(nodes_in * (nodes_in - 1)) / 2.0
    scalar_complexity /= norm
    return numpy.reciprocal(scalar_complexity)
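
For cross-checking, the same quantity can be computed without the explicit double loop: the pairwise column products are exactly the strict upper triangle of the Gram matrix of the columns. The following vectorised variant is my own sketch (the name pattern_complexity_vectorised is not part of the original notebook).

def pattern_complexity_vectorised(pattern):
    # Gram matrix of the columns; entry (i, j) is inner(pattern[:, i], pattern[:, j]).
    gram = numpy.dot(pattern.T, pattern)
    nodes_in = pattern.shape[1]
    # sum each unordered pair of distinct columns exactly once
    pair_sum = numpy.triu(gram, k=1).sum()
    norm = float(nodes_in * (nodes_in - 1)) / 2.0
    return norm / pair_sum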

In [2]:
def generate_pattern(nodes_in=8, nodes_out=8, activated=4):
    """Generate a (nodes_out, nodes_in) matrix in which each column has
    `activated` randomly placed positive entries that sum to one."""
    assert activated <= nodes_out
    rand_int = numpy.random.randint
    rand_arr = numpy.random.random_sample
    pattern = numpy.zeros((nodes_out, nodes_in))
    for j in xrange(nodes_in):
        # draw `activated` distinct row indices for this column
        rows = set()
        while len(rows) < activated:
            rows.add(rand_int(nodes_out))
        # assign random weights normalised to sum to one
        values = rand_arr(activated)
        values /= values.sum()
        for (pos, i) in enumerate(rows):
            pattern[i, j] = values[pos]
    return pattern
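
A quick sanity check (my addition, not an original cell): every column should contain exactly `activated` non-zero entries, and those entries should sum to one.

p = generate_pattern(nodes_in=8, nodes_out=8, activated=4)
assert p.shape == (8, 8)
# each column has `activated` non-zero entries ...
assert numpy.all((p > 0.0).sum(axis=0) == 4)
# ... which are normalised to sum to one
assert numpy.allclose(p.sum(axis=0), 1.0)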

In [3]:
import zlib

In [4]:
import bz2

In [5]:
import lzma

In [6]:
import blosc

In [7]:
%timeit generate_pattern()


10000 loops, best of 3: 121 us per loop

In [8]:
pattern = generate_pattern()
strng = pattern.tostring()
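
As a side note (sketch, not an original cell): `tostring` returns the raw float64 buffer in C order, so every 8x8 pattern occupies 8 * 8 * 8 = 512 bytes before compression, and the matrix can be recovered from the byte string without loss.

assert len(strng) == pattern.size * pattern.dtype.itemsize  # 64 entries * 8 bytes = 512
restored = numpy.fromstring(strng, dtype=pattern.dtype).reshape(pattern.shape)
assert numpy.array_equal(restored, pattern)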

In [19]:
import matplotlib.pyplot as plt

plt.matshow(pattern, cmap=plt.cm.gray_r, vmin=0.0, vmax=1.0)
plt.colorbar()
plt.show()



In [9]:
%timeit zlib.compress(strng, 9)


10000 loops, best of 3: 67.2 us per loop

In [10]:
%timeit bz2.compress(strng, 9)


1000 loops, best of 3: 324 us per loop

In [11]:
%timeit lzma.compress(strng, options={"level": 9})


10 loops, best of 3: 72.4 ms per loop

In [12]:
%timeit blosc.compress(strng, typesize=pattern.dtype.itemsize)


10000 loops, best of 3: 24.5 us per loop
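
Since all four codecs are lossless, a compress/decompress round trip should return the identical byte string. A minimal check (my addition; `lzma` here is the Python 2 pyliblzma binding, matching the `options` keyword used above):

assert zlib.decompress(zlib.compress(strng, 9)) == strng
assert bz2.decompress(bz2.compress(strng, 9)) == strng
assert lzma.decompress(lzma.compress(strng, options={"level": 9})) == strng
assert blosc.decompress(blosc.compress(strng, typesize=pattern.dtype.itemsize)) == strng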

In [13]:
number = int(1E4)

In [14]:
complexity = numpy.zeros(number, dtype=float)
zlib_res = numpy.zeros(number, dtype=int)
bz2_res = numpy.zeros(number, dtype=int)
lzma_res = numpy.zeros(number, dtype=int)
blosc_res = numpy.zeros(number, dtype=int)

In [15]:
for i in xrange(number):
    pattern = generate_pattern()
    complexity[i] = pattern_complexity(pattern)
    strng = pattern.tostring()
    zlib_res[i] = len(zlib.compress(strng, 9))
    bz2_res[i] = len(bz2.compress(strng, 9))
    lzma_res[i] = len(lzma.compress(strng, options={"level": 9}))
    blosc_res[i] = len(blosc.compress(strng, typesize=pattern.dtype.itemsize))
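
To put a number on the relationship before plotting it, one option is a rank correlation between the scalar complexity and each compressed size. This is a sketch only; scipy is an extra dependency that the original notebook does not import.

from scipy.stats import spearmanr

for (name, res) in [("zlib", zlib_res), ("bz2", bz2_res),
                    ("lzma", lzma_res), ("blosc", blosc_res)]:
    rho, pval = spearmanr(complexity, res)
    print("%s: Spearman rho = %.3f (p = %.3g)" % (name, rho, pval))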

In [16]:
# colour-blind friendly palette (Okabe & Ito)
cpalette = ["#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]

In [17]:
plt.scatter(complexity, zlib_res, label="zlib", c=cpalette[0], marker="^")
plt.scatter(complexity, bz2_res, label="bz2", c=cpalette[1], marker="v")
plt.scatter(complexity, lzma_res, label="lzma", c=cpalette[2], marker="p")
plt.scatter(complexity, blosc_res, label="blosc", c=cpalette[3], marker="s")
plt.xlabel("Scalar Complexity")
plt.ylabel("Length of Compressed Byte-String")
plt.axhline(len(strng), c="red", label="uncompressed")
plt.legend()
plt.show()