In [ ]:
from IPython.core.display import HTML, display
display(HTML('<style>.container { width:100%; !important } </style>'))
In [ ]:
import graphviz as gv
This notebook presents coding trees. Given an alphabet $\Sigma$ of characters, we define the set $\mathcal{K}$ of coding trees by induction:
$\texttt{Leaf}(c,f) \in \mathcal{K} $ if $c \in \Sigma$ and $f \in \mathbb{N}$
An expression of the form $\texttt{Leaf}(c,f)$ represent a leaf in a coding tree. Here $c$ is a letter from the alphabet $\Sigma$ and $f$ is the number of times that the letter $c$ occurs in the string $s$ that is to be encoded.
$\texttt{Node}(l,r) \in \mathcal{K}$ if $l \in\mathcal{K}$ and $r \in \mathcal{K}$
The expressions $\texttt{Node}(l,r)$ represent the inner nodes of the coding-tree.
The class CodingTree
is a superclass for constructing coding trees. It has one static variable sNodeCount
. This variable is used to equip all nodes with a unique identifier. This identifier is used to draw the trees using graphviz
.
Every object of class CodingTree
has a uniques id mID
that is stored as a member variable. This id is only used by graphviz
.
In [ ]:
class CodingTree:
sNodeCount = 0
def __init__(self):
CodingTree.sNodeCount += 1
self.mID = CodingTree.sNodeCount
def count(self):
"compute the number of characters"
pass
def cost(self):
"compute the number of bits used by this coding tree"
pass
def getID(self):
return self.mID # used only by graphviz
The function make_string
is a helper function that is used to simplify the implementation of __str__
.
self
is the object that is to be rendered as a stringattributes
is a list of those member variables that are used to produce the string
In [ ]:
def _make_string(self, Attributes):
# map the function __str__ to all attributes and join them with a comma
name = self.__class__.__name__
return f"{name}({', '.join(map(str, [getattr(self, at) for at in Attributes]))})"
CodingTree._make_string = _make_string
The method $t.\texttt{toDot}()$ takes a binary trie $t$ and returns a graph that depicts the tree $t$.
In [ ]:
def toDot(self):
dot = gv.Digraph(node_attr={'shape': 'record', 'style': 'rounded'})
nodeDict = {}
self._collectIDs(nodeDict)
for n, t in nodeDict.items():
if isinstance(t, Leaf):
dot.node(str(n), label='{' + str(t.mCharacter) + '|' + str(t.mFrequency) + '}')
elif isinstance(t, Node):
dot.node(str(n), label=str(t.count()))
else:
assert False, f'Unknown node {t}'
for n, t in nodeDict.items():
if isinstance(t, Node):
dot.edge(str(n), str(t.mLeft .getID()), label='0')
dot.edge(str(n), str(t.mRight.getID()), label='1')
return dot
CodingTree.toDot = toDot
The method $t.\texttt{collectIDs}(d)$ takes a coding tree $t$ and a dictionary $d$ and updates the dictionary so that the following holds: $$ d[\texttt{id}] = n \quad \mbox{for every node $n$ in $t$.} $$ Here, $\texttt{id}$ is the unique identifier of the node $n$, i.e. $d$ associates the identifiers with the corresponding nodes.
In [ ]:
def _collectIDs(self, nodeDict):
nodeDict[self.getID()] = self
if isinstance(self, Node):
self.mLeft ._collectIDs(nodeDict)
self.mRight._collectIDs(nodeDict)
CodingTree._collectIDs = _collectIDs
The class Leaf
represents a leaf of the form $\texttt{Leaf}(c, f)$. It maintains two member variables.
mCharacter
.mFrequancy
.
In [ ]:
class Leaf(CodingTree):
def __init__(self, c, f):
CodingTree.__init__(self)
self.mCharacter = c
self.mFrequency = f
def count(self):
return self.mFrequency
def cost(self):
return 0
def __str__(self):
return _make_string(self, ['mCharacter', 'mFrequency'])
def __lt__(self, other):
if isinstance(other, Node):
return True
return self.mCharacter < other.mCharacter
The class Node
represents an inner node of the form $\texttt{Node}(l, r)$. It maintains two member variables:
self.mLeft
is the left subtree $l$,self.mRight
is the right subtree $r$.
In [ ]:
class Node(CodingTree):
def __init__(self, l, r):
CodingTree.__init__(self)
self.mLeft = l
self.mRight = r
def count(self):
return self.mLeft.count() + self.mRight.count()
def cost(self):
return self.mLeft.cost() + self.mRight.cost() + self.count()
def __str__(self):
return _make_string(self, ['mLeft', 'mRight'])
def __lt__(self, other):
if isinstance(other, Leaf):
return False
return self.mLeft < other.mLeft
The module heapq
provides priority queues. The api is given at https://docs.python.org/3/library/heapq.html.
This module represents heaps as arrays.
In [ ]:
import heapq
In [ ]:
H = []
heapq.heappush(H, 7)
heapq.heappush(H, 1)
heapq.heappush(H, 0)
heapq.heappush(H, 6)
H
In [ ]:
a = heapq.heappop(H)
print('a = ', a)
H
The function $\texttt{coding_tree}$ implements Huffman's algorithm for data compression. The input $M$ is a set of pairs of the form $$ \bigl\{ (c_1, f_1), \cdots, (c_k, f_k)\bigr\} $$ where $c_i$ is a character and $f_i$ is the number of times this character occurs in the string $s$ that is to be encoded. Huffman's algorithm is greedy: It always combines those coding trees that have the least character count so far as this results in the smallest cost increase.
The heap H
that is maintained by this function is a priority queue which is represented by an array that is structured as a heap. The items in this priority queue are pairs of the form
$$ \bigl( t.\texttt{count}(), t \bigr) $$
where $t$ is a coding tree and $t.\texttt{count}()$ is the count of this coding tree.
In [ ]:
def coding_tree(M):
H = [] # empty priority queue
for c, f in M:
heapq.heappush(H, (f, Leaf(c, f)))
while len(H) > 1:
a = heapq.heappop(H)
b = heapq.heappop(H)
heapq.heappush(H, (a[0] + b[0], Node(a[1], b[1])))
return H[0][1]
Let us test this with a trivial example.
In [ ]:
import math
The function $\texttt{log_2}(n)$ computes $\log_2(n)$.
In [ ]:
def log_2(n):
return math.log(n) / math.log(2)
In [ ]:
log_2(4)
In [ ]:
def demo(M):
K = coding_tree(M)
display(K.toDot())
n = math.ceil(log_2(len(M)))
cost_huffman = K.cost()
cost_constant = n * K.count()
savings = (cost_constant - cost_huffman) / cost_constant
print(f'cost of encoding with Huffman coding tree : {cost_huffman} bits')
print(f'cost of encoding with {n} bits : {cost_constant} bits')
print(f'savings: {100 * savings}%')
return savings
In [ ]:
demo({ ('a', 990), ('b', 8), ('c', 1), ('d', 1) })
In [ ]:
demo({ ('a', 4), ('b', 9), ('c', 16), ('d', 25), ('e', 36), ('f', 49), ('g', 64) })
In [ ]:
demo({ ('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5), ('f', 6), ('g', 7), ('h', 8), ('i', 9), ('j', 10) })
In [ ]:
demo({ ('a', 1), ('b', 1), ('c', 2), ('d', 3), ('e', 5), ('f', 8), ('g', 13) })
The function $\texttt{demo_file}(\texttt{fn})$ reads the file fn
and calculates the frequency of all characters occurring in fn
. Using these frequencies it computes the the Huffman coding tree.
In [ ]:
def demo_file(fn):
with open(fn, 'r') as file:
s = file.read() # read file as string s
Frequencies = {}
for c in s:
f = Frequencies.get(c, None)
if f != None:
Frequencies[c] += 1
else:
Frequencies[c] = 1
M = { (c, f) for (c, f) in Frequencies.items() }
print(M)
return demo(M)
In [ ]:
demo_file('alice.txt')
In [ ]:
demo_file('moby-dick.txt')
In [ ]: