In [ ]:
from IPython.core.display import HTML, display
display(HTML('<style>.container { width:100%; !important } </style>'))

Huffman's Algorithm for Lossless Data Compression


In [ ]:
import graphviz as gv

This notebook presents coding trees. Given an alphabet $\Sigma$ of characters, we define the set $\mathcal{K}$ of coding trees by induction:

  • $\texttt{Leaf}(c,f) \in \mathcal{K} $ if $c \in \Sigma$ and $f \in \mathbb{N}$

    An expression of the form $\texttt{Leaf}(c,f)$ represent a leaf in a coding tree. Here $c$ is a letter from the alphabet $\Sigma$ and $f$ is the number of times that the letter $c$ occurs in the string $s$ that is to be encoded.

  • $\texttt{Node}(l,r) \in \mathcal{K}$ if $l \in\mathcal{K}$ and $r \in \mathcal{K}$

    The expressions $\texttt{Node}(l,r)$ represent the inner nodes of the coding-tree.

The class CodingTree is a superclass for constructing coding trees. It has one static variable sNodeCount. This variable is used to equip all nodes with a unique identifier. This identifier is used to draw the trees using graphviz.

Every object of class CodingTree has a uniques id mID that is stored as a member variable. This id is only used by graphviz.


In [ ]:
class CodingTree:
    sNodeCount = 0
    
    def __init__(self):
        CodingTree.sNodeCount += 1
        self.mID = CodingTree.sNodeCount
        
    def count(self):
        "compute the number of characters"
        pass
        
    def cost(self):
        "compute the number of bits used by this coding tree"
        pass
        
    def getID(self):
        return self.mID  # used only by graphviz

The function make_string is a helper function that is used to simplify the implementation of __str__.

  • self is the object that is to be rendered as a string
  • attributes is a list of those member variables that are used to produce the string

In [ ]:
def _make_string(self, Attributes):
        # map the function __str__ to all attributes and join them with a comma
        name = self.__class__.__name__
        return f"{name}({', '.join(map(str, [getattr(self, at) for at in Attributes]))})"
    
CodingTree._make_string = _make_string

The method $t.\texttt{toDot}()$ takes a binary trie $t$ and returns a graph that depicts the tree $t$.


In [ ]:
def toDot(self):
    dot = gv.Digraph(node_attr={'shape': 'record', 'style': 'rounded'})
    nodeDict = {}
    self._collectIDs(nodeDict)
    for n, t in nodeDict.items():
        if isinstance(t, Leaf):
            dot.node(str(n), label='{' + str(t.mCharacter) + '|' + str(t.mFrequency) + '}') 
        elif isinstance(t, Node):
            dot.node(str(n), label=str(t.count()))
        else:
            assert False, f'Unknown node {t}'
    for n, t in nodeDict.items():
        if isinstance(t, Node):
            dot.edge(str(n), str(t.mLeft .getID()), label='0')
            dot.edge(str(n), str(t.mRight.getID()), label='1')
    return dot

CodingTree.toDot = toDot

The method $t.\texttt{collectIDs}(d)$ takes a coding tree $t$ and a dictionary $d$ and updates the dictionary so that the following holds: $$ d[\texttt{id}] = n \quad \mbox{for every node $n$ in $t$.} $$ Here, $\texttt{id}$ is the unique identifier of the node $n$, i.e. $d$ associates the identifiers with the corresponding nodes.


In [ ]:
def _collectIDs(self, nodeDict):
    nodeDict[self.getID()] = self
    if isinstance(self, Node):
        self.mLeft ._collectIDs(nodeDict)
        self.mRight._collectIDs(nodeDict)
        
CodingTree._collectIDs = _collectIDs

The class Leaf represents a leaf of the form $\texttt{Leaf}(c, f)$. It maintains two member variables.

  • $c$ represents the character that is encoded. This character is stored in the member variable mCharacter.
  • $f$ represents the frequency of the character $c$ and is stored in the member variable mFrequancy.

In [ ]:
class Leaf(CodingTree):
    def __init__(self, c, f):
        CodingTree.__init__(self)
        self.mCharacter = c
        self.mFrequency = f
        
    def count(self):
        return self.mFrequency
    
    def cost(self):
        return 0
    
    def __str__(self):
        return _make_string(self, ['mCharacter', 'mFrequency'])
    
    def __lt__(self, other):
        if isinstance(other, Node):
            return True
        return self.mCharacter < other.mCharacter

The class Node represents an inner node of the form $\texttt{Node}(l, r)$. It maintains two member variables:

  • self.mLeft is the left subtree $l$,
  • self.mRight is the right subtree $r$.

In [ ]:
class Node(CodingTree):
    def __init__(self, l, r):
        CodingTree.__init__(self)
        self.mLeft  = l
        self.mRight = r

    def count(self):
        return self.mLeft.count() + self.mRight.count()
        
    def cost(self):
        return self.mLeft.cost() + self.mRight.cost() + self.count()
    
    def __str__(self):
        return _make_string(self, ['mLeft', 'mRight'])
    
    def __lt__(self, other):
        if isinstance(other, Leaf):
            return False
        return self.mLeft < other.mLeft

Building a Coding Tree

The module heapq provides priority queues. The api is given at https://docs.python.org/3/library/heapq.html. This module represents heaps as arrays.


In [ ]:
import heapq

In [ ]:
H = []
heapq.heappush(H, 7)
heapq.heappush(H, 1)
heapq.heappush(H, 0)
heapq.heappush(H, 6)
H

In [ ]:
a = heapq.heappop(H)
print('a = ', a)
H

The function $\texttt{coding_tree}$ implements Huffman's algorithm for data compression. The input $M$ is a set of pairs of the form $$ \bigl\{ (c_1, f_1), \cdots, (c_k, f_k)\bigr\} $$ where $c_i$ is a character and $f_i$ is the number of times this character occurs in the string $s$ that is to be encoded. Huffman's algorithm is greedy: It always combines those coding trees that have the least character count so far as this results in the smallest cost increase.

The heap H that is maintained by this function is a priority queue which is represented by an array that is structured as a heap. The items in this priority queue are pairs of the form $$ \bigl( t.\texttt{count}(), t \bigr) $$ where $t$ is a coding tree and $t.\texttt{count}()$ is the count of this coding tree.


In [ ]:
def coding_tree(M):
    H = []  # empty priority queue
    for c, f in M:
        heapq.heappush(H, (f, Leaf(c, f)))
    while len(H) > 1:
        a = heapq.heappop(H)
        b = heapq.heappop(H)
        heapq.heappush(H, (a[0] + b[0], Node(a[1], b[1])))
    return H[0][1]

Let us test this with a trivial example.


In [ ]:
import math

The function $\texttt{log_2}(n)$ computes $\log_2(n)$.


In [ ]:
def log_2(n):
    return math.log(n) / math.log(2)

In [ ]:
log_2(4)

In [ ]:
def demo(M):
    K = coding_tree(M)
    display(K.toDot())
    n = math.ceil(log_2(len(M)))
    cost_huffman  = K.cost()
    cost_constant = n * K.count()
    savings       = (cost_constant - cost_huffman) / cost_constant
    print(f'cost of encoding with Huffman coding tree : {cost_huffman} bits')
    print(f'cost of encoding with {n} bits              : {cost_constant} bits')
    print(f'savings: {100 * savings}%')
    return savings

In [ ]:
demo({ ('a', 990), ('b', 8), ('c', 1), ('d', 1) })

In [ ]:
demo({ ('a', 4), ('b', 9), ('c', 16), ('d', 25), ('e', 36), ('f', 49), ('g', 64) })

In [ ]:
demo({ ('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5), ('f', 6), ('g', 7), ('h', 8), ('i', 9), ('j', 10) })

In [ ]:
demo({ ('a', 1), ('b', 1), ('c', 2), ('d', 3), ('e', 5), ('f', 8), ('g', 13) })

The function $\texttt{demo_file}(\texttt{fn})$ reads the file fn and calculates the frequency of all characters occurring in fn. Using these frequencies it computes the the Huffman coding tree.


In [ ]:
def demo_file(fn):
    with open(fn, 'r') as file:
        s = file.read() # read file as string s
    Frequencies = {}
    for c in s:
        f = Frequencies.get(c, None)
        if f != None:
            Frequencies[c] += 1
        else:
            Frequencies[c] = 1
    M = { (c, f) for (c, f) in Frequencies.items() }
    print(M)
    return demo(M)

In [ ]:
demo_file('alice.txt')

In [ ]:
demo_file('moby-dick.txt')

In [ ]: