Huffman Coding


In [1]:
TEXT = "THIS IS AN EXAMPLE FOR HUFFMAN ENCODING"

In [2]:
from heapq import heappush, heappop


class HuffmanNode:

    def __init__(self, char=None, freq=None, left=None, right=None):
        self.char = str() if char is None else char
        self.freq = int() if freq is None else freq
        self.left = left
        self.right = right
        if left is not None and right is not None:
            self.char = left.char + right.char
            self.freq = left.freq + right.freq

    def __lt__(self, other):
        if self.freq < other.freq:
            return True
        elif self.freq == other.freq:
            if len(self.char) == len(other.char) and self.char > other.char:
                return True
            else:
                return len(self.char) < len(other.char)
        return False

    def __repr__(self):
        return "HuffmanNode: ({}, {})".format(self.freq, self.char)


def huffman_tree(text):

    # Calculate each letter frequency
    frequencies = {letter: text.count(letter) for letter in text}

    # Sort nodes
    # heapsort - https://docs.python.org/3/library/heapq.html#basic-examples
    nodes = []
    for letter, count in frequencies.items():
        heappush(nodes, HuffmanNode(char=letter, freq=count))

    # Generate tree
    while len(nodes) > 1:
        l_node = heappop(nodes)
        r_node = heappop(nodes)
        heappush(nodes, HuffmanNode(left=l_node, right=r_node))
    return heappop(nodes)  # root node or tree


def encode_message(text, tree):
    coding_table = __table(tree)
    message = str()
    for character in text:
        message += coding_table[character]
    return message


def __table(node, branch_label=None, coding_table=None):
    branch_label = str() if branch_label is None else branch_label
    coding_table = dict() if coding_table is None else coding_table
    if node.left is None and node.right is None:
        coding_table[node.char] = branch_label
    else:
        __table(node.left,
                branch_label=branch_label + '0',
                coding_table=coding_table)
        __table(node.right,
                branch_label=branch_label + '1',
                coding_table=coding_table)
    return coding_table

Coding


In [3]:
tree = huffman_tree(TEXT)
encoded_text = encode_message(TEXT, tree)
encoded_text


Out[3]:
'0110001001001111111011001111111011110000101110101010111000110111001111110110111000010011011010100010111100110000111110000101110100011110001010001100100010000'

Decoding


In [4]:
reverse_table = {value: key for key, value in __table(tree).items()}
compressed_list = list(encoded_text)
decoded_text = str()
substring = str()

while len(compressed_list):
    substring += compressed_list.pop(0)
    extraction = reverse_table.get(substring, None)
    if extraction is not None:
        decoded_text += extraction
        substring = str()
decoded_text


Out[4]:
'THIS IS AN EXAMPLE FOR HUFFMAN ENCODING'

Does it work?


In [5]:
BITS = list()
for character in TEXT:
    bits = bin(ord(character))[2:]
    bits = '00000000'[len(bits):] + bits
    BITS.extend([int(b) for b in bits])
    
print("Original (str): {}".format(TEXT))
print()
print("Size: {} bits".format(len(BITS)))
print("Size: {} chars".format(len(TEXT)))


Original (str): THIS IS AN EXAMPLE FOR HUFFMAN ENCODING

Size: 312 bits
Size: 39 chars

In [6]:
from_bits = "".join(chr(int("".join(map(str,encoded_text[i:i+8])),2)) for i in range(0,len(encoded_text),8))

print("Encoded (bits): {}".format(encoded_text))
print("Encoded (str): {}".format(from_bits))
print()
print("Size: {} bits".format(len(encoded_text)))
print("Size: {} chars".format(len(from_bits)))


Encoded (bits): 0110001001001111111011001111111011110000101110101010111000110111001111110110111000010011011010100010111100110000111110000101110100011110001010001100100010000
Encoded (str): bOìþðº®7?nj/0ø](È

Size: 157 bits
Size: 20 chars