In [1]:
TEXT = "THIS IS AN EXAMPLE FOR HUFFMAN ENCODING"
In [2]:
from heapq import heappush, heappop
class HuffmanNode:
def __init__(self, char=None, freq=None, left=None, right=None):
self.char = str() if char is None else char
self.freq = int() if freq is None else freq
self.left = left
self.right = right
if left is not None and right is not None:
self.char = left.char + right.char
self.freq = left.freq + right.freq
def __lt__(self, other):
if self.freq < other.freq:
return True
elif self.freq == other.freq:
if len(self.char) == len(other.char) and self.char > other.char:
return True
else:
return len(self.char) < len(other.char)
return False
def __repr__(self):
return "HuffmanNode: ({}, {})".format(self.freq, self.char)
def huffman_tree(text):
# Calculate each letter frequency
frequencies = {letter: text.count(letter) for letter in text}
# Sort nodes
# heapsort - https://docs.python.org/3/library/heapq.html#basic-examples
nodes = []
for letter, count in frequencies.items():
heappush(nodes, HuffmanNode(char=letter, freq=count))
# Generate tree
while len(nodes) > 1:
l_node = heappop(nodes)
r_node = heappop(nodes)
heappush(nodes, HuffmanNode(left=l_node, right=r_node))
return heappop(nodes) # root node or tree
def encode_message(text, tree):
coding_table = __table(tree)
message = str()
for character in text:
message += coding_table[character]
return message
def __table(node, branch_label=None, coding_table=None):
branch_label = str() if branch_label is None else branch_label
coding_table = dict() if coding_table is None else coding_table
if node.left is None and node.right is None:
coding_table[node.char] = branch_label
else:
__table(node.left,
branch_label=branch_label + '0',
coding_table=coding_table)
__table(node.right,
branch_label=branch_label + '1',
coding_table=coding_table)
return coding_table
In [3]:
tree = huffman_tree(TEXT)
encoded_text = encode_message(TEXT, tree)
encoded_text
Out[3]:
In [4]:
reverse_table = {value: key for key, value in __table(tree).items()}
compressed_list = list(encoded_text)
decoded_text = str()
substring = str()
while len(compressed_list):
substring += compressed_list.pop(0)
extraction = reverse_table.get(substring, None)
if extraction is not None:
decoded_text += extraction
substring = str()
decoded_text
Out[4]:
In [5]:
BITS = list()
for character in TEXT:
bits = bin(ord(character))[2:]
bits = '00000000'[len(bits):] + bits
BITS.extend([int(b) for b in bits])
print("Original (str): {}".format(TEXT))
print()
print("Size: {} bits".format(len(BITS)))
print("Size: {} chars".format(len(TEXT)))
In [6]:
from_bits = "".join(chr(int("".join(map(str,encoded_text[i:i+8])),2)) for i in range(0,len(encoded_text),8))
print("Encoded (bits): {}".format(encoded_text))
print("Encoded (str): {}".format(from_bits))
print()
print("Size: {} bits".format(len(encoded_text)))
print("Size: {} chars".format(len(from_bits)))