Huffman coding is a lossless data compression algorithm. The idea is to assign variable-length codes to input characters, lengths of the assigned codes are based on the frequencies of corresponding characters. The most frequent character gets the smallest code and the least frequent character gets the largest code.
In [1]:
from collections import Counter
from operator import itemgetter
In [2]:
def find_min(queue):
elem = min(queue, key=itemgetter(0))
queue.remove(elem)
return elem
def huffman_coding(text):
queue = [(c, e) for (e, c) in Counter(text).items()]
while len(queue) > 1:
c1, e1 = find_min(queue)
c2, e2 = find_min(queue)
queue.append((c1 + c2, (e1, e2)))
assert len(queue) == 1
mapping = dict()
create_codes(queue[0][1], mapping)
return mapping
In [3]:
def create_codes(node, mapping, code=''):
if isinstance(node, tuple):
create_codes(node[0], mapping, code + '0')
create_codes(node[1], mapping, code + '1')
else:
mapping[node] = code
In [4]:
def encode(text):
mapping = huffman_coding(text)
encoded = ''.join(map(lambda c: mapping[c], text))
return encoded
def decode(encoded, mapping):
rev_mapping = dict((m, c) for c, m in mapping.items())
decoded = []
word = ''
for c in encoded:
word += c
if word in rev_mapping:
decoded.append(rev_mapping[word])
word = ''
return ''.join(decoded)
In [5]:
text = 'alexandru'
mapping = huffman_coding(text)
print("Huffman codes")
for (e, m) in sorted(mapping.items()):
print(e, m)
print()
print("Encoded text:", encode(text))
print("Decoded text:", decode(encode(text), mapping))
In [6]:
text = 'abca'
mapping = huffman_coding(text)
print("Huffman codes")
for (e, m) in sorted(mapping.items()):
print(e, m)
print()
print("Encoded text:", encode(text))
print("Decoded text:", decode(encode(text), mapping))
In [7]:
text = 'abca'
mapping = huffman_coding(text)
print("Huffman codes")
for (e, m) in sorted(mapping.items()):
print(e, m)
print()
print("Encoded text:", encode(text))
print("Decoded text:", decode(encode(text), mapping))
In [8]:
text = 'abcaabccabda'
mapping = huffman_coding(text)
print("Huffman codes")
for (e, m) in sorted(mapping.items()):
print(e, m)
print()
print("Encoded text:", encode(text))
print("Decoded text:", decode(encode(text), mapping))