In [1]:
# importing
import numpy as np
In [2]:
# Huffman function
def huffman_recursive( symb_dict, show_steps = 0 ):
'''
Recursive implementation of Huffman coding
partly according to: https://gist.github.com/mreid/fdf6353ec39d050e972b
Note: For convenience the two most unlikely symbols are at the beginning/the first indices of the dict
IN: symb_dict ( dictionary of { letter : probability } )
show_steps ( boolean allowing output of intermediate codes/intermediate steps )
OUT: code_dict (dict of shape { letter: codeword } )
'''
# check that probability equals 1.0 (approximately)
np.testing.assert_almost_equal( sum( symb_dict.values() ), 1.0 )
# if length equals 2 use 1 bit,
# the shorter sequence obtaining leading 0
if len( symb_dict) == 2:
# compare sequence lengths and return coding ( shorter sequence coded by leading 0 )
if len( list( symb_dict.keys() )[1] ) > len( list( symb_dict.keys() )[0] ):
return dict( zip( symb_dict.keys(), ['1', '0' ] ) )
else:
return dict( zip( symb_dict.keys(), ['0', '1' ] ) )
# copy dict
symb_dict_new = symb_dict.copy()
# sort dict w.r.t. increasing probability
#
# NOTE: lambda is an on-the-fly definition of a function of syntax "lambda with variables: do";
# so lambda t: t[1] simply gets second element of t
symb_dict_new_sorted = sorted( symb_dict_new.items(), key=lambda t: t[1])
# if activated, show intermediate dicts for illustration
if show_steps:
dict_for_printing = [ ( key, round(val, 4) ) for key, val in symb_dict_new_sorted ]
print( dict_for_printing )
print( )
# find two least probable symbols
# NOTE: - [ i ] gives a dict entry;
# - [ i ][ 0 ] gives the key of the dict entry, corresponding to the symbol
s_N_1 = symb_dict_new_sorted[ 1 ][ 0 ]
s_N = symb_dict_new_sorted[ 0 ][ 0 ]
# pop according entries and create a new one with sum probability
# key is concatenation of the old symbols
p_N_1 = symb_dict_new.pop( s_N_1 )
p_N = symb_dict_new.pop( s_N )
symb_dict_new[ s_N + s_N_1 ] = p_N + p_N_1
# apply recursion for the reduced symbol set
code_dict = huffman_recursive( symb_dict_new, show_steps )
# get codeword and append '1'/'0' for going up/down respectively
cw = code_dict.pop( s_N + s_N_1 )
code_dict[ s_N_1 ] = cw + '1'
code_dict[ s_N ] = cw + '0'
return code_dict
In [3]:
# two booleans for
# choosing example to be used and
# choosing whether or not showing intermediate results of Huffman
example = 1
show_intermediate_steps = False
if example == 1:
# letters of german alphabet
# probability of letters in percent
# source: https://de.wikipedia.org/wiki/Buchstabenhaeufigkeit
letters_raw = (
('a', 6.51), ('b', 1.89), ('c', 3.06), ('d', 5.08), ('e', 17.40),
('f', 1.66), ('g', 3.01), ('h', 4.76), ('i', 7.55), ('j', 0.27),
('k', 1.21), ('l', 3.44), ('m', 2.53), ('n', 9.78), ('o', 2.51),
('p', 0.79), ('q', 0.02), ('r', 7.00), ('s', 7.27), ('t', 6.15),
('u', 4.35), ('v', 0.67), ('w', 1.89), ('x', 0.03), ('y', 0.04),
('z', 1.13)
)
elif example == 2:
# symbols as in the lecture
letters_raw = (
('x1', .25), ('x2', .2), ('x3', .2), ('x4', .15), ('x5', .07),
('x6', .05), ('x7', .025), ('x8', .025), ('x9', .02), ('x10', .01)
)
# elif example == 3:
#
# add your own example here!
#letters_raw = (
#
#)
# transform to dict and normalize to have sum equal to 1
symb_dict = dict( (x,y) for (x,y) in letters_raw )
s = sum( symb_dict.values() )
symb_dict.update( (key, val / s ) for key, val in symb_dict.items() )
# apply Huffman function defined above
code = huffman_recursive( symb_dict , show_intermediate_steps )
# print various information
print('-------------------------')
print('Huffman coding: \n\n {}\n'.format( sorted( code.items(), key=lambda t: t[0] ) ) )
# determine average codeword length
L = 0
for l, p in symb_dict.items():
L += p * len( code[ l ] )
print('-------------------------')
print('Average codeword length: \tL = {:2.2f}'.format( L ) )
p_letters = list( symb_dict.values() )
print('Entropy: \t\t\tH(X) = {}'.format( - np.sum( p_letters * np.log2( p_letters ) ) ) )
print('Max. Entropie: \t\t\tH0 = {:2.2f}'.format( np.log2(len(p_letters))) )
Remark: For this to work, example 1 above has to be chosen.
In [4]:
# define text
text = 'nachrichtentechnik ist toll'
# remove spaces
text_clean = text.replace(' ', '')
# code text by simply parsing symbols and concatenating according codewords
# requires previous sections to be completed
coded_text = []
for t in text_clean:
coded_text.append( code[ t ] )
# switch keys and values for decoding
decode = dict( (v, k) for (k, v) in code.items() )
# decode by parsing codewords and using "inverse dict"
decoded_text = []
for c in coded_text:
decoded_text += decode[ c ]
# transform decoded list to string
decoded_str = ''.join( decoded_text )
# print various information
print('-------------------------')
print('Original text: {}\n'.format( text_clean ) )
print('Number of letters: {}\n\n'.format( len( text_clean ) ) )
print('Coded: {}\n'.format( ''.join( coded_text ) ) )
print('Bits with Huffman: {}\n\n'.format( len( ''.join( coded_text ) ) ) )
print('Decoded: {}'.format( decoded_str ) )
In [ ]: