Content and Objectives

  • Realizing Huffman encoding by a recursive function constructing of the Huffman tree
  • Application of this function to several examples

Import


In [1]:
# importing
import numpy as np

Recursive implementation of Huffman


In [2]:
# Huffman function
def huffman_recursive( symb_dict, show_steps = 0 ):
    '''
    Recursive implementation of Huffman coding
    partly according to: https://gist.github.com/mreid/fdf6353ec39d050e972b
    
    Note: For convenience the two most unlikely symbols are at the beginning/the first indices of the dict
    
    IN: symb_dict ( dictionary of { letter : probability } )
        show_steps ( boolean allowing output of intermediate codes/intermediate steps )
    
    OUT: code_dict (dict of shape { letter: codeword } )
    '''
    
    # check that probability equals 1.0 (approximately) 
    np.testing.assert_almost_equal( sum( symb_dict.values() ), 1.0 )
  

    # if length equals 2 use 1 bit,
    # the shorter sequence obtaining leading 0
    if len( symb_dict) == 2:
        
        # compare sequence lengths and return coding ( shorter sequence coded by leading 0 )
        if len( list( symb_dict.keys() )[1] ) > len( list( symb_dict.keys() )[0] ):
            return dict( zip( symb_dict.keys(), ['1', '0' ] ) )
        
        else:         
            return dict( zip( symb_dict.keys(), ['0', '1' ] ) )      
  

    # copy dict
    symb_dict_new = symb_dict.copy() 
       
    # sort dict w.r.t. increasing probability
    #
    # NOTE: lambda is an on-the-fly definition of a function of syntax "lambda with variables: do";
    # so lambda t: t[1] simply gets second element of t
    symb_dict_new_sorted = sorted( symb_dict_new.items(), key=lambda t: t[1])
    
    # if activated, show intermediate dicts for illustration 
    if show_steps:
        dict_for_printing = [ ( key, round(val, 4) ) for key, val in symb_dict_new_sorted ]
        print( dict_for_printing )
        print( )
    
    # find two least probable symbols
    # NOTE: - [ i ] gives a dict entry; 
    #       - [ i ][ 0 ] gives the key of the dict entry, corresponding to the symbol
    s_N_1 = symb_dict_new_sorted[ 1 ][ 0 ]
    s_N = symb_dict_new_sorted[ 0 ][ 0 ]

    # pop according entries and create a new one with sum probability
    # key is concatenation of the old symbols
    p_N_1 = symb_dict_new.pop( s_N_1 )
    p_N = symb_dict_new.pop( s_N )
    
    symb_dict_new[ s_N + s_N_1 ] = p_N + p_N_1
    
    
    # apply recursion for the reduced symbol set
    code_dict = huffman_recursive( symb_dict_new, show_steps )
    
    
    # get codeword and append '1'/'0' for going up/down respectively
    cw = code_dict.pop( s_N + s_N_1 )

    code_dict[ s_N_1 ] = cw + '1'    
    code_dict[ s_N ] = cw + '0'
    
    return code_dict

Applying Huffman function to different examples


In [3]:
# two booleans for 
# choosing example to be used and 
# choosing whether or not showing intermediate results of Huffman
example = 1
show_intermediate_steps = False

if example == 1:
    # letters of german alphabet
    # probability of letters in percent
    # source: https://de.wikipedia.org/wiki/Buchstabenhaeufigkeit
    letters_raw = (
            ('a', 6.51), ('b', 1.89), ('c', 3.06), ('d', 5.08), ('e', 17.40), 
            ('f', 1.66), ('g', 3.01), ('h', 4.76), ('i', 7.55), ('j', 0.27), 
            ('k', 1.21), ('l', 3.44), ('m', 2.53), ('n', 9.78), ('o', 2.51), 
            ('p', 0.79), ('q', 0.02), ('r', 7.00), ('s', 7.27), ('t', 6.15), 
            ('u', 4.35), ('v', 0.67), ('w', 1.89), ('x', 0.03), ('y', 0.04), 
            ('z', 1.13)
        ) 

elif example == 2:
    # symbols as in the lecture
    letters_raw = (
        ('x1', .25), ('x2', .2), ('x3', .2), ('x4', .15), ('x5', .07),
        ('x6', .05), ('x7', .025), ('x8', .025), ('x9', .02), ('x10', .01)
        )
    
# elif example == 3:
    #
    # add your own example here!
    #letters_raw = (
    # 
    #)

    
# transform to dict and normalize to have sum equal to 1
symb_dict = dict( (x,y) for (x,y) in letters_raw )

s = sum( symb_dict.values() )   
symb_dict.update( (key, val / s ) for key, val in symb_dict.items() )    

      
# apply Huffman function defined above
code = huffman_recursive( symb_dict , show_intermediate_steps )

# print various information
print('-------------------------')

print('Huffman coding: \n\n {}\n'.format( sorted( code.items(), key=lambda t: t[0] ) ) )

# determine average codeword length
L = 0
for l, p in symb_dict.items():
    L += p * len( code[ l ] )

    
print('-------------------------')

print('Average codeword length: \tL = {:2.2f}'.format( L ) )

p_letters = list( symb_dict.values() )
print('Entropy: \t\t\tH(X) = {}'.format( - np.sum( p_letters * np.log2( p_letters ) ) ) )

print('Max. Entropie: \t\t\tH0 = {:2.2f}'.format( np.log2(len(p_letters))) )


-------------------------
Huffman coding: 

 [('a', '0001'), ('b', '001110'), ('c', '00000'), ('d', '1101'), ('e', '011'), ('f', '001100'), ('g', '11101'), ('h', '1001'), ('i', '0101'), ('j', '001101101'), ('k', '110001'), ('l', '00001'), ('m', '11100'), ('n', '101'), ('o', '11001'), ('p', '0011010'), ('q', '00110110010'), ('r', '0010'), ('s', '0100'), ('t', '1111'), ('u', '1000'), ('v', '00110111'), ('w', '001111'), ('x', '00110110011'), ('y', '0011011000'), ('z', '110000')]

-------------------------
Average codeword length: 	L = 4.10
Entropy: 			H(X) = 4.062887639566763
Max. Entropie: 			H0 = 4.70

Huffman-Coding of pre-defined text

Remark: For this to work, example 1 above has to be chosen.


In [4]:
# define text
text = 'nachrichtentechnik ist toll'

# remove spaces
text_clean = text.replace(' ', '')

# code text by simply parsing symbols and concatenating according codewords
# requires previous sections to be completed
coded_text = []
for t in text_clean:
    coded_text.append( code[ t ] )  

# switch keys and values for decoding
decode = dict( (v, k) for (k, v) in code.items() )

# decode by parsing codewords and using "inverse dict"
decoded_text = []
for c in coded_text:
    decoded_text += decode[ c ]
    

# transform decoded list to string
decoded_str = ''.join( decoded_text )

# print various information
print('-------------------------')

print('Original text: {}\n'.format( text_clean ) )

print('Number of letters: {}\n\n'.format( len( text_clean ) ) )

print('Coded: {}\n'.format( ''.join( coded_text ) ) )

print('Bits with Huffman: {}\n\n'.format( len( ''.join( coded_text ) ) ) )

print('Decoded: {}'.format( decoded_str ) )


-------------------------
Original text: nachrichtentechnikisttoll

Number of letters: 25


Coded: 1010001000001001001001010000010011111011101111101100000100110101011100010101010011111111110010000100001

Bits with Huffman: 103


Decoded: nachrichtentechnikisttoll

In [ ]: