In [22]:
import math
import string

def sign_entropy(sign_probability):
    return -math.log(sign_probability, 2)

def sign_mean_entropy(sign_probability):
    return sign_probability * sign_entropy(sign_probability)

class LZW:
    def __init__(self, input_):
        self.orig_input = input_
        self.output = []
        self.dict_ = {}
        self.dict_value = 256
        self.reset()
        
    def reset(self):
        self.prefix = None
        self.code = None
        self.code_old = None

    def init_dict(self, ord_as_index=False):
        self.dict_ = {}
        self.dict_value = 256
        
        for char in self.orig_input:
            if ord_as_index:
                self.dict_[ord(char)] = char
            
            else:
                self.dict_[char] = ord(char)

    def add_to_dict(self, char, value_as_index=False):
        if value_as_index:
            self.dict_[self.dict_value] = char
    
        else:
            self.dict_[char] = self.dict_value
            
        self.dict_value += 1

    def encode(self):
        self.reset()
        self.output = []
        self.prefix = self.orig_input[0]
        
        self.init_dict()
    
        for i in range(1, len(self.orig_input)):
            current_sign = self.orig_input[i]
            lookahead = ''.join([self.prefix, current_sign])
        
            if lookahead in self.dict_:
                self.prefix = lookahead
            
            else:
                print 'output: {0}; ascii: {1}'.format(self.dict_[self.prefix], self.prefix)
                self.output.append(self.dict_[self.prefix])
                self.add_to_dict(lookahead)
                self.prefix = current_sign
    
        if len(self.prefix) > 0:
            print 'output: {0}; ascii: {1}'.format(ord(self.prefix), self.prefix)
            self.output.append(ord(self.prefix))
        
        return  self.output
    
    def decode(self, init_dict=True):
        self.reset()
        self.code = self.output[0]
        print 'In: {0}; Out: {1}'.format(self.code, chr(self.code))
    
        if init_dict:
            self.init_dict(ord_as_index=True)
    
        for i in self.output[1::]:
            self.code_old = self.code
            self.code = i

            if self.code in self.dict_:
                print 'In: {0}; Out: {1}'.format(self.code, self.dict_[self.code])
                self.prefix = self.code_old
                sign = self.dict_[self.code][:1]
                self.add_to_dict(''.join([self.dict_[self.prefix], sign]), value_as_index=True)
        
            else:
                self.prefix = self.code_old
                sign = self.dict_[self.code_old][:1]
                print 'In: {0}; Out: {1}'.format(self.prefix, self.dict_[self.prefix])
                
    def print_statistics(self):
        print 'Input "{0}" has {1} Bytes'.format(self.orig_input, len(self.orig_input))
        print 'Ouput has {0} Bytes.'.format(len(self.output) * 2)
        print 'Reduction: {0} %'.format(100 - (100 / len(self.orig_input) * (len(self.output) * 2)))

In [23]:
input_string = 'BLAH_BLAH_BLAH_BLAH_BLAH_BLAH_BLAH_BLAH'
lzw = LZW(input_string)

In [24]:
output_stream = lzw.encode()


output: 66; ascii: B
output: 76; ascii: L
output: 65; ascii: A
output: 72; ascii: H
output: 95; ascii: _
output: 256; ascii: BL
output: 258; ascii: AH
output: 260; ascii: _B
output: 257; ascii: LA
output: 259; ascii: H_
output: 261; ascii: BLA
output: 265; ascii: H_B
output: 264; ascii: LAH
output: 263; ascii: _BL
output: 262; ascii: AH_
output: 266; ascii: BLAH
output: 269; ascii: _BLA
output: 72; ascii: H

In [25]:
lzw.decode()


In: 66; Out: B
In: 76; Out: L
In: 65; Out: A
In: 72; Out: H
In: 95; Out: _
In: 256; Out: BL
In: 258; Out: AH
In: 260; Out: _B
In: 257; Out: LA
In: 259; Out: H_
In: 261; Out: BLA
In: 265; Out: H_B
In: 264; Out: LAH
In: 263; Out: _BL
In: 262; Out: AH_
In: 266; Out: BLAH
In: 269; Out: _BLA
In: 72; Out: H

In [26]:
lzw.print_statistics()


Input "BLAH_BLAH_BLAH_BLAH_BLAH_BLAH_BLAH_BLAH" has 39 Bytes
Ouput has 36 Bytes.
Reduction: 28 %

In [30]:
input_ = [77, 73, 78, 85, 83, 257, 259, 256, 258, 260, 262, 83, 264, 259, 261, 265]

lzw_2 = LZW(None)
lzw_2.dict_ = {}
lzw_2.dict_[77] = 'M'
lzw_2.dict_[73] = 'I'
lzw_2.dict_[78] = 'N'
lzw_2.dict_[85] = 'U'
lzw_2.dict_[83] = 'S'


lzw_2.output = input_

lzw_2.decode(init_dict=False)

for o in lzw_2.output:
    print lzw_2.dict_[o],


In: 77; Out: M
In: 73; Out: I
In: 78; Out: N
In: 85; Out: U
In: 83; Out: S
In: 257; Out: IN
In: 259; Out: US
In: 256; Out: MI
In: 258; Out: NU
In: 260; Out: SI
In: 262; Out: USM
In: 83; Out: S
In: 264; Out: NUS
In: 259; Out: US
In: 261; Out: INU
In: 265; Out: SIU
M I N U S IN US MI NU SI USM S NUS US INU SIU

In [ ]: