Huffman Algorithm

In this programming problem and the next you'll code up the greedy algorithm from the lectures on Huffman coding.

The file huffman.txt describes an instance of the problem. It has the following format:

[number_of_symbols]

[weight of symbol #1]

[weight of symbol #2]

...

For example, the third line of the file is "6852892," indicating that the weight of the second symbol of the alphabet is 6852892. (We're using weights instead of frequencies, like in the "A More Complex Example" video.)

Your task in this problem is to run the Huffman coding algorithm from lecture on this data set.

ADVICE: If you're not getting the correct answer, try debugging your algorithm using some small test cases. And then post them to the discussion forum!


In [60]:
import collections
import numpy as np
import heapq


FILE = "huffman.txt"
# FILE = "test2.txt"

fp = open(FILE, 'r')

data = fp.readlines()
n= int(data[0])

weights = data[1:]
vertices = [i for i in range(n)]

print (n)


1000

In [61]:
minheap = [(int(weights[i]), i) for i in range(len(weights))]

# Create a MinHeap
heapq.heapify(minheap)

# The Huffman algorithm
while (len(minheap)>1):
    min1 = heapq.heappop(minheap)
    min2 = heapq.heappop(minheap)
    
    newweight = min1[0] + min2[0]
    newnode = (min1[1], min2[1])
    
    heapq.heappush(minheap, (newweight, newnode))
    
topNode = heapq.heappop(minheap)

In [62]:
# Tree Labeling
graph = {}

def label(node, graph, seq=""):
#     print ("nodes", node)
    
    # check if left is a leaf
    if (isinstance(node[0], int)):          
        graph[node[0]] = seq+"0"
    else:
        label(node[0], graph, seq + "0")

    # Check if right node is a leaf
    if (isinstance(node[1], int)):  
        graph[node[1]] = seq+"1"
    else:
        label(node[1], graph, seq+"1")
 
    
label(topNode[1] , graph,  "" )
print (graph)


{0: '000011001', 1: '1110111000', 2: '11100010010', 3: '001100101', 4: '10111110101', 5: '10111010000', 6: '1111010001', 7: '1101000010', 8: '11000011001', 9: '010111001', 10: '1111101100', 11: '100000011', 12: '100000110', 13: '0100101010', 14: '1110000111001', 15: '011001101', 16: '000010101', 17: '1100001000', 18: '00011011011', 19: '0100011011', 20: '1111101110', 21: '11110011111', 22: '1101001101', 23: '010101001', 24: '11001110001', 25: '0110000001101', 26: '1011010000', 27: '1110100101010', 28: '11000110000010', 29: '1101100101', 30: '100011000', 31: '1100011010', 32: '1111010010', 33: '101100011', 34: '010111010101', 35: '1111100011', 36: '101010111', 37: '10110110010', 38: '01101010011', 39: '001101010', 40: '010110100', 41: '100010111', 42: '011011100', 43: '010111011', 44: '000111101', 45: '1010001101', 46: '10110010110', 47: '1110100011', 48: '000001000', 49: '1101000111001', 50: '010010010', 51: '0100000011', 52: '0100001001', 53: '0110100010', 54: '1100000110', 55: '001110110', 56: '010001100', 57: '100111100', 58: '0000101111', 59: '1101010101', 60: '1111000111', 61: '000000010', 62: '1110010100', 63: '1101010110', 64: '1101110101', 65: '1100110100', 66: '01111110110110', 67: '1011100011', 68: '1110100101011', 69: '0110111010', 70: '11101100001', 71: '11101111000', 72: '11001011111', 73: '11010111001', 74: '1110001000', 75: '1111011000', 76: '011111111', 77: '10001010000', 78: '1101000011', 79: '1111111011010', 80: '111110010000', 81: '010001010', 82: '11111101000', 83: '1100101011', 84: '00000111000', 85: '011111100', 86: '011011010', 87: '1011000010100', 88: '10010010111', 89: '11010001010', 90: '11101001011', 91: '100001000010', 92: '000111110', 93: '1110010101', 94: '0001000000', 95: '000101001', 96: '1101110111110', 97: '1011111110', 98: '1100100111', 99: '1011110110', 100: '010000011', 101: '11000111011', 102: '1001001010', 103: '1000011010', 104: '100101001', 105: '101100010', 106: '1101110111100', 107: '0010001111', 108: '10100110110', 109: '1010011011110', 110: '11000101011', 111: '1110000000', 112: '001110010', 113: '0001011110', 114: '100011111', 115: '111101100110', 116: '0010011111001', 117: '010100111', 118: '101110110110', 119: '1011000010101', 120: '11101100000', 121: '111100111101', 122: '1010011111', 123: '01000110100', 124: '011011000', 125: '1111110011', 126: '100111010', 127: '1010110010', 128: '1111001111000', 129: '0100100011', 130: '1010001100', 131: '1100001111', 132: '010000010', 133: '0111010010', 134: '0111101101', 135: '10001010001', 136: '010100100', 137: '100001000011', 138: '101100001001', 139: '001101111', 140: '0011111000', 141: '101100001000', 142: '0000001111', 143: '0100001000', 144: '1001000010', 145: '001001001', 146: '101101010101', 147: '100010101', 148: '1011111101', 149: '011000101', 150: '111101111000', 151: '1101101011', 152: '1101110110', 153: '01000110101', 154: '1011011101', 155: '00101100001', 156: '00001110000', 157: '11011010011', 158: '1000100010', 159: '010011011', 160: '1100011100', 161: '00100001100', 162: '110010001100', 163: '1110000100', 164: '11011011111', 165: '1111101111', 166: '001001011', 167: '101000000', 168: '1011111011', 169: '001100110', 170: '1001100100', 171: '1111111100', 172: '001010000', 173: '1100010001', 174: '011111101101111', 175: '0000111001', 176: '0101000100', 177: '011100001', 178: '1101101110', 179: '1101011101', 180: '100011101', 181: '001110100', 182: '101001000', 183: '100001001', 184: '110100011101', 185: '101110011110', 186: '010010100', 187: '001001101', 188: '0111010011111', 189: '1100010111', 190: '110010001110', 191: '1111100000', 192: '0111010011110', 193: '1011111100', 194: '110101101110', 195: '011101011', 196: '000100110', 197: '1011100101', 198: '000001111', 199: '011011001', 200: '1101011010', 201: '010011000', 202: '1011100110', 203: '1100101100', 204: '001011111', 205: '11010011111', 206: '1101100001111', 207: '0000101110', 208: '010101111', 209: '110001110101', 210: '1100111100110011', 211: '0001001001', 212: '0100001101', 213: '100001101111', 214: '100110110', 215: '1101101101', 216: '1111000010', 217: '001011110', 218: '11101101100', 219: '1111110111', 220: '1101011111', 221: '011000110', 222: '10111011100', 223: '001000000', 224: '1101000001', 225: '1000001001', 226: '110001100001', 227: '1011110101', 228: '0100101011', 229: '011010011', 230: '010001111', 231: '011111011', 232: '1100111111', 233: '100011110', 234: '01010001010', 235: '001010001', 236: '00111100000', 237: '101000010', 238: '01101110111', 239: '1101110100', 240: '0010110001', 241: '1100101000', 242: '010110111', 243: '1000101001', 244: '011001001', 245: '11111110110111', 246: '1100000111', 247: '1101111101', 248: '11100110111001', 249: '100111000', 250: '11111101001', 251: '1111110110', 252: '110001110100', 253: '1110101011', 254: '1111111001', 255: '01101110110', 256: '1000100101', 257: '1100000010', 258: '011100101', 259: '1111000101', 260: '1100011011', 261: '110011001011', 262: '1101111000', 263: '1101011011000', 264: '1110101001', 265: '0110101110', 266: '111001001111', 267: '11101010001', 268: '1100100000', 269: '010011010', 270: '1111011100', 271: '1011110011', 272: '011000010', 273: '10000101111', 274: '011010111110', 275: '110000000110', 276: '00100111110001', 277: '1101101010', 278: '10010000111', 279: '0000000110', 280: '000011000', 281: '011001000', 282: '100101101', 283: '11001110000', 284: '01111010001', 285: '1110011110', 286: '00010010001', 287: '100111111', 288: '011011111', 289: '11011010010', 290: '010100000', 291: '001010100', 292: '000110101', 293: '001111011', 294: '1100111001', 295: '011100000', 296: '1101000111000', 297: '001111110', 298: '001000101', 299: '1110000001', 300: '0010010000', 301: '1000111001', 302: '1001010101', 303: '011110111', 304: '11010100101011', 305: '11011001100', 306: '110110111100', 307: '1011110000', 308: '0110101000', 309: '011010010', 310: '1111011010', 311: '110101101100101', 312: '00110110101', 313: '111001101110001', 314: '11100101101', 315: '101111100110', 316: '100001101100', 317: '1110001011', 318: '101101001100', 319: '101011010', 320: '011111000', 321: '1010010010', 322: '1000000001', 323: '011111101100', 324: '1111111101', 325: '000110010', 326: '1100000000', 327: '10111111111', 328: '111110110111', 329: '1111001000', 330: '000110000', 331: '000110110101', 332: '1101100100', 333: '1101101100', 334: '11100100000', 335: '0111111010', 336: '11101110011', 337: '100100100', 338: '011000111', 339: '101101100110', 340: '1111011101', 341: '11111110001', 342: '101100111', 343: '11110011101', 344: '111010010100', 345: '11011100110', 346: '11000111110', 347: '0111000111', 348: '1101111010', 349: '1110000101', 350: '1101100001110', 351: '0001101111', 352: '00000100111', 353: '1100010010', 354: '11001100100', 355: '0000011101', 356: '0110000001110100', 357: '1101001110', 358: '1110111101', 359: '1001011100', 360: '1111101011', 361: '10111010001', 362: '101010011', 363: '1111101010', 364: '1100101001', 365: '10111100011', 366: '101100101110', 367: '0100000010011', 368: '101000101110', 369: '001111111', 370: '011010101', 371: '1111001111001011', 372: '1001011000', 373: '10111100010', 374: '100101111', 375: '1110101100010', 376: '0100111111', 377: '1101101000', 378: '10111110100', 379: '000010001', 380: '101101010100', 381: '1110011001', 382: '1100111100110001', 383: '0101000101111', 384: '1110011100', 385: '11010110011', 386: '1110101101', 387: '1111011011', 388: '1001110110', 389: '1001111101', 390: '1111010101', 391: '000000000', 392: '1101001001', 393: '1011101011', 394: '000101110', 395: '100001111', 396: '100011010', 397: '01101010010', 398: '011110101', 399: '11100101111', 400: '10110101011', 401: '10000101110', 402: '101010101', 403: '001011101', 404: '1111010000', 405: '01111110111', 406: '001000100', 407: '10000100000', 408: '10111010101', 409: '1100010011', 410: '1100000011', 411: '010100011', 412: '00000011100', 413: '010110110', 414: '000110110100', 415: '1110001010', 416: '100010110', 417: '11010001011', 418: '101000011', 419: '10000010001', 420: '1111001010', 421: '1001010100', 422: '0110010110', 423: '101011000', 424: '101010010', 425: '1001110111', 426: '1100111110', 427: '00001110001', 428: '0010101100', 429: '1010101100', 430: '101011110', 431: '000100100001', 432: '10110111111', 433: '101111111100', 434: '100110111', 435: '0110000000', 436: '01101011110', 437: '010101100', 438: '1110000110', 439: '1000100100', 440: '100000010', 441: '010001011', 442: '010000000', 443: '0010000011', 444: '000100101', 445: '1101001011', 446: '11101001110', 447: '0010010001', 448: '11001111001100100', 449: '0111101100', 450: '001100011', 451: '000001110011', 452: '11111110110110', 453: '1100110000', 454: '1110111011', 455: '011100100', 456: '10000010000', 457: '000101100', 458: '010111010100', 459: '1100110110', 460: '1011010111', 461: '001111101', 462: '0010001110', 463: '1111000110', 464: '10110100111', 465: '11100010011', 466: '011101000', 467: '0011000011', 468: '00110110100', 469: '111000011101', 470: '110000100110000', 471: '1100111100110010100', 472: '1110001111', 473: '1100001001101', 474: '111100010000', 475: '100110001', 476: '1100100011011', 477: '011000001', 478: '1100100001', 479: '101001101110', 480: '010100001', 481: '1111101000', 482: '1010111011', 483: '1110111110', 484: '1110100001', 485: '01100000011100', 486: '10111010011', 487: '1111100111100', 488: '11101101101', 489: '11100101100', 490: '11010100101010', 491: '0111011100', 492: '1110000010', 493: '001010101', 494: '000111010', 495: '1101110001', 496: '1011000011', 497: '11100000111', 498: '1100011110', 499: '010100101', 500: '01101000111', 501: '0011111001', 502: '1111111111', 503: '1111100111101', 504: '001110111', 505: '1011101100', 506: '000001010', 507: '001111010', 508: '000100111', 509: '000110011', 510: '011111110', 511: '10111101001', 512: '111111101100', 513: '110000110000', 514: '101100110', 515: '101001010', 516: '001000010', 517: '1111010011', 518: '1010010011', 519: '000011010', 520: '1110101010', 521: '100111001', 522: '000100001', 523: '1010011010', 524: '000111100', 525: '10100000111', 526: '001001110', 527: '000100010', 528: '10101100111', 529: '00101100000', 530: '11001011110', 531: '1010100010', 532: '11111110101', 533: '00111100001', 534: '1110110011', 535: '000010010', 536: '011110011', 537: '1100100100', 538: '1111001111001010', 539: '100110011', 540: '1110011101', 541: '1110001101', 542: '010101000', 543: '10100000110', 544: '0111101001', 545: '0100010001', 546: '0001010000', 547: '1100101110', 548: '1110101111', 549: '1111010100', 550: '11000010010', 551: '10110111110', 552: '0111001110', 553: '11100101110', 554: '0100000010001000', 555: '1110110101', 556: '1010011011111', 557: '0001000001', 558: '100011001', 559: '11010111101', 560: '110000110001', 561: '010010000', 562: '100101000', 563: '0011000000', 564: '11010010100', 565: '1000111000', 566: '11101111001', 567: '10011001011', 568: '0000111100', 569: '10111010010', 570: '00000011101', 571: '10111110010', 572: '11011001101', 573: '1110101100011', 574: '100001101110', 575: '011101001101', 576: '010010011', 577: '010111111', 578: '1100100101', 579: '1011001010', 580: '1110100000', 581: '100000111', 582: '010011110', 583: '000010110', 584: '11110000001', 585: '010111110', 586: '001110001', 587: '011101101', 588: '1011100111111', 589: '01000000101', 590: '1101000110', 591: '010101101', 592: '0011000001', 593: '11100111110', 594: '10110101000', 595: '1011100111110', 596: '11000010011001', 597: '100100110', 598: '1100111100110000', 599: '10111101000', 600: '110011001010', 601: '11111110000', 602: '0110000001111', 603: '1100010000', 604: '010001110', 605: '01111101000', 606: '100010000', 607: '01001011110', 608: '0011000010', 609: '100101011', 610: '000011111', 611: '11011000110', 612: '1110110111', 613: '000000001', 614: '0010000111', 615: '1011010001', 616: '000111111', 617: '0000010010', 618: '100100000', 619: '0100000010000', 620: '111010110000', 621: '110000100110001', 622: '0111001111', 623: '1101000000', 624: '11101001111', 625: '010101011', 626: '000101101', 627: '1101010100', 628: '1111001001', 629: '011110010', 630: '010010111110', 631: '000010000', 632: '010111000', 633: '101111100111', 634: '1110100110', 635: '0011011011001', 636: '1011011010', 637: '111110110110', 638: '11110001001', 639: '11111011010', 640: '1100000101', 641: '10111011101', 642: '110011110010', 643: '10101100110', 644: '1101010010100', 645: '011101001110', 646: '00100001101', 647: '1101110010', 648: '11001110110', 649: '000111011', 650: '01101000110', 651: '11000000010', 652: '000011101', 653: '1011010010', 654: '11010110010', 655: '1100010110', 656: '10111010100', 657: '1100100010', 658: '0101101011', 659: '001001100', 660: '101001100', 661: '11011100111', 662: '010001001', 663: '0000011001', 664: '001111001', 665: '1011100010', 666: '0110000001100', 667: '11001010100', 668: '11011000111', 669: '001010011', 670: '1110011011101', 671: '110011101110', 672: '1110110100', 673: '1111110000', 674: '0101000101110', 675: '11010011001', 676: '11100111111', 677: '0001010001', 678: '000001101', 679: '101000100', 680: '100001101101', 681: '1011010110', 682: '1110111111', 683: '011001010', 684: '010000111', 685: '011000011', 686: '1100001110', 687: '101100101111', 688: '1101110111101', 689: '000000110', 690: '1110010010', 691: '1101011000', 692: '0110010111', 693: '001110101', 694: '11101011101', 695: '001101110', 696: '101001110', 697: '1111010111', 698: '001101000', 699: '000000101', 700: '10110100110100', 701: '101011011', 702: '1111001011', 703: '11001010101', 704: '1110000111111', 705: '011000100', 706: '1110110010', 707: '1100110001', 708: '011101100', 709: '100100101101', 710: '001001111101', 711: '00100111111', 712: '11100001110000', 713: '1110100100', 714: '11010001111', 715: '1001011101', 716: '1100100011010', 717: '011101010', 718: '001001010', 719: '101010100', 720: '0111000110', 721: '1101110000', 722: '11101011100', 723: '100010011', 724: '0000000111', 725: '0111111011010', 726: '001100010', 727: '011110001', 728: '011001110', 729: '11000111111', 730: '10111011010', 731: '1100011000000', 732: '110110111101', 733: '101100001011', 734: '1001111100', 735: '0011010011', 736: '1101111111', 737: '001110000', 738: '1111000011', 739: '0100001100', 740: '0010000010', 741: '0100100010', 742: '101101100111', 743: '0111011101', 744: '011100110', 745: '11011000010', 746: '0111011111', 747: '111110010001', 748: '1101010001', 749: '000010100', 750: '1110111010', 751: '1110011000', 752: '110011110011001011', 753: '001110011', 754: '1001011001', 755: '001010010', 756: '101101001101010', 757: '0100111110', 758: '010010110', 759: '1111010110', 760: '100110000', 761: '001011001', 762: '1100000100', 763: '100001110', 764: '1100110101', 765: '001101100', 766: '010110001', 767: '1100011001', 768: '100110100', 769: '1000100011', 770: '10011001010', 771: '11001111000', 772: '0001011111', 773: '101011100', 774: '10111101110', 775: '000111001', 776: '010010111111', 777: '0011110001', 778: '01011101011', 779: '11010100100', 780: '101100100', 781: '1111110001', 782: '110001100010', 783: '10110101001', 784: '010100110', 785: '010011101', 786: '00000100110', 787: '11110110010', 788: '011011011', 789: '11010111100', 790: '100110101', 791: '1000000000', 792: '1100110111', 793: '0101110100', 794: '100100010', 795: '110101101101', 796: '1110010001', 797: '011101001100', 798: '1100111100110010101', 799: '11100110110', 800: '010110000', 801: '1100100011110', 802: '1101100010', 803: '1000010110', 804: '010000001000101', 805: '11100001110001', 806: '1100001010', 807: '0100101110', 808: '1011111000', 809: '100001010', 810: '11000110000011', 811: '1101000100', 812: '101010000', 813: '1101111001', 814: '000101011', 815: '111101111001', 816: '11110000000', 817: '00100111110000', 818: '1111100001', 819: '1101100001100', 820: '001101101101', 821: '11111010010', 822: '1010101101', 823: '1100111101', 824: '11010111000', 825: '000010011', 826: '1110100010', 827: '1100001011', 828: '1100101101', 829: '0101101010', 830: '111000011110', 831: '011111001', 832: '000100011', 833: '1101100000', 834: '0111011110', 835: '100001100', 836: '010101010', 837: '000110001', 838: '1011101111', 839: '110000100111', 840: '1101111011', 841: '01111101001', 842: '001011100', 843: '011100010', 844: '1110011010', 845: '001011010', 846: '000100100000', 847: '01100000010', 848: '110101101100110', 849: '001101011', 850: '1011110010', 851: '11010011110', 852: '100011011', 853: '110101001011', 854: '1101010000', 855: '0011010010', 856: '1111100101', 857: '010101110', 858: '1101010011', 859: '101000101111', 860: '100100011', 861: '000001110010', 862: '11111001110', 863: '011001111', 864: '1100100110', 865: '000001011', 866: '11111110111', 867: '0100000010001001', 868: '11111001001', 869: '111100010001', 870: '01111010000', 871: '0100110010', 872: '011110000', 873: '000110100', 874: '101000111', 875: '10111101111', 876: '101100000', 877: '1011011011', 878: '011000000111011', 879: '101111111101', 880: '11110011110011', 881: '11010011000', 882: '101001011', 883: '1010011110', 884: '011010111111', 885: '101110110111', 886: '10111001110', 887: '0111110101', 888: '000101010', 889: '000111000', 890: '001011011', 891: '1110001110', 892: '110001100011', 893: '010100010110', 894: '110101101111', 895: '10111000010', 896: '101011111', 897: '1111100010', 898: '1111000001', 899: '111001101111', 900: '0010101101', 901: '10100010110', 902: '0001101100', 903: '1101010111', 904: '1011100100', 905: '11111110100', 906: '000011011', 907: '00110110111', 908: '110000000111', 909: '001000110', 910: '0010011110', 911: '10010000110', 912: '1100100011111', 913: '001100111', 914: '0000011000', 915: '1111001101', 916: '11110111101', 917: '0100110011', 918: '1100111010', 919: '11101110010', 920: '010111101', 921: '010110010', 922: '1101111110', 923: '101110000111', 924: '010111100', 925: '110101101100111', 926: '111001101110000', 927: '111001001110', 928: '1011011100', 929: '1111011111', 930: '0100000010010', 931: '1111110101', 932: '11011101110', 933: '1010000010', 934: '110001010100', 935: '1111110010', 936: '11001111001101', 937: '1111100110', 938: '0000111101', 939: '11011111001', 940: '11100000110', 941: '101101001101011', 942: '011011110', 943: '11110011100', 944: '1011100000', 945: '011001100', 946: '111100111100100', 947: '1010100011', 948: '1100110011', 949: '1101001000', 950: '1101100111', 951: '0100010000', 952: '1100111100111', 953: '100111101', 954: '11011111000', 955: '11100100001', 956: '1000000011', 957: '0110000001110101', 958: '111101100111', 959: '100100111', 960: '11100100110', 961: '01000000100011', 962: '11111010011', 963: '1011010011011', 964: '1111001100', 965: '1110001100', 966: '001010111', 967: '1100010100', 968: '110001010101', 969: '1111111110', 970: '111110011111', 971: '011111101101110', 972: '11010010101', 973: '11101011001', 974: '11101010000', 975: '0011011011000', 976: '1110110001', 977: '010000101', 978: '1010111010', 979: '1000010001', 980: '110011101111', 981: '010011100', 982: '100100101100', 983: '1011011110', 984: '1101110111111', 985: '1110000111110', 986: '101110000110', 987: '110101101100100', 988: '000000100', 989: '1100001101', 990: '0001101110', 991: '1011011000', 992: '001100100', 993: '1010001010', 994: '1000000010', 995: '010110011', 996: '1101100001101', 997: '011010110', 998: '011010000', 999: '100000101'}

Question 1: What is the maximum length of a codeword in the resulting Huffman code?

Question 2: Continuing the previous problem, what is the minimum length of a codeword in your Huffman code?


In [63]:
# Min and Max functions

def max_length(graph):
    L = 0
    for k,val in graph.items():
        if len(val) > L:
            L = len(val)
    return L

def min_length(graph):
    L = np.inf
    for k,val in graph.items():
        if len(val) < L:
            L = len(val)
    return L


print ("min:",min_length(graph))
print ("max:", max_length(graph))


min: 9
max: 19