In [12]:
import binascii

def load_anpa_file(path, debug=False):
    """Read an ANPA wire file and return its CRLF-delimited lines.

    The file is read in binary mode. Every 0x1e and 0x1d byte is replaced
    with ';' so that downstream code can split fields on a single
    delimiter. The text is then cut after every CR LF pair.

    Args:
        path: Path of the file to read.
        debug: When true, print each character (hex code and value) and
            each completed line.

    Returns:
        A list of strings. Every line keeps its trailing '\\r\\n'. If the
        file does not end with CR LF, the remaining unterminated text is
        returned as a final element instead of being discarded. An empty
        file yields an empty list.
    """
    with open(path, 'rb') as fh:
        raw = fh.read()

    # On Python 3 the read yields bytes; latin-1 maps every byte to exactly
    # one character, so nothing is lost. On Python 2 ``bytes is str`` and
    # this branch is skipped, leaving the data untouched.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('latin-1')

    # Applied to the whole buffer so the very first byte is covered too.
    text = raw.replace('\x1e', ';').replace('\x1d', ';')

    pieces = text.split('\r\n')
    # Whatever follows the last CR LF (possibly nothing) is the final piece.
    trailing = pieces.pop()
    lines = [piece + '\r\n' for piece in pieces]
    if trailing:
        lines.append(trailing)

    if debug:
        for line in lines:
            for char in line:
                print('%02x %s' % (ord(char), char))
            print(line)
    return lines




In [15]:
def cleaner(string, debug=False):
    """Remove unwanted control characters from ``string`` and strip it.

    The characters dropped are LF (0x0a), CR (0x0d), US (0x1f), ACK (0x06),
    BS (0x08) and EM (0x19). Leading and trailing whitespace is stripped
    from what remains.

    Args:
        string: Text to clean. Python 3 ``bytes`` are also accepted and are
            decoded as latin-1 (one character per byte) first.
        debug: When true, print the hex code and value of every character
            that is kept.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    banned = frozenset('\n\r\x1f\x06\x08\x19')

    # Only true for Python 3 bytes; on Python 2 ``bytes is str``.
    if isinstance(string, bytes) and not isinstance(string, str):
        string = string.decode('latin-1')

    kept = []
    for char in string:
        if char in banned:
            continue
        if debug:
            print('%02x %s' % (ord(char), char))
        kept.append(char)
    return ''.join(kept).strip()

def region_parser(lines):
    """Parse the regional results table out of ANPA lines.

    The lines are walked with a small state machine:

    * '^By The Associated Press=' enters the header.
    * A line containing 'returns from Illinois by Geographic Region' and
      ending with '<' switches to 'EndHeader'; while in that state every
      ';'-separated, non-empty field is collected as a column name.
    * Any other line starting with '^' and ending with '<' announces the
      region rows; the line after it is the first row.
    * A line starting with '^RegTotals' ends the region rows.
    * A line consisting only of CR LF stops parsing altogether.

    Args:
        lines: Iterable of raw lines, each still carrying its '\\r\\n'.

    Returns:
        A dict mapping region name to a dict of column name -> int. It is
        always a dict, also when the input has no terminating blank line
        or is empty.

    Raises:
        IndexError: If a region row has more values than column names.
        ValueError: If a value is not an integer once commas are removed.
    """
    state = ''
    header = []
    results = {}

    for line in lines:
        # A bare CR LF line marks the end of the table.
        if line == '\r\n':
            break
        line = cleaner(line)

        if line == '^By The Associated Press=':
            state = 'EnterHeader'
        elif ('returns from Illinois by Geographic Region' in line
              and line.endswith('<')):
            state = 'EndHeader'
        elif line.startswith('^') and line.endswith('<'):
            state = 'BeginRegions'
        elif state == 'BeginRegions':
            # The line right after the marker is already the first row, so
            # it is parsed below in this same iteration.
            state = 'InRegions'
        elif state == 'InRegions' and line.startswith('^RegTotals'):
            state = 'EndRegions'

        if ';' not in line:
            continue

        # Empty fields are dropped, so values are matched to column names
        # by their position among the non-empty fields.
        fields = [field
                  for field in (cleaner(part) for part in line.split(';'))
                  if field]

        if state == 'EndHeader':
            header.extend(fields)
        elif state == 'InRegions' and fields:
            region = fields[0]
            counts = {}
            for position, value in enumerate(fields[1:]):
                counts[header[position]] = int(value.replace(',', ''))
            results[region] = counts

    return results

In [25]:
def print_results(results):
    special = ['Chicago', 'CookSuburb', 'DuPage', 'Kane', 'Kendall', 'Lake', 'McHenry', 'Will']
    banned = ['PR', 'TP']
    for region in special:
        print region, results[region]

    downstate = {}
    total = {}
    for key in results[special[0]].keys():
        downstate[key] = 0
        total[key] = 0
    for region in results:
        for key in results[special[0]].keys():
            total[key] += results[region][key]
            if region not in special:
                for key in results[region]:
                    downstate[key] += results[region][key]
    print 'Downstate', downstate
    print 'Total', total

# Driver: read the .anpa file (path is relative to the working directory),
# parse it with region_parser, and print the region, Downstate and Total rows.
print_results(region_parser(load_anpa_file('IL-Dem-Pres-Reg-Geo-2-Takes.anpa')))


Chicago {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 2069}
CookSuburb {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 1599}
DuPage {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 869}
Kane {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 301}
Kendall {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 84}
Lake {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 415}
McHenry {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 212}
Will {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 300}
Downstate {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 25470}
Total {'PR': 0, 'Sanders': 0, 'DeLaFnt': 0, 'Clinton': 0, 'Cohen': 0, 'TP': 10094}

In [ ]: