Notebook to tidy 1999 Polling Place Data

Libraries


In [161]:
import pandas as pd
import xlrd
import os
from os import listdir
from os.path import isfile, join
from IPython.display import display, HTML

Functions

get_files(filepath)

  • Takes a path to a folder
  • Returns a list of files in that folder

In [162]:
# filter a list of files for xls files
def get_files(filepath):
    return [f for f in listdir(filepath) if isfile(join(filepath, f))]

In [163]:
filepath = '1999_referenda/polling_places'
print(get_files(filepath)[:4])


['ppsact.txt', 'ppsact.xls', 'ppsnsw.txt', 'ppsnsw.xls']

get_xls(files)

  • Takes a list of file names
  • Returns those that are .xls files

In [164]:
# filter a list of files for xls files
def get_xls(files):
    r = []
    for file in files:
        if 'xls' in file:
            r.append(file)
    return r

In [165]:
filepath = '1999_referenda/polling_places'
files = get_files(filepath)
files_xls = get_xls(files)
print(files_xls[:4])


['ppsact.xls', 'ppsnsw.xls', 'ppsnt.xls', 'ppsqld.xls']

get_polling_places(filename, columns)

  • Takes a path to a polling place file, and a list defining the columns we should end up with
  • Returns a data frame containing the rows that represented polling places

A bit of an explanation

The files are a bit of a mess, they look like this:

The approach I take below is to loop through each row of the spreadsheet, and

  • if the row doesn't have any blank cells in the first columns of the spreadsheet
  • and if the row doesn't begin with polling place name then append the row to a data frame

Then I use the file name to add a state column

This will result in code we can run across all the states spreadsheets to end up with one file


In [166]:
def get_polling_places(filename, columns):

    # get data
    x = xlrd.open_workbook(file)
    x_sheet = x.sheet_by_index(0)
    
    # data frame to preserve outside loop
    df_polling_places = pd.DataFrame()
    
    # for each row in the spreadsheet
    for rowindex in range(x_sheet.nrows):
        row_slice = x_sheet.row_slice(
            rowx=rowindex, # take the index of this loop, 0 -> max rows
            start_colx=0, # we care about the first 6 columns hence 0->6
            end_colx=6
        )
        
        # take keys off the row
        row = []
        for cell in row_slice:
            row.append(cell.value)

        # if there are any blank cells, this isn't a real row of data
        if row.count('') == 0:
            # except one of those rows is the header row, which we want to throw out as well
            if row[0] != 'Polling Place Name':  

                # make a data frame
                df = pd.DataFrame(data=[row], columns = columns)

                # if the data frame is empty, create it, if not, append to it
                if df_polling_places.empty:
                    df_polling_places = df
                else:
                    df_polling_places = df_polling_places.append(df)
    
    # work out state from filename
    # state will be the chars from 3 chars after the final slash to .
    # as the files are in the form 'ppsnt.xls'
    
    # everything after the final slash
    needle = file.rfind('/')
    length = len(file)
    filepart = file[needle+1:length]
    
    # from the third char to the dot
    needle = filepart.find('.')
    state = filepart[3:needle].upper()
    
    # create state column
    df_polling_places['state'] = state
    
    # put state column at front
    
    #Make a list of all of the columns in the df
    cols = list(df_polling_places.columns.values)
    
    # remove state from the list
    cols.pop(cols.index('state'))
    
    # put state at the front of this list
    df_polling_places = df_polling_places[['state']+cols]
    
    return df_polling_places

In [167]:
file = '1999_referenda/polling_places/ppsnt.xls'

columns = [
    'polling_place',
    'premises',
    'address',
    'suburb',
    'postcode',
    'wheelchair_access'
]

get_polling_places(file, columns).head(5)


Out[167]:
state polling_place premises address suburb postcode wheelchair_access
0 NT Alice Springs Civic Centre (Foyer) Todd St ALICE SPRINGS 870.0 F
0 NT Alyangula Alyangula Area School Flinders St ALYANGULA 885.0 F
0 NT Anula Anula Primary School Yanyula Dr ANULA 812.0 A
0 NT Bakewell Bakewell Primary School Hutchison Tce PALMERSTON 830.0 F
0 NT Batchelor Batchelor Area School Nurndina St BATCHELOR 845.0 F

Run over each file, merge data frame


In [168]:
filepath = '1999_referenda/polling_places/'

columns = [
    'polling_place',
    'premises',
    'address',
    'suburb',
    'postcode',
    'wheelchair_access'
]

# get xls files from filepath
files = get_files(filepath)
files_xls = get_xls(files)

df_polling_places = pd.DataFrame()

for file in files_xls:
    
    file = filepath + file
    df = get_polling_places(file, columns)

    # if the data frame is empty, create it, if not, append to it
    if df_polling_places.empty:
        df_polling_places = df
    else:
        df_polling_places = df_polling_places.append(df)

df_polling_places.head(3)


Out[168]:
state polling_place premises address suburb postcode wheelchair_access
0 ACT Bonython Bonython Primary School Hurtle Ave BONYTHON 2905.0 F
0 ACT Calwell Calwell High School Casey Cres CALWELL 2905.0 F
0 ACT Canberra Hospital The Canberra Hospital Blding 2 Level 3 Yamba Dr GARRAN 2605.0 F

Dedup polling places

Where polling places occur across multiple seats, they have the naming scheme:
['Pakenham Upper (La Trobe)','Pakenham Upper (McMillan)']

This code strips those fields both to 'Pakenham Upper', and then deduplicates them

left_of_bracket(s)

  • takes string
    • if string contains left bracket, returns whatever occurs before the bracket
    • if string doesn't contain left bracket, returns whole string

In [169]:
def left_of_bracket(s):
    if '(' in s:
        needle = s.find('(')
        r = s[:needle-1].strip()
        return r
    else:
        return s

In [170]:
print(left_of_bracket('Croydon Park'))
print(left_of_bracket('Croydon Park West (Adelaide)'))


Croydon Park
Croydon Park West

In [171]:
# initial number of rows
print('inital num rows: ' +  str(len(df_polling_places.index)))

# for each value, return the value of left_of_bracket
# left_of_bracket returns a whole string, or
# if it contains a (, whatever is to the left of it
lambda_polling_places = lambda x: left_of_bracket(x)

# update polling place column
df_polling_places['polling_place'] = df_polling_places['polling_place'].apply(lambda_polling_places)

# deduplicate
df_polling_places = df_polling_places.drop_duplicates()

# print final number of rows
print('final num rows: ' +  str(len(df_polling_places.index)))


inital num rows: 7336
final num rows: 7050

Are any booths a non-unique combo of state and polling_place?


In [172]:
try:
    tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
    print('no duplicates found')
else:
    display(tmp)


state polling_place premises address suburb postcode wheelchair_access
0 NSW Blacktown West Blacktown West Public School Kildare Rd BLACKTOWN 2148.0 N
0 NSW Blacktown West Blacktown West Public School Lancaster St BLACKTOWN 2148.0 N
0 NSW Hyde Park Jubilee Hall, Kingsland Rd. Kingsland Rd BERALA 2141.0 A
0 NSW Hyde Park Unitarian Church Hall 15 Francis St EAST SYDNEY 2010.0 N
0 NSW Punchbowl Punchbowl Public School 1333 Canterbury Rd PUNCHBOWL 2196.0 N
0 NSW Punchbowl Punchbowl Public School 1333 Canterbury Rd PUNCHBOWL 2196.0 A
0 NSW Sutherland East Presbyterian Church Hall Cnr Flora & Glencoe Sts SUTHERLAND 2232.0 A
0 NSW Sutherland East Presbyterian Pre School Cnr Flora & Glencoe Sts SUTHERLAND 2232.0 A
0 NSW Sutton Sutton Public School Victoria St SUTTON 2620.0 A
0 NSW Sutton Sutton Public School Victoria St SUTTON 2620.0 N
0 NSW Turramurra South Turramurra South Public School Kissing Point Rd SOUTH TURRAMURRA 2074.0 N
0 NSW Turramurra South Turramurra Public School Kissing Point Rd SOUTH TURRAMURRA 2074.0 A
0 NSW Walters Road Walters Road Public School Walters Rd BLACKTOWN 2148.0 N
0 NSW Walters Road Walters Road Public School 158 Walters Rd BLACKTOWN 2148.0 N

Clean these up

  • All of these duplicates are in NSW
  • Almost all of these clearly represent bad data entry, for instance, it seems unlikely Blacktown West Public School had two different addresses on the same day in 1999!
  • The 'Hyde Park' booth lists two totally different addresses, which is concerning
    • It appears this is the only booth where a name was repeated for a totally different polling place within the same state. Goddamnit NSW.
    • We'll patch this here, and in our results file when we get to that

In [173]:
print('inital num rows: ' +  str(len(df_polling_places.index)))

# patch the two hyde park booths

df_polling_places.loc[
    (df_polling_places['polling_place'] == 'Hyde Park') &
    (df_polling_places['state'] == 'NSW') &
    (df_polling_places['premises'] == 'Unitarian Church Hall'),
    ['polling_place']
] = 'Hyde Park - Unitarian Church Hall'

df_polling_places.loc[
    (df_polling_places['polling_place'] == 'Hyde Park') &
    (df_polling_places['state'] == 'NSW') &
    (df_polling_places['premises'] == 'Jubilee Hall, Kingsland Rd.'),
    ['polling_place']
] = 'Hyde Park - Jubilee Hall'

df_polling_places = df_polling_places.drop_duplicates(
    ['state','polling_place'],
    keep='first' # where state and polling place have dups, keep the first row you find
)

print('final num rows: ' +  str(len(df_polling_places.index)))

# just to be sure, run that find-duplicates code again
try:
    tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
    print('no duplicates found')
else:
    display(tmp)


inital num rows: 7050
final num rows: 7044
no duplicates found

Save polling places to csv


In [174]:
outfile = '1999_referenda_output/polling_places.csv'

df_polling_places.to_csv(
    outfile,
    sep=',',
    index=False,
    encoding='UTF8'
)