Notebook to tidy 1999 Polling Place Data

Libraries



In [161]:

    
import pandas as pd
import xlrd
import os
from os import listdir
from os.path import isfile, join
from IPython.display import display, HTML

Functions

`get_files(filepath)`

Takes a path to a folder
Returns a list of files in that folder



In [162]:

    
# filter a list of files for xls files
def get_files(filepath):
    return [f for f in listdir(filepath) if isfile(join(filepath, f))]



In [163]:

    
filepath = '1999_referenda/polling_places'
print(get_files(filepath)[:4])









    



['ppsact.txt', 'ppsact.xls', 'ppsnsw.txt', 'ppsnsw.xls']

`get_xls(files)`

Takes a list of file names
Returns those that are .xls files



In [164]:

    
# filter a list of files for xls files
def get_xls(files):
    r = []
    for file in files:
        if 'xls' in file:
            r.append(file)
    return r



In [165]:

    
filepath = '1999_referenda/polling_places'
files = get_files(filepath)
files_xls = get_xls(files)
print(files_xls[:4])









    



['ppsact.xls', 'ppsnsw.xls', 'ppsnt.xls', 'ppsqld.xls']

`get_polling_places(filename, columns)`

Takes a path to a polling place file, and a list defining the columns we should end up with
Returns a data frame containing the rows that represented polling places

A bit of an explanation

The files are a bit of a mess, they look like this:

The approach I take below is to loop through each row of the spreadsheet, and

if the row doesn't have any blank cells in the first columns of the spreadsheet
and if the row doesn't begin with polling place name then append the row to a data frame

Then I use the file name to add a state column

This will result in code we can run across all the states spreadsheets to end up with one file



In [166]:

    
def get_polling_places(filename, columns):

    # get data
    x = xlrd.open_workbook(file)
    x_sheet = x.sheet_by_index(0)
    
    # data frame to preserve outside loop
    df_polling_places = pd.DataFrame()
    
    # for each row in the spreadsheet
    for rowindex in range(x_sheet.nrows):
        row_slice = x_sheet.row_slice(
            rowx=rowindex, # take the index of this loop, 0 -> max rows
            start_colx=0, # we care about the first 6 columns hence 0->6
            end_colx=6
        )
        
        # take keys off the row
        row = []
        for cell in row_slice:
            row.append(cell.value)

        # if there are any blank cells, this isn't a real row of data
        if row.count('') == 0:
            # except one of those rows is the header row, which we want to throw out as well
            if row[0] != 'Polling Place Name':  

                # make a data frame
                df = pd.DataFrame(data=[row], columns = columns)

                # if the data frame is empty, create it, if not, append to it
                if df_polling_places.empty:
                    df_polling_places = df
                else:
                    df_polling_places = df_polling_places.append(df)
    
    # work out state from filename
    # state will be the chars from 3 chars after the final slash to .
    # as the files are in the form 'ppsnt.xls'
    
    # everything after the final slash
    needle = file.rfind('/')
    length = len(file)
    filepart = file[needle+1:length]
    
    # from the third char to the dot
    needle = filepart.find('.')
    state = filepart[3:needle].upper()
    
    # create state column
    df_polling_places['state'] = state
    
    # put state column at front
    
    #Make a list of all of the columns in the df
    cols = list(df_polling_places.columns.values)
    
    # remove state from the list
    cols.pop(cols.index('state'))
    
    # put state at the front of this list
    df_polling_places = df_polling_places[['state']+cols]
    
    return df_polling_places



In [167]:

    
file = '1999_referenda/polling_places/ppsnt.xls'

columns = [
    'polling_place',
    'premises',
    'address',
    'suburb',
    'postcode',
    'wheelchair_access'
]

get_polling_places(file, columns).head(5)









    Out[167]:







  
    
      
      state
      polling_place
      premises
      address
      suburb
      postcode
      wheelchair_access
    
  
  
    
      0
      NT
      Alice Springs
      Civic Centre (Foyer)
      Todd St
      ALICE SPRINGS
      870.0
      F
    
    
      0
      NT
      Alyangula
      Alyangula Area School
      Flinders St
      ALYANGULA
      885.0
      F
    
    
      0
      NT
      Anula
      Anula Primary School
      Yanyula Dr
      ANULA
      812.0
      A
    
    
      0
      NT
      Bakewell
      Bakewell Primary School
      Hutchison Tce
      PALMERSTON
      830.0
      F
    
    
      0
      NT
      Batchelor
      Batchelor Area School
      Nurndina St
      BATCHELOR
      845.0
      F

Run over each file, merge data frame



In [168]:

    
filepath = '1999_referenda/polling_places/'

columns = [
    'polling_place',
    'premises',
    'address',
    'suburb',
    'postcode',
    'wheelchair_access'
]

# get xls files from filepath
files = get_files(filepath)
files_xls = get_xls(files)

df_polling_places = pd.DataFrame()

for file in files_xls:
    
    file = filepath + file
    df = get_polling_places(file, columns)

    # if the data frame is empty, create it, if not, append to it
    if df_polling_places.empty:
        df_polling_places = df
    else:
        df_polling_places = df_polling_places.append(df)

df_polling_places.head(3)









    Out[168]:







  
    
      
      state
      polling_place
      premises
      address
      suburb
      postcode
      wheelchair_access
    
  
  
    
      0
      ACT
      Bonython
      Bonython Primary School
      Hurtle Ave
      BONYTHON
      2905.0
      F
    
    
      0
      ACT
      Calwell
      Calwell High School
      Casey Cres
      CALWELL
      2905.0
      F
    
    
      0
      ACT
      Canberra Hospital
      The Canberra Hospital
      Blding 2 Level 3 Yamba Dr
      GARRAN
      2605.0
      F

Dedup polling places

Where polling places occur across multiple seats, they have the naming scheme:
['Pakenham Upper (La Trobe)','Pakenham Upper (McMillan)']

This code strips those fields both to 'Pakenham Upper', and then deduplicates them

`left_of_bracket(s)`

takes string
- if string contains left bracket, returns whatever occurs before the bracket
- if string doesn't contain left bracket, returns whole string



In [169]:

    
def left_of_bracket(s):
    if '(' in s:
        needle = s.find('(')
        r = s[:needle-1].strip()
        return r
    else:
        return s



In [170]:

    
print(left_of_bracket('Croydon Park'))
print(left_of_bracket('Croydon Park West (Adelaide)'))









    



Croydon Park
Croydon Park West



In [171]:

    
# initial number of rows
print('inital num rows: ' +  str(len(df_polling_places.index)))

# for each value, return the value of left_of_bracket
# left_of_bracket returns a whole string, or
# if it contains a (, whatever is to the left of it
lambda_polling_places = lambda x: left_of_bracket(x)

# update polling place column
df_polling_places['polling_place'] = df_polling_places['polling_place'].apply(lambda_polling_places)

# deduplicate
df_polling_places = df_polling_places.drop_duplicates()

# print final number of rows
print('final num rows: ' +  str(len(df_polling_places.index)))









    



inital num rows: 7336
final num rows: 7050

Are any booths a non-unique combo of state and polling_place?



In [172]:

    
try:
    tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
    print('no duplicates found')
else:
    display(tmp)









    







  
    
      
      state
      polling_place
      premises
      address
      suburb
      postcode
      wheelchair_access
    
  
  
    
      0
      NSW
      Blacktown West
      Blacktown West Public School
      Kildare Rd
      BLACKTOWN
      2148.0
      N
    
    
      0
      NSW
      Blacktown West
      Blacktown West Public School
      Lancaster St
      BLACKTOWN
      2148.0
      N
    
    
      0
      NSW
      Hyde Park
      Jubilee Hall, Kingsland Rd.
      Kingsland Rd
      BERALA
      2141.0
      A
    
    
      0
      NSW
      Hyde Park
      Unitarian Church Hall
      15 Francis St
      EAST SYDNEY
      2010.0
      N
    
    
      0
      NSW
      Punchbowl
      Punchbowl Public School
      1333 Canterbury Rd
      PUNCHBOWL
      2196.0
      N
    
    
      0
      NSW
      Punchbowl
      Punchbowl Public School
      1333 Canterbury Rd
      PUNCHBOWL
      2196.0
      A
    
    
      0
      NSW
      Sutherland East
      Presbyterian Church Hall
      Cnr Flora & Glencoe Sts
      SUTHERLAND
      2232.0
      A
    
    
      0
      NSW
      Sutherland East
      Presbyterian Pre School
      Cnr Flora & Glencoe Sts
      SUTHERLAND
      2232.0
      A
    
    
      0
      NSW
      Sutton
      Sutton Public School
      Victoria St
      SUTTON
      2620.0
      A
    
    
      0
      NSW
      Sutton
      Sutton Public School
      Victoria St
      SUTTON
      2620.0
      N
    
    
      0
      NSW
      Turramurra South
      Turramurra South Public School
      Kissing Point Rd
      SOUTH TURRAMURRA
      2074.0
      N
    
    
      0
      NSW
      Turramurra South
      Turramurra Public School
      Kissing Point Rd
      SOUTH TURRAMURRA
      2074.0
      A
    
    
      0
      NSW
      Walters Road
      Walters Road Public School
      Walters Rd
      BLACKTOWN
      2148.0
      N
    
    
      0
      NSW
      Walters Road
      Walters Road Public School
      158 Walters Rd
      BLACKTOWN
      2148.0
      N

Clean these up

All of these duplicates are in NSW
Almost all of these clearly represent bad data entry, for instance, it seems unlikely Blacktown West Public School had two different addresses on the same day in 1999!
The 'Hyde Park' booth lists two totally different addresses, which is concerning
- It appears this is the only booth where a name was repeated for a totally different polling place within the same state. Goddamnit NSW.
- We'll patch this here, and in our results file when we get to that



In [173]:

    
print('inital num rows: ' +  str(len(df_polling_places.index)))

# patch the two hyde park booths

df_polling_places.loc[
    (df_polling_places['polling_place'] == 'Hyde Park') &
    (df_polling_places['state'] == 'NSW') &
    (df_polling_places['premises'] == 'Unitarian Church Hall'),
    ['polling_place']
] = 'Hyde Park - Unitarian Church Hall'

df_polling_places.loc[
    (df_polling_places['polling_place'] == 'Hyde Park') &
    (df_polling_places['state'] == 'NSW') &
    (df_polling_places['premises'] == 'Jubilee Hall, Kingsland Rd.'),
    ['polling_place']
] = 'Hyde Park - Jubilee Hall'

df_polling_places = df_polling_places.drop_duplicates(
    ['state','polling_place'],
    keep='first' # where state and polling place have dups, keep the first row you find
)

print('final num rows: ' +  str(len(df_polling_places.index)))

# just to be sure, run that find-duplicates code again
try:
    tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
    print('no duplicates found')
else:
    display(tmp)









    



inital num rows: 7050
final num rows: 7044
no duplicates found

Save polling places to csv



In [174]:

    
outfile = '1999_referenda_output/polling_places.csv'

df_polling_places.to_csv(
    outfile,
    sep=',',
    index=False,
    encoding='UTF8'
)

state	polling_place	premises	address	suburb	postcode	wheelchair_access
NT	Alice Springs	Civic Centre (Foyer)	Todd St	ALICE SPRINGS	870.0	F
NT	Alyangula	Alyangula Area School	Flinders St	ALYANGULA	885.0	F
NT	Anula	Anula Primary School	Yanyula Dr	ANULA	812.0	A
NT	Bakewell	Bakewell Primary School	Hutchison Tce	PALMERSTON	830.0	F
NT	Batchelor	Batchelor Area School	Nurndina St	BATCHELOR	845.0	F

state	polling_place	premises	address	suburb	postcode	wheelchair_access
ACT	Bonython	Bonython Primary School	Hurtle Ave	BONYTHON	2905.0	F
ACT	Calwell	Calwell High School	Casey Cres	CALWELL	2905.0	F
ACT	Canberra Hospital	The Canberra Hospital	Blding 2 Level 3 Yamba Dr	GARRAN	2605.0	F

state	polling_place	premises	address	suburb	postcode	wheelchair_access
NSW	Blacktown West	Blacktown West Public School	Kildare Rd	BLACKTOWN	2148.0	N
NSW	Blacktown West	Blacktown West Public School	Lancaster St	BLACKTOWN	2148.0	N
NSW	Hyde Park	Jubilee Hall, Kingsland Rd.	Kingsland Rd	BERALA	2141.0	A
NSW	Hyde Park	Unitarian Church Hall	15 Francis St	EAST SYDNEY	2010.0	N
NSW	Punchbowl	Punchbowl Public School	1333 Canterbury Rd	PUNCHBOWL	2196.0	N
NSW	Punchbowl	Punchbowl Public School	1333 Canterbury Rd	PUNCHBOWL	2196.0	A
NSW	Sutherland East	Presbyterian Church Hall	Cnr Flora & Glencoe Sts	SUTHERLAND	2232.0	A
NSW	Sutherland East	Presbyterian Pre School	Cnr Flora & Glencoe Sts	SUTHERLAND	2232.0	A
NSW	Sutton	Sutton Public School	Victoria St	SUTTON	2620.0	A
NSW	Sutton	Sutton Public School	Victoria St	SUTTON	2620.0	N
NSW	Turramurra South	Turramurra South Public School	Kissing Point Rd	SOUTH TURRAMURRA	2074.0	N
NSW	Turramurra South	Turramurra Public School	Kissing Point Rd	SOUTH TURRAMURRA	2074.0	A
NSW	Walters Road	Walters Road Public School	Walters Rd	BLACKTOWN	2148.0	N
NSW	Walters Road	Walters Road Public School	158 Walters Rd	BLACKTOWN	2148.0	N