Notebook to tidy 1999 Referenda Data

Libraries


In [218]:
import pandas as pd
import xlrd
import os
from os import listdir
from os.path import isfile, join

Functions

left_of_bracket(s)

  • takes string
    • if string contains left bracket, returns whatever occurs before the bracket
    • if string doesn't contain left bracket, returns whole string

In [219]:
def left_of_bracket(s):
    if '(' in s:
        needle = s.find('(')
        r = s[:needle-1].strip()
        return r
    else:
        return s

In [220]:
print(left_of_bracket('Croydon Park'))
print(left_of_bracket('Croydon Park West (Adelaide)'))


Croydon Park
Croydon Park West

referenda_by_electorate_get_metadata(filepath)

  • takes filepath, string
  • get properties of data, state, electorate, et al

In [221]:
def referenda_by_electorate_get_metadata(filepath):
    x = xlrd.open_workbook(filepath)
    x_sheet = x.sheet_by_index(0)

    date = x_sheet.cell(0,0).value.strip()
    referenda = x_sheet.cell(5,0).value.strip()
    state = x_sheet.cell(8,0).value.strip()
    electorate = x_sheet.cell(9,0).value.strip()
    
    r = dict(
        date = date,
        referenda = referenda,
        state = state,
        electorate = electorate
    )
    
    return r

In [222]:
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'
referenda_by_electorate_get_metadata(filepath)


Out[222]:
{'date': 'Referendum: 6 November 1999',
 'electorate': 'Adelaide (86272)',
 'referenda': 'the Constitutional Alteration (Establishment of Republic) 1999',
 'state': 'South Australia'}

split_electorate_enrolment(s)

  • takes string containing electorate and enrolment
    • Field in the form: "Adelaide (86272)"
  • splits them, returns dict containing both

In [223]:
def split_electorate_enrolment(s):
    
    # splitting text in the form 'Adelaide (86272)'
    needle = s.find('(')
    
    enrolment = s[needle:].strip() # take right of needle
    enrolment = enrolment[1:-1] # remove first and last chars, ie, remove brackets
    electorate = s[:needle-1].strip() # take everything from the start to one place before the bracket
    
    r = dict(
        electorate = electorate,
        enrolment = enrolment
    )
    
    return r

In [224]:
split_electorate_enrolment('Adelaide (86272)')


Out[224]:
{'electorate': 'Adelaide', 'enrolment': '86272'}

state_abbreviation(state)

  • Converts South Australia to SA, et al

In [225]:
def state_abbreviation(state):
    spaces = state.count(' ')
    
    if spaces == 2:
        bits = state.split(' ')
        r=''
        for b in bits:
            r = r + b[:1].upper() # for each word in state grab first letter
        return r
    elif 'Australia' in state:
        r = state[:1].upper() + 'A'
        return r
    elif state == 'Queensland':
        return 'QLD'
    elif state == 'Northern Territory':
        return 'NT'
    else:
        r = state[:3].upper()
        return r

In [226]:
print(state_abbreviation('New South Wales'))
print(state_abbreviation('Victoria'))
print(state_abbreviation('Queensland'))
print(state_abbreviation('South Australia'))
print(state_abbreviation('Western Australia'))
print(state_abbreviation('Tasmania'))
print(state_abbreviation('Northern Territory'))
print(state_abbreviation('Australian Capital Territory'))


NSW
VIC
QLD
SA
WA
TAS
NT
ACT

tidy_referenda_date(s)

  • takes string containing referenda date
    • Field in the form: "Referendum: 6 November 1999"
  • returns datetime in the form 1999-11-06 00:00:00

In [227]:
def tidy_referenda_date(s):
    # date in the form "Referendum: 6 November 1999"
    needle = s.find(':')
    needle = needle + 1 # skip the semicolon and the space
    s = s[needle:].strip()

    # parse as date
    r = pd.to_datetime(s)
    
    return r

In [228]:
print(tidy_referenda_date('Referendum: 6 November 1999'))


1999-11-06 00:00:00

referenda_by_electorate_tidy_metadata(metadata)

  • takes metadata, dict, containing state, electorate, referenda and date keys
  • runs separate tidy up functions properties of data, state, electorate, et al

In [229]:
def referenda_by_electorate_tidy_metadata(metadata):
    electorate = metadata['electorate']
    state = metadata['state']
    date = metadata['date']
    
    # tidy electorate, get enrolment
    electorate_enrolment = split_electorate_enrolment(electorate)
    
    metadata['electorate'] = electorate_enrolment['electorate']
    metadata['enrolment'] = electorate_enrolment['enrolment']
    
    # tidy date
    date = tidy_referenda_date(date)
    
    metadata['date'] = date
    
    # tidy state
    state = state_abbreviation(state)
    
    metadata['state'] = state
    
    return metadata

In [230]:
metadata = {
    'date': 'Referendum: 6 November 1999',
    'referenda': 'the Constitutional Alteration (Establishment of Republic) 1999',
    'state': 'South Australia',
    'electorate': 'Adelaide (86272)'
}

print('Before:')
print(metadata)

metadata = referenda_by_electorate_tidy_metadata(metadata)

print('After:')
print(metadata)


Before:
{'date': 'Referendum: 6 November 1999', 'referenda': 'the Constitutional Alteration (Establishment of Republic) 1999', 'state': 'South Australia', 'electorate': 'Adelaide (86272)'}
After:
{'date': Timestamp('1999-11-06 00:00:00'), 'referenda': 'the Constitutional Alteration (Establishment of Republic) 1999', 'state': 'SA', 'electorate': 'Adelaide', 'enrolment': '86272'}

referenda_by_electorate_get_data(filepath)

  • takes filepath
  • returns data

In [231]:
def referenda_by_electorate_get_data(filepath):
    # define headers for dataframe
    columnheaders = [
        'polling_place_raw',
        'yes_or_no',
        'yes_n',
        'yes_p',
        'no_n',
        'no_p',
        'formal_n',
        'formal_p',
        'informal_n',
        'informal_p',
        'total_n',
        'total_p'
    ]
    
    # read file
    df_result = pd.read_excel(
        filepath,
        skiprows = 12, # the data starts on the 13th row
        names=columnheaders
    )
    
    # percentages are represented by 61.72 rather than 0.6172, this fixes that
    
    lambda_fix_percentages = lambda x: x/100
    
    # run on the relevant columns
    df_result['yes_p'] = df_result['yes_p'].apply(lambda_fix_percentages)
    df_result['no_p'] = df_result['no_p'].apply(lambda_fix_percentages)
    df_result['formal_p'] = df_result['formal_p'].apply(lambda_fix_percentages)
    df_result['informal_p'] = df_result['informal_p'].apply(lambda_fix_percentages)
    df_result['total_p'] = df_result['total_p'].apply(lambda_fix_percentages)

    # polling places in >1 electorate are represented like "Croydon (Adelaide)"
    # this creates a poling place column, so the frame can be combined for mapping

    lambda_polling_places = lambda x: left_of_bracket(x)

    # use above function to make a polling place column, i.e, remove ' (Adelaide)' from 'Enfield North (Adelaide)'
    df_result['polling_place'] = df_result['polling_place_raw'].apply(lambda_polling_places)
    
    return df_result

In [232]:
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'
referenda_by_electorate_get_data(filepath).head(15)


Out[232]:
polling_place_raw yes_or_no yes_n yes_p no_n no_p formal_n formal_p informal_n informal_p total_n total_p polling_place
0 Adelaide (Adelaide) Yes 282 0.6144 177 0.3856 459 0.9871 6 0.0129 465 0.0057 Adelaide
1 Adelaide East Yes 465 0.6700 229 0.3300 694 0.9914 6 0.0086 700 0.0086 Adelaide East
2 Adelaide Hospital Yes 187 0.6172 116 0.3828 303 0.9806 6 0.0194 309 0.0038 Adelaide Hospital
3 Adelaide South Yes 1056 0.6467 577 0.3533 1633 0.9903 16 0.0097 1649 0.0202 Adelaide South
4 Adelaide West Yes 205 0.6487 111 0.3513 316 0.9875 4 0.0125 320 0.0039 Adelaide West
5 Black Forest East Yes 1206 0.6009 801 0.3991 2007 0.9858 29 0.0142 2036 0.0250 Black Forest East
6 Blair Athol North No 628 0.4102 903 0.5898 1531 0.9922 12 0.0078 1543 0.0189 Blair Athol North
7 Broadview No 971 0.4793 1055 0.5207 2026 0.9946 11 0.0054 2037 0.0250 Broadview
8 Brompton Yes 581 0.6109 370 0.3891 951 0.9896 10 0.0104 961 0.0118 Brompton
9 Clarence Park Yes 661 0.5669 505 0.4331 1166 0.9932 8 0.0068 1174 0.0144 Clarence Park
10 College Park Yes 765 0.6722 373 0.3278 1138 0.9939 7 0.0061 1145 0.0140 College Park
11 Croydon (Adelaide) Yes 256 0.5482 211 0.4518 467 0.9894 5 0.0106 472 0.0058 Croydon
12 Croydon Park No 819 0.4881 859 0.5119 1678 0.9888 19 0.0112 1697 0.0208 Croydon Park
13 Croydon Park West (Adelaide) No 234 0.4835 250 0.5165 484 0.9918 4 0.0082 488 0.0060 Croydon Park West
14 Enfield No 554 0.4128 788 0.5872 1342 0.9832 23 0.0168 1365 0.0167 Enfield

referenda_by_electorate_add_metadata(data, metadata)

  • takes data frame for electorate result
  • adds chosen metadata

In [233]:
def referenda_by_electorate_add_metadata(data, metadata):

    # add metadata columns as desired
    data['state'] = metadata['state']
    data['electorate'] = metadata['electorate']
    #data['enrolment'] = metadata['enrolment']
    #data['date'] = metadata['date']
    #data['referenda'] = metadata['referenda']
    
    # reorder columns - if you uncomment above, make sure to include below also
    data = data[[
        #'referenda',
        #'date',
        'state',
        'electorate',
        #'enrolment',
        'polling_place',
        'polling_place_raw',
        'yes_or_no',
        'yes_n',
        'yes_p',
        'no_n',
        'no_p',
        'formal_n',
        'formal_p',
        'informal_n',
        'informal_p',
        'total_n',
        'total_p'
    ]]

    return data

In [234]:
# get file
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'

# get and tidy metadata
metadata = referenda_by_electorate_get_metadata(filepath)
metadata = referenda_by_electorate_tidy_metadata(metadata)

# get data
data = referenda_by_electorate_get_data(filepath)

# append metadata to data
data = referenda_by_electorate_add_metadata(data, metadata)

data.head(15)


Out[234]:
state electorate polling_place polling_place_raw yes_or_no yes_n yes_p no_n no_p formal_n formal_p informal_n informal_p total_n total_p
0 SA Adelaide Adelaide Adelaide (Adelaide) Yes 282 0.6144 177 0.3856 459 0.9871 6 0.0129 465 0.0057
1 SA Adelaide Adelaide East Adelaide East Yes 465 0.6700 229 0.3300 694 0.9914 6 0.0086 700 0.0086
2 SA Adelaide Adelaide Hospital Adelaide Hospital Yes 187 0.6172 116 0.3828 303 0.9806 6 0.0194 309 0.0038
3 SA Adelaide Adelaide South Adelaide South Yes 1056 0.6467 577 0.3533 1633 0.9903 16 0.0097 1649 0.0202
4 SA Adelaide Adelaide West Adelaide West Yes 205 0.6487 111 0.3513 316 0.9875 4 0.0125 320 0.0039
5 SA Adelaide Black Forest East Black Forest East Yes 1206 0.6009 801 0.3991 2007 0.9858 29 0.0142 2036 0.0250
6 SA Adelaide Blair Athol North Blair Athol North No 628 0.4102 903 0.5898 1531 0.9922 12 0.0078 1543 0.0189
7 SA Adelaide Broadview Broadview No 971 0.4793 1055 0.5207 2026 0.9946 11 0.0054 2037 0.0250
8 SA Adelaide Brompton Brompton Yes 581 0.6109 370 0.3891 951 0.9896 10 0.0104 961 0.0118
9 SA Adelaide Clarence Park Clarence Park Yes 661 0.5669 505 0.4331 1166 0.9932 8 0.0068 1174 0.0144
10 SA Adelaide College Park College Park Yes 765 0.6722 373 0.3278 1138 0.9939 7 0.0061 1145 0.0140
11 SA Adelaide Croydon Croydon (Adelaide) Yes 256 0.5482 211 0.4518 467 0.9894 5 0.0106 472 0.0058
12 SA Adelaide Croydon Park Croydon Park No 819 0.4881 859 0.5119 1678 0.9888 19 0.0112 1697 0.0208
13 SA Adelaide Croydon Park West Croydon Park West (Adelaide) No 234 0.4835 250 0.5165 484 0.9918 4 0.0082 488 0.0060
14 SA Adelaide Enfield Enfield No 554 0.4128 788 0.5872 1342 0.9832 23 0.0168 1365 0.0167

get_republic_files(filedir)

  • takes directory
  • finds the files we want for the republic referendum

In [235]:
def get_republic_files(filedir):
    # get all filenames from dir
    files = [f for f in listdir(filedir) if isfile(join(filedir, f))]

    # empty list for the files we want
    the_files_we_want = []

    # loop through files, keep the ones we want
    # those have the extension .XLS, and
    # in the worlds laziest naming scheme, *3ASTO.* is the republic, *3ASTO_2* is constitutional recognition
    for f in files:
        if ('.XLS' in f) & ('_2' not in f):
            the_files_we_want.append(f)

    return the_files_we_want

In [236]:
filedir = '1999_referenda/results_by_polling_place/'
print(get_republic_files(filedir)[:3])


['v4_3ADEL.XLS', 'v4_3ASTO.XLS', 'v4_3BALL.XLS']

Put it all together - get the Republic Referendum Data


In [237]:
filedir = '1999_referenda/results_by_polling_place/'
files = get_republic_files(filedir)

# placeholder for the data
df_republic = pd.DataFrame()

for f in files:
    # build file path
    filepath = filedir + '/' + f
    
    # get metadata
    metadata = referenda_by_electorate_get_metadata(filepath)
    metadata = referenda_by_electorate_tidy_metadata(metadata)
    
    # get data
    data = referenda_by_electorate_get_data(filepath)
    
    # add metadata to data
    data = referenda_by_electorate_add_metadata(data, metadata)

    # if df_republic is empty, this is first loop, so override
    # if not, it isn't the first loop, so append data
    if df_republic.empty:
        df_republic = data
    else:
        df_republic = df_republic.append(data)

In [238]:
df_republic.head(20)


Out[238]:
state electorate polling_place polling_place_raw yes_or_no yes_n yes_p no_n no_p formal_n formal_p informal_n informal_p total_n total_p
0 SA Adelaide Adelaide Adelaide (Adelaide) Yes 282 0.6144 177 0.3856 459 0.9871 6 0.0129 465 0.0057
1 SA Adelaide Adelaide East Adelaide East Yes 465 0.6700 229 0.3300 694 0.9914 6 0.0086 700 0.0086
2 SA Adelaide Adelaide Hospital Adelaide Hospital Yes 187 0.6172 116 0.3828 303 0.9806 6 0.0194 309 0.0038
3 SA Adelaide Adelaide South Adelaide South Yes 1056 0.6467 577 0.3533 1633 0.9903 16 0.0097 1649 0.0202
4 SA Adelaide Adelaide West Adelaide West Yes 205 0.6487 111 0.3513 316 0.9875 4 0.0125 320 0.0039
5 SA Adelaide Black Forest East Black Forest East Yes 1206 0.6009 801 0.3991 2007 0.9858 29 0.0142 2036 0.0250
6 SA Adelaide Blair Athol North Blair Athol North No 628 0.4102 903 0.5898 1531 0.9922 12 0.0078 1543 0.0189
7 SA Adelaide Broadview Broadview No 971 0.4793 1055 0.5207 2026 0.9946 11 0.0054 2037 0.0250
8 SA Adelaide Brompton Brompton Yes 581 0.6109 370 0.3891 951 0.9896 10 0.0104 961 0.0118
9 SA Adelaide Clarence Park Clarence Park Yes 661 0.5669 505 0.4331 1166 0.9932 8 0.0068 1174 0.0144
10 SA Adelaide College Park College Park Yes 765 0.6722 373 0.3278 1138 0.9939 7 0.0061 1145 0.0140
11 SA Adelaide Croydon Croydon (Adelaide) Yes 256 0.5482 211 0.4518 467 0.9894 5 0.0106 472 0.0058
12 SA Adelaide Croydon Park Croydon Park No 819 0.4881 859 0.5119 1678 0.9888 19 0.0112 1697 0.0208
13 SA Adelaide Croydon Park West Croydon Park West (Adelaide) No 234 0.4835 250 0.5165 484 0.9918 4 0.0082 488 0.0060
14 SA Adelaide Enfield Enfield No 554 0.4128 788 0.5872 1342 0.9832 23 0.0168 1365 0.0167
15 SA Adelaide Enfield North Enfield North (Adelaide) No 548 0.4374 705 0.5626 1253 0.9890 14 0.0110 1267 0.0155
16 SA Adelaide Enfield South Enfield South No 470 0.4663 538 0.5337 1008 0.9882 12 0.0118 1020 0.0125
17 SA Adelaide Everard Park Everard Park Yes 396 0.5366 342 0.4634 738 0.9906 7 0.0094 745 0.0091
18 SA Adelaide Fullarton Fullarton Yes 1365 0.6003 909 0.3997 2274 0.9930 16 0.0070 2290 0.0281
19 SA Adelaide Goodwood Goodwood Yes 1334 0.6707 655 0.3293 1989 0.9905 19 0.0095 2008 0.0246

Monkey Patch

In their infinite wisdom, the AEC gave the name "Hyde Park" to two different booths in NSW.

It is the only non-unique combination of ['state','polling_place'] in the country for the 1999 referenda.

This patches the booth names so they line up with the same fix I've applied in the polling places import.

AEC: 😠😞


In [239]:
df_republic.loc[
    (df_republic['polling_place'] == 'Hyde Park') &
    (df_republic['state'] == 'NSW') &
    (df_republic['electorate'] == 'Sydney'),
    ['polling_place']
] = 'Hyde Park - Unitarian Church Hall'

df_republic.loc[
    (df_republic['polling_place'] == 'Hyde Park') &
    (df_republic['state'] == 'NSW') &
    (df_republic['electorate'] == 'Reid'),
    ['polling_place']
] = 'Hyde Park - Jubilee Hall'

Remove summary rows

The spreadsheet had total rows, that don't make sense in this data frame where we want to be able to do things like sum by seat or state


In [240]:
df_republic = df_republic.reset_index()

# remove summary lines that don't make sense removing electorate, such as absent
pp_to_remove = [
    'Division Summary',
    'Total Ordinary'
]

df_republic = df_republic[~df_republic['polling_place'].isin(pp_to_remove)]

Write it out to a CSV


In [241]:
outfile = '1999_referenda_output/republic_referendum_by_electorate_by_polling_place.csv'

df_republic.to_csv(
    outfile,
    sep=',',
    index=False,
    encoding='UTF8'
)

Results by Polling Place

Above we have generated results by polling place by electorate

For any mapping work we might wish to do, we need one row per physical polling place, summing votes cast across different electorates

Import data


In [242]:
# import data
filepath = '1999_referenda_output/republic_referendum_by_electorate_by_polling_place.csv'

df_republic = pd.read_csv(
    filepath
)

# see import
df_republic.head(3)


Out[242]:
index state electorate polling_place polling_place_raw yes_or_no yes_n yes_p no_n no_p formal_n formal_p informal_n informal_p total_n total_p
0 0 SA Adelaide Adelaide Adelaide (Adelaide) Yes 282 0.6144 177 0.3856 459 0.9871 6 0.0129 465 0.0057
1 1 SA Adelaide Adelaide East Adelaide East Yes 465 0.6700 229 0.3300 694 0.9914 6 0.0086 700 0.0086
2 2 SA Adelaide Adelaide Hospital Adelaide Hospital Yes 187 0.6172 116 0.3828 303 0.9806 6 0.0194 309 0.0038

Run transformation


In [243]:
# limit columns
df_republic_by_pp = df_republic[[
    'state',
    'polling_place',
    'yes_n',
    'no_n',
    'formal_n',
    'informal_n',
    'total_n'    
]]

# group by state and polling_place, sum other columns
df_republic_by_pp = df_republic_by_pp.groupby([
    'state',
    'polling_place'
]).sum()

# rebuild percentage columns
df_republic_by_pp['yes_p'] = round(df_republic_by_pp['yes_n']/df_republic_by_pp['formal_n'],4)
df_republic_by_pp['no_p'] = round(df_republic_by_pp['no_n']/df_republic_by_pp['formal_n'],4)
df_republic_by_pp['formal_p'] = round(df_republic_by_pp['formal_n']/df_republic_by_pp['total_n'],4)
df_republic_by_pp['informal_p'] = round(df_republic_by_pp['informal_n']/df_republic_by_pp['total_n'],4)

df_republic_by_pp = df_republic_by_pp.reset_index()

# remove summary lines that don't make sense removing electorate, such as absent
pp_to_remove = [
    'Division Summary',
    'Total Ordinary',
    'ABSENT',
    'PRE_POLL',
    'POSTAL',
    'PROVISIONAL',
    'Special Hospitals',
    'Remote Mobiles'
]

df_republic_by_pp = df_republic_by_pp[~df_republic_by_pp['polling_place'].isin(pp_to_remove)]

# check result
df_republic_by_pp.head(3)


Out[243]:
state polling_place yes_n no_n formal_n informal_n total_n yes_p no_p formal_p informal_p
1 ACT Ainslie 1372 500 1872 24 1896 0.7329 0.2671 0.9873 0.0127
2 ACT Ainslie North 1608 749 2357 29 2386 0.6822 0.3178 0.9878 0.0122
3 ACT Aranda 2200 787 2987 21 3008 0.7365 0.2635 0.9930 0.0070

Output to csv


In [244]:
outfile = '1999_referenda_output/republic_referendum_by_polling_place.csv'

df_republic_by_pp.to_csv(
    outfile,
    sep=',',
    index=False,
    encoding='UTF8'
)