In [218]:
import pandas as pd
import xlrd
import os
from os import listdir
from os.path import isfile, join
In [219]:
def left_of_bracket(s):
if '(' in s:
needle = s.find('(')
r = s[:needle-1].strip()
return r
else:
return s
In [220]:
print(left_of_bracket('Croydon Park'))
print(left_of_bracket('Croydon Park West (Adelaide)'))
In [221]:
def referenda_by_electorate_get_metadata(filepath):
x = xlrd.open_workbook(filepath)
x_sheet = x.sheet_by_index(0)
date = x_sheet.cell(0,0).value.strip()
referenda = x_sheet.cell(5,0).value.strip()
state = x_sheet.cell(8,0).value.strip()
electorate = x_sheet.cell(9,0).value.strip()
r = dict(
date = date,
referenda = referenda,
state = state,
electorate = electorate
)
return r
In [222]:
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'
referenda_by_electorate_get_metadata(filepath)
Out[222]:
In [223]:
def split_electorate_enrolment(s):
# splitting text in the form 'Adelaide (86272)'
needle = s.find('(')
enrolment = s[needle:].strip() # take right of needle
enrolment = enrolment[1:-1] # remove first and last chars, ie, remove brackets
electorate = s[:needle-1].strip() # take everything from the start to one place before the bracket
r = dict(
electorate = electorate,
enrolment = enrolment
)
return r
In [224]:
split_electorate_enrolment('Adelaide (86272)')
Out[224]:
In [225]:
def state_abbreviation(state):
spaces = state.count(' ')
if spaces == 2:
bits = state.split(' ')
r=''
for b in bits:
r = r + b[:1].upper() # for each word in state grab first letter
return r
elif 'Australia' in state:
r = state[:1].upper() + 'A'
return r
elif state == 'Queensland':
return 'QLD'
elif state == 'Northern Territory':
return 'NT'
else:
r = state[:3].upper()
return r
In [226]:
print(state_abbreviation('New South Wales'))
print(state_abbreviation('Victoria'))
print(state_abbreviation('Queensland'))
print(state_abbreviation('South Australia'))
print(state_abbreviation('Western Australia'))
print(state_abbreviation('Tasmania'))
print(state_abbreviation('Northern Territory'))
print(state_abbreviation('Australian Capital Territory'))
In [227]:
def tidy_referenda_date(s):
# date in the form "Referendum: 6 November 1999"
needle = s.find(':')
needle = needle + 1 # skip the semicolon and the space
s = s[needle:].strip()
# parse as date
r = pd.to_datetime(s)
return r
In [228]:
print(tidy_referenda_date('Referendum: 6 November 1999'))
In [229]:
def referenda_by_electorate_tidy_metadata(metadata):
electorate = metadata['electorate']
state = metadata['state']
date = metadata['date']
# tidy electorate, get enrolment
electorate_enrolment = split_electorate_enrolment(electorate)
metadata['electorate'] = electorate_enrolment['electorate']
metadata['enrolment'] = electorate_enrolment['enrolment']
# tidy date
date = tidy_referenda_date(date)
metadata['date'] = date
# tidy state
state = state_abbreviation(state)
metadata['state'] = state
return metadata
In [230]:
metadata = {
'date': 'Referendum: 6 November 1999',
'referenda': 'the Constitutional Alteration (Establishment of Republic) 1999',
'state': 'South Australia',
'electorate': 'Adelaide (86272)'
}
print('Before:')
print(metadata)
metadata = referenda_by_electorate_tidy_metadata(metadata)
print('After:')
print(metadata)
In [231]:
def referenda_by_electorate_get_data(filepath):
# define headers for dataframe
columnheaders = [
'polling_place_raw',
'yes_or_no',
'yes_n',
'yes_p',
'no_n',
'no_p',
'formal_n',
'formal_p',
'informal_n',
'informal_p',
'total_n',
'total_p'
]
# read file
df_result = pd.read_excel(
filepath,
skiprows = 12, # the data starts on the 13th row
names=columnheaders
)
# percentages are represented by 61.72 rather than 0.6172, this fixes that
lambda_fix_percentages = lambda x: x/100
# run on the relevant columns
df_result['yes_p'] = df_result['yes_p'].apply(lambda_fix_percentages)
df_result['no_p'] = df_result['no_p'].apply(lambda_fix_percentages)
df_result['formal_p'] = df_result['formal_p'].apply(lambda_fix_percentages)
df_result['informal_p'] = df_result['informal_p'].apply(lambda_fix_percentages)
df_result['total_p'] = df_result['total_p'].apply(lambda_fix_percentages)
# polling places in >1 electorate are represented like "Croydon (Adelaide)"
# this creates a poling place column, so the frame can be combined for mapping
lambda_polling_places = lambda x: left_of_bracket(x)
# use above function to make a polling place column, i.e, remove ' (Adelaide)' from 'Enfield North (Adelaide)'
df_result['polling_place'] = df_result['polling_place_raw'].apply(lambda_polling_places)
return df_result
In [232]:
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'
referenda_by_electorate_get_data(filepath).head(15)
Out[232]:
In [233]:
def referenda_by_electorate_add_metadata(data, metadata):
# add metadata columns as desired
data['state'] = metadata['state']
data['electorate'] = metadata['electorate']
#data['enrolment'] = metadata['enrolment']
#data['date'] = metadata['date']
#data['referenda'] = metadata['referenda']
# reorder columns - if you uncomment above, make sure to include below also
data = data[[
#'referenda',
#'date',
'state',
'electorate',
#'enrolment',
'polling_place',
'polling_place_raw',
'yes_or_no',
'yes_n',
'yes_p',
'no_n',
'no_p',
'formal_n',
'formal_p',
'informal_n',
'informal_p',
'total_n',
'total_p'
]]
return data
In [234]:
# get file
filepath = '1999_referenda/results_by_polling_place/v4_3ADEL.XLS'
# get and tidy metadata
metadata = referenda_by_electorate_get_metadata(filepath)
metadata = referenda_by_electorate_tidy_metadata(metadata)
# get data
data = referenda_by_electorate_get_data(filepath)
# append metadata to data
data = referenda_by_electorate_add_metadata(data, metadata)
data.head(15)
Out[234]:
In [235]:
def get_republic_files(filedir):
# get all filenames from dir
files = [f for f in listdir(filedir) if isfile(join(filedir, f))]
# empty list for the files we want
the_files_we_want = []
# loop through files, keep the ones we want
# those have the extension .XLS, and
# in the worlds laziest naming scheme, *3ASTO.* is the republic, *3ASTO_2* is constitutional recognition
for f in files:
if ('.XLS' in f) & ('_2' not in f):
the_files_we_want.append(f)
return the_files_we_want
In [236]:
filedir = '1999_referenda/results_by_polling_place/'
print(get_republic_files(filedir)[:3])
In [237]:
filedir = '1999_referenda/results_by_polling_place/'
files = get_republic_files(filedir)
# placeholder for the data
df_republic = pd.DataFrame()
for f in files:
# build file path
filepath = filedir + '/' + f
# get metadata
metadata = referenda_by_electorate_get_metadata(filepath)
metadata = referenda_by_electorate_tidy_metadata(metadata)
# get data
data = referenda_by_electorate_get_data(filepath)
# add metadata to data
data = referenda_by_electorate_add_metadata(data, metadata)
# if df_republic is empty, this is first loop, so override
# if not, it isn't the first loop, so append data
if df_republic.empty:
df_republic = data
else:
df_republic = df_republic.append(data)
In [238]:
df_republic.head(20)
Out[238]:
In their infinite wisdom, the AEC gave the name "Hyde Park" to two different booths in NSW.
It is the only non-unique combination of ['state','polling_place'] in the country for the 1999 referenda.
This patches the booth names so they line up with the same fix I've applied in the polling places import.
AEC: 😠😞
In [239]:
df_republic.loc[
(df_republic['polling_place'] == 'Hyde Park') &
(df_republic['state'] == 'NSW') &
(df_republic['electorate'] == 'Sydney'),
['polling_place']
] = 'Hyde Park - Unitarian Church Hall'
df_republic.loc[
(df_republic['polling_place'] == 'Hyde Park') &
(df_republic['state'] == 'NSW') &
(df_republic['electorate'] == 'Reid'),
['polling_place']
] = 'Hyde Park - Jubilee Hall'
In [240]:
df_republic = df_republic.reset_index()
# remove summary lines that don't make sense removing electorate, such as absent
pp_to_remove = [
'Division Summary',
'Total Ordinary'
]
df_republic = df_republic[~df_republic['polling_place'].isin(pp_to_remove)]
In [241]:
outfile = '1999_referenda_output/republic_referendum_by_electorate_by_polling_place.csv'
df_republic.to_csv(
outfile,
sep=',',
index=False,
encoding='UTF8'
)
In [242]:
# import data
filepath = '1999_referenda_output/republic_referendum_by_electorate_by_polling_place.csv'
df_republic = pd.read_csv(
filepath
)
# see import
df_republic.head(3)
Out[242]:
In [243]:
# limit columns
df_republic_by_pp = df_republic[[
'state',
'polling_place',
'yes_n',
'no_n',
'formal_n',
'informal_n',
'total_n'
]]
# group by state and polling_place, sum other columns
df_republic_by_pp = df_republic_by_pp.groupby([
'state',
'polling_place'
]).sum()
# rebuild percentage columns
df_republic_by_pp['yes_p'] = round(df_republic_by_pp['yes_n']/df_republic_by_pp['formal_n'],4)
df_republic_by_pp['no_p'] = round(df_republic_by_pp['no_n']/df_republic_by_pp['formal_n'],4)
df_republic_by_pp['formal_p'] = round(df_republic_by_pp['formal_n']/df_republic_by_pp['total_n'],4)
df_republic_by_pp['informal_p'] = round(df_republic_by_pp['informal_n']/df_republic_by_pp['total_n'],4)
df_republic_by_pp = df_republic_by_pp.reset_index()
# remove summary lines that don't make sense removing electorate, such as absent
pp_to_remove = [
'Division Summary',
'Total Ordinary',
'ABSENT',
'PRE_POLL',
'POSTAL',
'PROVISIONAL',
'Special Hospitals',
'Remote Mobiles'
]
df_republic_by_pp = df_republic_by_pp[~df_republic_by_pp['polling_place'].isin(pp_to_remove)]
# check result
df_republic_by_pp.head(3)
Out[243]:
In [244]:
outfile = '1999_referenda_output/republic_referendum_by_polling_place.csv'
df_republic_by_pp.to_csv(
outfile,
sep=',',
index=False,
encoding='UTF8'
)